/* tools/perf/builtin-trace.c — gitweb scrape header residue removed. */
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
/* Fallback definitions for older system headers. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
71
/*
 * Global state for one 'perf trace' session: the evlist/machine being
 * monitored, the syscall table, filters, output destination, and the
 * boolean knobs that mirror individual command line options.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* NOTE(review): looks like the highest syscall id in 'table' — confirm */
		struct syscall	*table;
		struct {
			/* the raw_syscalls (or syscalls) enter/exit tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;	/* presumably the reference timestamp for relative output — verify */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall name qualifier list; negated if not_ev_qualifier */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs to filter out of the trace */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;			/* tool self-stats (see show_tool_stats) */
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
};
125
/*
 * Accessor for one tracepoint payload field: the byte 'offset' into the
 * raw sample data plus a reader that returns the value either as a u64
 * ('integer') or as a pointer into the payload ('pointer').
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
133
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from the raw sample payload at field->offset.  The value
 * is copied out with memcpy rather than a direct cast because the
 * payload offset need not be naturally aligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
146
/*
 * Same as TP_UINT_FIELD, but byte-swap the value — used when the sample
 * was recorded with the opposite endianness (see tp_field__init_uint's
 * needs_swap parameter).  No 8-bit variant: a single byte has no order.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
158
159 static int tp_field__init_uint(struct tp_field *field,
160                                struct format_field *format_field,
161                                bool needs_swap)
162 {
163         field->offset = format_field->offset;
164
165         switch (format_field->size) {
166         case 1:
167                 field->integer = tp_field__u8;
168                 break;
169         case 2:
170                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
171                 break;
172         case 4:
173                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
174                 break;
175         case 8:
176                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
177                 break;
178         default:
179                 return -1;
180         }
181
182         return 0;
183 }
184
/* Return a pointer into the raw sample payload at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Wire up a pointer accessor for a tracepoint format field; always succeeds. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
196
/*
 * Per-evsel private data for the syscall tracepoints: the syscall 'id'
 * field plus, depending on direction, either the argument block ('args',
 * sys_enter) or the return value ('ret', sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
203
204 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
205                                           struct tp_field *field,
206                                           const char *name)
207 {
208         struct format_field *format_field = perf_evsel__field(evsel, name);
209
210         if (format_field == NULL)
211                 return -1;
212
213         return tp_field__init_uint(field, format_field, evsel->needs_swap);
214 }
215
/*
 * Initialize the tp_field named 'name' inside evsel->priv (a struct
 * syscall_tp) from the tracepoint format field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
219
/*
 * Look up the named field in the evsel's tracepoint format and install
 * a pointer accessor for it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
231
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
235
/* Free the evsel's private syscall_tp area, then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
241
242 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
243 {
244         evsel->priv = malloc(sizeof(struct syscall_tp));
245         if (evsel->priv != NULL) {
246                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
247                         goto out_delete;
248
249                 evsel->handler = handler;
250                 return 0;
251         }
252
253         return -ENOMEM;
254
255 out_delete:
256         zfree(&evsel->priv);
257         return -ENOENT;
258 }
259
260 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
261 {
262         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
263
264         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
265         if (IS_ERR(evsel))
266                 evsel = perf_evsel__newtp("syscalls", direction);
267
268         if (IS_ERR(evsel))
269                 return NULL;
270
271         if (perf_evsel__init_syscall_tp(evsel, handler))
272                 goto out_delete;
273
274         return evsel;
275
276 out_delete:
277         perf_evsel__delete_priv(evsel);
278         return NULL;
279 }
280
/* Read tracepoint field 'name' from a sample as an integer value. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Get a pointer to tracepoint field 'name' inside a sample's payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
288
289 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
290 {
291         int idx = val - sa->offset;
292
293         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
294                 return scnprintf(bf, size, intfmt, val);
295
296         return scnprintf(bf, size, "%s", sa->entries[idx]);
297 }
298
/*
 * Format arg->val via the strarray in arg->parm, falling back to the
 * numeric format 'intfmt' for values missing from the table.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Common case: decimal fallback for values missing from the strarray. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
313
/*
 * A set of strarrays searched in order — used when one syscall argument
 * draws its values from multiple numbering ranges (e.g. fcntl's classic
 * and Linux-specific command ranges).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define 'strarrays__<array>' wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
323
324 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
325                                         struct syscall_arg *arg)
326 {
327         struct strarrays *sas = arg->parm;
328         int i;
329
330         for (i = 0; i < sas->nr_entries; ++i) {
331                 struct strarray *sa = sas->entries[i];
332                 int idx = arg->val - sa->offset;
333
334                 if (idx >= 0 && idx < sa->nr_entries) {
335                         if (sa->entries[idx] == NULL)
336                                 break;
337                         return scnprintf(bf, size, "%s", sa->entries[idx]);
338                 }
339         }
340
341         return scnprintf(bf, size, "%d", arg->val);
342 }
343
344 #ifndef AT_FDCWD
345 #define AT_FDCWD        -100
346 #endif
347
348 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
349                                            struct syscall_arg *arg)
350 {
351         int fd = arg->val;
352
353         if (fd == AT_FDCWD)
354                 return scnprintf(bf, size, "CWD");
355
356         return syscall_arg__scnprintf_fd(bf, size, arg);
357 }
358
359 #define SCA_FDAT syscall_arg__scnprintf_fd_at
360
361 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
362                                               struct syscall_arg *arg);
363
364 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
365
/* Format the argument as hex with a 0x prefix. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Format the argument as a signed int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Format the argument as a signed long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
380
/* Command names for bpf(2), indexed from 0. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops; the op values start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer/setitimer 'which' values. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation codes. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence values; DATA/HOLE only where the host headers define them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* Classic fcntl(2) commands, numbered from 0. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/*
 * Linux-specific fcntl commands, numbered from F_LINUX_SPECIFIC_BASE;
 * the [5] designator skips a hole in the numbering before CANCELLK.
 */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl command ranges, probed in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set,pr}rlimit resource names. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask 'how' values. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clockid_t names for clock_gettime() and friends. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* Socket address family names, indexed from 0. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
461
/*
 * Pretty-print the access(2) mode argument: "F" for F_OK (0), otherwise
 * a concatenation of R/W/X for the set bits, with any leftover bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
486
487 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
488
489 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
490                                               struct syscall_arg *arg);
491
492 #define SCA_FILENAME syscall_arg__scnprintf_filename
493
/*
 * Decode pipe2(2) flags: known bits (O_CLOEXEC, O_NONBLOCK) print as
 * their names joined by '|', any leftover bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
514
515 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
516
517 #ifndef GRND_NONBLOCK
518 #define GRND_NONBLOCK   0x0001
519 #endif
520 #ifndef GRND_RANDOM
521 #define GRND_RANDOM     0x0002
522 #endif
523
/*
 * Decode getrandom(2) flags: GRND_RANDOM and GRND_NONBLOCK print as
 * their names joined by '|', any leftover bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
544
545 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
546
/*
 * Initializer for a syscall_arg_fmt entry that formats the argument via
 * SCA_STRARRAY with strarray__<array> as the lookup table.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
550
551 #include "trace/beauty/arch_errno_names.c"
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/futex_op.c"
554 #include "trace/beauty/futex_val3.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
566
/*
 * How to pretty-print one syscall argument: formatter callback, its
 * optional parameter (e.g. a strarray), an override for the argument
 * name, and whether zero values should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
573
574 static struct syscall_fmt {
575         const char *name;
576         const char *alias;
577         struct syscall_arg_fmt arg[6];
578         u8         nr_args;
579         bool       errpid;
580         bool       timeout;
581         bool       hexret;
582 } syscall_fmts[] = {
583         { .name     = "access",
584           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
585         { .name     = "bpf",
586           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
587         { .name     = "brk",        .hexret = true,
588           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
589         { .name     = "clock_gettime",
590           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
591         { .name     = "clone",      .errpid = true, .nr_args = 5,
592           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
593                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
594                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
595                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
596                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
597         { .name     = "close",
598           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
599         { .name     = "epoll_ctl",
600           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
601         { .name     = "eventfd2",
602           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
603         { .name     = "fchmodat",
604           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605         { .name     = "fchownat",
606           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
607         { .name     = "fcntl",
608           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
609                            .parm      = &strarrays__fcntl_cmds_arrays,
610                            .show_zero = true, },
611                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
612         { .name     = "flock",
613           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
614         { .name     = "fstat", .alias = "newfstat", },
615         { .name     = "fstatat", .alias = "newfstatat", },
616         { .name     = "futex",
617           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
618                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
619         { .name     = "futimesat",
620           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
621         { .name     = "getitimer",
622           .arg = { [0] = STRARRAY(which, itimers), }, },
623         { .name     = "getpid",     .errpid = true, },
624         { .name     = "getpgid",    .errpid = true, },
625         { .name     = "getppid",    .errpid = true, },
626         { .name     = "getrandom",
627           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
628         { .name     = "getrlimit",
629           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
630         { .name     = "gettid",     .errpid = true, },
631         { .name     = "ioctl",
632           .arg = {
633 #if defined(__i386__) || defined(__x86_64__)
634 /*
635  * FIXME: Make this available to all arches.
636  */
637                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
638                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #else
640                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
641 #endif
642         { .name     = "kcmp",       .nr_args = 5,
643           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
644                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
645                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
646                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
647                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
648         { .name     = "keyctl",
649           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
650         { .name     = "kill",
651           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
652         { .name     = "linkat",
653           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654         { .name     = "lseek",
655           .arg = { [2] = STRARRAY(whence, whences), }, },
656         { .name     = "lstat", .alias = "newlstat", },
657         { .name     = "madvise",
658           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
659                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
660         { .name     = "mkdirat",
661           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662         { .name     = "mknodat",
663           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
664         { .name     = "mlock",
665           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666         { .name     = "mlockall",
667           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
668         { .name     = "mmap",       .hexret = true,
669 /* The standard mmap maps to old_mmap on s390x */
670 #if defined(__s390x__)
671         .alias = "old_mmap",
672 #endif
673           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
674                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
675                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
676         { .name     = "mprotect",
677           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
678                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
679         { .name     = "mq_unlink",
680           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
681         { .name     = "mremap",     .hexret = true,
682           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
683                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
684                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
685         { .name     = "munlock",
686           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687         { .name     = "munmap",
688           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
689         { .name     = "name_to_handle_at",
690           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691         { .name     = "newfstatat",
692           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
693         { .name     = "open",
694           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
695         { .name     = "open_by_handle_at",
696           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
697                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
698         { .name     = "openat",
699           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
700                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
701         { .name     = "perf_event_open",
702           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
703                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
704                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
705         { .name     = "pipe2",
706           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
707         { .name     = "pkey_alloc",
708           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
709         { .name     = "pkey_free",
710           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
711         { .name     = "pkey_mprotect",
712           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
713                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
714                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
715         { .name     = "poll", .timeout = true, },
716         { .name     = "ppoll", .timeout = true, },
717         { .name     = "prctl", .alias = "arch_prctl",
718           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
719                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
720                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
721         { .name     = "pread", .alias = "pread64", },
722         { .name     = "preadv", .alias = "pread", },
723         { .name     = "prlimit64",
724           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
725         { .name     = "pwrite", .alias = "pwrite64", },
726         { .name     = "readlinkat",
727           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
728         { .name     = "recvfrom",
729           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730         { .name     = "recvmmsg",
731           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732         { .name     = "recvmsg",
733           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
734         { .name     = "renameat",
735           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
736         { .name     = "rt_sigaction",
737           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738         { .name     = "rt_sigprocmask",
739           .arg = { [0] = STRARRAY(how, sighow), }, },
740         { .name     = "rt_sigqueueinfo",
741           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742         { .name     = "rt_tgsigqueueinfo",
743           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
744         { .name     = "sched_setscheduler",
745           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
746         { .name     = "seccomp",
747           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
748                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
749         { .name     = "select", .timeout = true, },
750         { .name     = "sendmmsg",
751           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752         { .name     = "sendmsg",
753           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754         { .name     = "sendto",
755           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
756         { .name     = "set_tid_address", .errpid = true, },
757         { .name     = "setitimer",
758           .arg = { [0] = STRARRAY(which, itimers), }, },
759         { .name     = "setrlimit",
760           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
761         { .name     = "socket",
762           .arg = { [0] = STRARRAY(family, socket_families),
763                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
764                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
765         { .name     = "socketpair",
766           .arg = { [0] = STRARRAY(family, socket_families),
767                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
768                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
769         { .name     = "stat", .alias = "newstat", },
770         { .name     = "statx",
771           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
772                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
773                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
774         { .name     = "swapoff",
775           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
776         { .name     = "swapon",
777           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
778         { .name     = "symlinkat",
779           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
780         { .name     = "tgkill",
781           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
782         { .name     = "tkill",
783           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
784         { .name     = "uname", .alias = "newuname", },
785         { .name     = "unlinkat",
786           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
787         { .name     = "utimensat",
788           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
789         { .name     = "wait4",      .errpid = true,
790           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
791         { .name     = "waitid",     .errpid = true,
792           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
793 };
794
795 static int syscall_fmt__cmp(const void *name, const void *fmtp)
796 {
797         const struct syscall_fmt *fmt = fmtp;
798         return strcmp(name, fmt->name);
799 }
800
801 static struct syscall_fmt *syscall_fmt__find(const char *name)
802 {
803         const int nmemb = ARRAY_SIZE(syscall_fmts);
804         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
805 }
806
807 /*
808  * is_exit: is this "exit" or "exit_group"?
809  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
810  */
struct syscall {
	struct event_format *tp_format;	/* the sys_enter_<name> tracepoint format */
	int		    nr_args;	/* arg count, syscall-nr field excluded */
	bool		    is_exit;	/* "exit"/"exit_group", see comment above */
	bool		    is_open;	/* "open"/"openat", see comment above */
	struct format_field *args;	/* first real arg field ('__syscall_nr'/'nr' skipped) */
	const char          *name;
	struct syscall_fmt  *fmt;	/* optional entry from syscall_fmts[], NULL if none */
	struct syscall_arg_fmt *arg_fmt; /* per-argument formatters, nr_args entries */
};
821
822 /*
823  * We need to have this 'calculated' boolean because in some cases we really
824  * don't know what is the duration of a syscall, for instance, when we start
825  * a session and some threads are waiting for a syscall to finish, say 'poll',
826  * in which case all we can do is to print "( ? ) for duration and for the
827  * start timestamp.
828  */
829 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
830 {
831         double duration = (double)t / NSEC_PER_MSEC;
832         size_t printed = fprintf(fp, "(");
833
834         if (!calculated)
835                 printed += fprintf(fp, "         ");
836         else if (duration >= 1.0)
837                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
838         else if (duration >= 0.01)
839                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
840         else
841                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
842         return printed + fprintf(fp, "): ");
843 }
844
845 /**
846  * filename.ptr: The filename char pointer that will be vfs_getname'd
847  * filename.entry_str_pos: Where to insert the string translated from
848  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
849  * ret_scnprintf: syscall args may set this to a different syscall return
850  *                formatter, for instance, fcntl may return fds, file flags, etc.
851  */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter, 0 = unknown */
	bool		  entry_pending; /* sys_enter formatted, awaiting its sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* page fault counts (presumably major/minor, cf. TRACE_PFMAJ/PFMIN) */
	char		  *entry_str;	/* buffer where the sys_enter line is assembled */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;	/* see comment above: pointer vfs_getname will resolve */
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd with a cached path, -1 when empty */
		char	  **table;	/* fd -> pathname cache, grown on demand */
	} paths;

	struct intlist *syscall_stats;	/* per syscall id -> struct stats */
};
874
875 static struct thread_trace *thread_trace__new(void)
876 {
877         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
878
879         if (ttrace)
880                 ttrace->paths.max = -1;
881
882         ttrace->syscall_stats = intlist__new(NULL);
883
884         return ttrace;
885 }
886
887 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
888 {
889         struct thread_trace *ttrace;
890
891         if (thread == NULL)
892                 goto fail;
893
894         if (thread__priv(thread) == NULL)
895                 thread__set_priv(thread, thread_trace__new());
896
897         if (thread__priv(thread) == NULL)
898                 goto fail;
899
900         ttrace = thread__priv(thread);
901         ++ttrace->nr_events;
902
903         return ttrace;
904 fail:
905         color_fprintf(fp, PERF_COLOR_RED,
906                       "WARNING: not enough memory, dropping samples!\n");
907         return NULL;
908 }
909
910
/*
 * Lets an argument formatter (e.g. fcntl's 'cmd', see the struct
 * thread_trace comment) install a formatter for this syscall instance's
 * return value.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
                                    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	/* NOTE(review): assumes arg->thread already has thread_trace state — confirm callers. */
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
918
/* Page fault tracing selection bits (major/minor faults). */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer used to assemble the sys_enter line. */
static const size_t trace__entry_str_size = 2048;
923
924 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
925 {
926         struct thread_trace *ttrace = thread__priv(thread);
927
928         if (fd > ttrace->paths.max) {
929                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
930
931                 if (npath == NULL)
932                         return -1;
933
934                 if (ttrace->paths.max != -1) {
935                         memset(npath + ttrace->paths.max + 1, 0,
936                                (fd - ttrace->paths.max) * sizeof(char *));
937                 } else {
938                         memset(npath, 0, (fd + 1) * sizeof(char *));
939                 }
940
941                 ttrace->paths.table = npath;
942                 ttrace->paths.max   = fd;
943         }
944
945         ttrace->paths.table[fd] = strdup(pathname);
946
947         return ttrace->paths.table[fd] != NULL ? 0 : -1;
948 }
949
950 static int thread__read_fd_path(struct thread *thread, int fd)
951 {
952         char linkname[PATH_MAX], pathname[PATH_MAX];
953         struct stat st;
954         int ret;
955
956         if (thread->pid_ == thread->tid) {
957                 scnprintf(linkname, sizeof(linkname),
958                           "/proc/%d/fd/%d", thread->pid_, fd);
959         } else {
960                 scnprintf(linkname, sizeof(linkname),
961                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
962         }
963
964         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
965                 return -1;
966
967         ret = readlink(linkname, pathname, sizeof(pathname));
968
969         if (ret < 0 || ret > st.st_size)
970                 return -1;
971
972         pathname[ret] = '\0';
973         return trace__set_fd_pathname(thread, fd, pathname);
974 }
975
976 static const char *thread__fd_path(struct thread *thread, int fd,
977                                    struct trace *trace)
978 {
979         struct thread_trace *ttrace = thread__priv(thread);
980
981         if (ttrace == NULL)
982                 return NULL;
983
984         if (fd < 0)
985                 return NULL;
986
987         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
988                 if (!trace->live)
989                         return NULL;
990                 ++trace->stats.proc_getname;
991                 if (thread__read_fd_path(thread, fd))
992                         return NULL;
993         }
994
995         return ttrace->paths.table[fd];
996 }
997
998 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
999 {
1000         int fd = arg->val;
1001         size_t printed = scnprintf(bf, size, "%d", fd);
1002         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1003
1004         if (path)
1005                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1006
1007         return printed;
1008 }
1009
1010 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1011 {
1012         size_t printed = scnprintf(bf, size, "%d", fd);
1013         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1014
1015         if (thread) {
1016                 const char *path = thread__fd_path(thread, fd, trace);
1017
1018                 if (path)
1019                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1020
1021                 thread__put(thread);
1022         }
1023
1024         return printed;
1025 }
1026
1027 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1028                                               struct syscall_arg *arg)
1029 {
1030         int fd = arg->val;
1031         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1032         struct thread_trace *ttrace = thread__priv(arg->thread);
1033
1034         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1035                 zfree(&ttrace->paths.table[fd]);
1036
1037         return printed;
1038 }
1039
/*
 * Remember which filename pointer vfs_getname is about to resolve and where
 * in the thread's entry_str the resulting string should be spliced in
 * (see the struct thread_trace 'filename' comment).
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1048
1049 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1050                                               struct syscall_arg *arg)
1051 {
1052         unsigned long ptr = arg->val;
1053
1054         if (!arg->trace->vfs_getname)
1055                 return scnprintf(bf, size, "%#x", ptr);
1056
1057         thread__set_filename_pos(arg->thread, bf, ptr);
1058         return 0;
1059 }
1060
/*
 * true when the event should be filtered out: it ran for less than the
 * configured duration threshold ('t' is in nanoseconds, duration_filter in
 * milliseconds).
 */
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}
1065
1066 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1067 {
1068         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1069
1070         return fprintf(fp, "%10.3f ", ts);
1071 }
1072
1073 /*
1074  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1075  * using ttrace->entry_time for a thread that receives a sys_exit without
1076  * first having received a sys_enter ("poll" issued before tracing session
1077  * starts, lost sys_enter exit due to ring buffer overflow).
1078  */
1079 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1080 {
1081         if (tstamp > 0)
1082                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1083
1084         return fprintf(fp, "         ? ");
1085 }
1086
/*
 * Fix: these flags are written from signal context and polled by the main
 * loop, so they must be 'volatile' or the compiler may legally cache them
 * in a register and never observe the handler's store.  (Strictly, the C
 * standard only blesses volatile sig_atomic_t here; volatile bool is the
 * pragmatic fix that keeps every existing reader source-compatible.)
 */
static volatile bool done	 = false;
static volatile bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	/* Remember whether it was ^C (SIGINT) vs. e.g. SIGTERM/SIGCHLD. */
	interrupted = sig == SIGINT;
}
1095
1096 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1097                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1098 {
1099         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1100         printed += fprintf_duration(duration, duration_calculated, fp);
1101
1102         if (trace->multiple_threads) {
1103                 if (trace->show_comm)
1104                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1105                 printed += fprintf(fp, "%d ", thread->tid);
1106         }
1107
1108         return printed;
1109 }
1110
1111 static int trace__process_event(struct trace *trace, struct machine *machine,
1112                                 union perf_event *event, struct perf_sample *sample)
1113 {
1114         int ret = 0;
1115
1116         switch (event->header.type) {
1117         case PERF_RECORD_LOST:
1118                 color_fprintf(trace->output, PERF_COLOR_RED,
1119                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1120                 ret = machine__process_lost_event(machine, event, sample);
1121                 break;
1122         default:
1123                 ret = machine__process_event(machine, event, sample);
1124                 break;
1125         }
1126
1127         return ret;
1128 }
1129
/* perf_tool callback: recover the enclosing struct trace and forward. */
static int trace__tool_process(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample,
                               struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1138
1139 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1140 {
1141         struct machine *machine = vmachine;
1142
1143         if (machine->kptr_restrict_warned)
1144                 return NULL;
1145
1146         if (symbol_conf.kptr_restrict) {
1147                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1148                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1149                            "Kernel samples will not be resolved.\n");
1150                 machine->kptr_restrict_warned = true;
1151                 return NULL;
1152         }
1153
1154         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1155 }
1156
/*
 * Initialize the symbol machinery and a host machine, then synthesize
 * records for already-running threads so they are known to the session.
 * Returns 0 on success, a negative error otherwise.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	/* NOTE(review): trace->host is not torn down on error here — confirm callers clean up. */
	if (err)
		symbol__exit();

	return err;
}
1181
/* Tear down what trace__symbols_init() set up. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1189
1190 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1191 {
1192         int idx;
1193
1194         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1195                 nr_args = sc->fmt->nr_args;
1196
1197         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1198         if (sc->arg_fmt == NULL)
1199                 return -1;
1200
1201         for (idx = 0; idx < nr_args; ++idx) {
1202                 if (sc->fmt)
1203                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1204         }
1205
1206         sc->nr_args = nr_args;
1207         return 0;
1208 }
1209
/*
 * Pick a default pretty-printer for each tracepoint argument based on its
 * type and name, for those args no syscall_fmts[] entry already covers.
 * Note the branch order matters: e.g. the pointer check precedes the
 * name-based fd heuristic.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from syscall_fmts[] wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1248
/*
 * Lazily fill trace->syscalls.table[id] with everything needed to pretty
 * print syscall 'id': name, tracepoint format, per-arg formatters, flags.
 * Returns 0 on success, -1 when the id is unknown, the format can't be
 * read, or memory runs out.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table on demand, zero-filling the newly exposed slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are only found via an alias, e.g. "stat" -> "newstat". */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* With no readable format, assume the 6-register maximum for the arg fmts. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
1310
/*
 * Turn the user supplied event qualifier list (syscall names and/or globs)
 * into an array of syscall ids in trace->ev_qualifier_ids.  Every invalid
 * name is reported in one combined error message before failing.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	/* Start with one slot per list entry; globs may force growth below. */
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Accumulate all bad names into one error line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* A glob may expand to several syscalls: collect the rest. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1386
1387 /*
1388  * args is to be interpreted as a series of longs but we need to handle
1389  * 8-byte unaligned accesses. args points to raw_data within the event
1390  * and raw_data is guaranteed to be 8-byte unaligned because it is
1391  * preceded by raw_size which is a u32. So we need to copy args to a temp
1392  * variable to read it. Most notably this avoids extended load instructions
1393  * on unaligned addresses
1394  */
1395 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1396 {
1397         unsigned long val;
1398         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1399
1400         memcpy(&val, p, sizeof(val));
1401         return val;
1402 }
1403
1404 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1405                                       struct syscall_arg *arg)
1406 {
1407         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1408                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1409
1410         return scnprintf(bf, size, "arg%d: ", arg->idx);
1411 }
1412
1413 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1414                                      struct syscall_arg *arg, unsigned long val)
1415 {
1416         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1417                 arg->val = val;
1418                 if (sc->arg_fmt[arg->idx].parm)
1419                         arg->parm = sc->arg_fmt[arg->idx].parm;
1420                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1421         }
1422         return scnprintf(bf, size, "%ld", val);
1423 }
1424
/*
 * Format all of 'sc's arguments from the raw 'args' buffer into bf,
 * returning the number of bytes written.  Falls back to raw positional
 * printing when the tracepoint format couldn't be read.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.mask's bit for the current argument */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace	= trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Masked args were presumably consumed by an earlier formatter — see beauty/. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1497
/* Handler invoked for each tracepoint sample of interest. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1501
/*
 * Return the struct syscall for 'id', reading its info lazily on first use.
 * Returns NULL (with a diagnostic at sufficient verbosity) when the id is
 * invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Unknown so far? Read its info, then re-check that it took. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1544
1545 static void thread__update_stats(struct thread_trace *ttrace,
1546                                  int id, struct perf_sample *sample)
1547 {
1548         struct int_node *inode;
1549         struct stats *stats;
1550         u64 duration = 0;
1551
1552         inode = intlist__findnew(ttrace->syscall_stats, id);
1553         if (inode == NULL)
1554                 return;
1555
1556         stats = inode->priv;
1557         if (stats == NULL) {
1558                 stats = malloc(sizeof(struct stats));
1559                 if (stats == NULL)
1560                         return;
1561                 init_stats(stats);
1562                 inode->priv = stats;
1563         }
1564
1565         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1566                 duration = sample->time - ttrace->entry_time;
1567
1568         update_stats(stats, duration);
1569 }
1570
1571 static int trace__printf_interrupted_entry(struct trace *trace)
1572 {
1573         struct thread_trace *ttrace;
1574         size_t printed;
1575
1576         if (trace->failure_only || trace->current == NULL)
1577                 return 0;
1578
1579         ttrace = thread__priv(trace->current);
1580
1581         if (!ttrace->entry_pending)
1582                 return 0;
1583
1584         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1585         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1586         ttrace->entry_pending = false;
1587
1588         return printed;
1589 }
1590
1591 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1592                                  struct perf_sample *sample, struct thread *thread)
1593 {
1594         int printed = 0;
1595
1596         if (trace->print_sample) {
1597                 double ts = (double)sample->time / NSEC_PER_MSEC;
1598
1599                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1600                                    perf_evsel__name(evsel), ts,
1601                                    thread__comm_str(thread),
1602                                    sample->pid, sample->tid, sample->cpu);
1603         }
1604
1605         return printed;
1606 }
1607
/*
 * Handle a raw_syscalls:sys_enter sample: look up the syscall descriptor,
 * stage "name(args" into the per-thread ttrace->entry_str buffer and, for
 * most syscalls, defer printing until the matching sys_exit arrives so the
 * return value can complete the line.  Syscalls flagged is_exit never
 * produce a sys_exit, so their line is printed immediately here.
 *
 * Returns 0 on success, -1 when the syscall descriptor or per-thread state
 * cannot be obtained.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
                            union perf_event *event __maybe_unused,
                            struct perf_sample *sample)
{
        char *msg;
        void *args;
        size_t printed = 0;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;

        if (sc == NULL)
                return -1;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        trace__fprintf_sample(trace, evsel, sample, thread);

        /* Raw pointer to the tracepoint's args array, fed to the beautifiers. */
        args = perf_evsel__sc_tp_ptr(evsel, args, sample);

        /* Lazily allocate the buffer where the entry line is staged. */
        if (ttrace->entry_str == NULL) {
                ttrace->entry_str = malloc(trace__entry_str_size);
                if (!ttrace->entry_str)
                        goto out_put;
        }

        /* Finish any half-printed entry from another thread first. */
        if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
                trace__printf_interrupted_entry(trace);

        ttrace->entry_time = sample->time;
        msg = ttrace->entry_str;
        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

        printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
                                           args, trace, thread);

        if (sc->is_exit) {
                /* No sys_exit will come: emit the full line right away. */
                if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
                ttrace->entry_pending = true;
                /* See trace__vfs_getname & trace__sys_exit */
                ttrace->filename.pending_open = false;
        }

        /* Remember the last thread seen, for interrupted-entry handling. */
        if (trace->current != thread) {
                thread__put(trace->current);
                trace->current = thread__get(thread);
        }
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1668
1669 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1670                                     struct perf_sample *sample)
1671 {
1672         struct format_field *field = perf_evsel__field(evsel, "__syscall_nr");
1673         struct thread_trace *ttrace;
1674         struct thread *thread;
1675         struct syscall *sc;
1676         char msg[1024];
1677         int id, err = -1;
1678         void *args;
1679
1680         if (field == NULL)
1681                 return -1;
1682
1683         id = format_field__intval(field, sample, evsel->needs_swap);
1684         sc = trace__syscall_info(trace, evsel, id);
1685
1686         if (sc == NULL)
1687                 return -1;
1688
1689         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1690         ttrace = thread__trace(thread, trace->output);
1691         /*
1692          * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1693          * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1694          */
1695         if (ttrace == NULL)
1696                 goto out_put;
1697
1698         args = sample->raw_data + field->offset + sizeof(u64); /* skip __syscall_nr, there is where args are */
1699         syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1700         fprintf(trace->output, "%s", msg);
1701         err = 0;
1702 out_put:
1703         thread__put(thread);
1704         return err;
1705 }
1706
1707 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1708                                     struct perf_sample *sample,
1709                                     struct callchain_cursor *cursor)
1710 {
1711         struct addr_location al;
1712         int max_stack = evsel->attr.sample_max_stack ?
1713                         evsel->attr.sample_max_stack :
1714                         trace->max_stack;
1715
1716         if (machine__resolve(trace->host, &al, sample) < 0 ||
1717             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1718                 return -1;
1719
1720         return 0;
1721 }
1722
1723 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1724 {
1725         /* TODO: user-configurable print_opts */
1726         const unsigned int print_opts = EVSEL__PRINT_SYM |
1727                                         EVSEL__PRINT_DSO |
1728                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1729
1730         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1731 }
1732
/*
 * Translate an errno value into its symbolic name (e.g. "ENOENT") using
 * the errno table of the architecture the perf.data/session came from,
 * not necessarily the host's.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
        return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1740
/*
 * Handle a raw_syscalls:sys_exit sample: compute the syscall duration from
 * the entry timestamp saved by trace__sys_enter, update summary stats, and
 * complete the pending "name(args" line with the formatted return value.
 * The return value is rendered per the syscall's fmt descriptor: errno
 * name+message for failures, pointer-style hex, pid (+comm), "Timeout",
 * a custom scnprintf, or a plain signed integer.
 *
 * Returns 0 on success, -1 when syscall info or per-thread state is missing.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
                           union perf_event *event __maybe_unused,
                           struct perf_sample *sample)
{
        long ret;
        u64 duration = 0;
        bool duration_calculated = false;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;

        if (sc == NULL)
                return -1;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        trace__fprintf_sample(trace, evsel, sample, thread);

        if (trace->summary)
                thread__update_stats(ttrace, id, sample);

        ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

        /* A successful open-like syscall: remember fd -> pathname. */
        if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
                trace__set_fd_pathname(thread, ret, ttrace->filename.name);
                ttrace->filename.pending_open = false;
                ++trace->stats.vfs_getname;
        }

        /* Compute duration and apply the --duration filter. */
        if (ttrace->entry_time) {
                duration = sample->time - ttrace->entry_time;
                if (trace__filter_duration(trace, duration))
                        goto out;
                duration_calculated = true;
        } else if (trace->duration_filter)
                goto out;

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Skip shallow stacks when --min-stack was given. */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        }

        if (trace->summary_only || (ret >= 0 && trace->failure_only))
                goto out;

        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

        if (ttrace->entry_pending) {
                fprintf(trace->output, "%-70s", ttrace->entry_str);
        } else {
                /* The entry was printed (interrupted) earlier or never seen. */
                fprintf(trace->output, " ... [");
                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
                fprintf(trace->output, "]: %s()", sc->name);
        }

        /*
         * Note: errno_print is a label inside the following block; the
         * "goto errno_print" below jumps into it on purpose so syscalls
         * without a fmt descriptor still get errno decoding.
         */
        if (sc->fmt == NULL) {
                if (ret < 0)
                        goto errno_print;
signed_print:
                fprintf(trace->output, ") = %ld", ret);
        } else if (ret < 0) {
errno_print: {
                char bf[STRERR_BUFSIZE];
                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
                           *e = errno_to_name(evsel, -ret);

                fprintf(trace->output, ") = -1 %s %s", e, emsg);
        }
        } else if (ret == 0 && sc->fmt->timeout)
                fprintf(trace->output, ") = 0 Timeout");
        else if (ttrace->ret_scnprintf) {
                /* One-shot custom formatter installed by an arg beautifier. */
                char bf[1024];
                struct syscall_arg arg = {
                        .val    = ret,
                        .thread = thread,
                        .trace  = trace,
                };
                ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
                ttrace->ret_scnprintf = NULL;
                fprintf(trace->output, ") = %s", bf);
        } else if (sc->fmt->hexret)
                fprintf(trace->output, ") = %#lx", ret);
        else if (sc->fmt->errpid) {
                /* Return value is a pid (fork/wait style): show its comm. */
                struct thread *child = machine__find_thread(trace->host, ret, ret);

                if (child != NULL) {
                        fprintf(trace->output, ") = %ld", ret);
                        if (child->comm_set)
                                fprintf(trace->output, " (%s)", thread__comm_str(child));
                        thread__put(child);
                }
        } else
                goto signed_print;

        fputc('\n', trace->output);

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        ttrace->entry_pending = false;
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1856
/*
 * Handle a probe:vfs_getname sample: capture the pathname being resolved
 * and (a) stash it so a successful open-like sys_exit can associate it
 * with the returned fd, and (b) if the in-flight sys_enter line printed a
 * bare pointer for this filename argument, splice the real pathname into
 * the staged entry_str at the position recorded in filename.entry_str_pos.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
                              union perf_event *event __maybe_unused,
                              struct perf_sample *sample)
{
        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        struct thread_trace *ttrace;
        size_t filename_len, entry_str_len, to_move;
        ssize_t remaining_space;
        char *pos;
        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

        if (!thread)
                goto out;

        ttrace = thread__priv(thread);
        if (!ttrace)
                goto out_put;

        filename_len = strlen(filename);
        if (filename_len == 0)
                goto out_put;

        /* Grow the per-thread copy buffer if this pathname is longer. */
        if (ttrace->filename.namelen < filename_len) {
                char *f = realloc(ttrace->filename.name, filename_len + 1);

                if (f == NULL)
                        goto out_put;

                ttrace->filename.namelen = filename_len;
                ttrace->filename.name = f;
        }

        strcpy(ttrace->filename.name, filename);
        ttrace->filename.pending_open = true;

        /* No pointer was printed in the entry line: nothing to splice. */
        if (!ttrace->filename.ptr)
                goto out_put;

        entry_str_len = strlen(ttrace->entry_str);
        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
        if (remaining_space <= 0)
                goto out_put;

        /* Keep only the tail of the pathname if it doesn't fully fit. */
        if (filename_len > (size_t)remaining_space) {
                filename += filename_len - remaining_space;
                filename_len = remaining_space;
        }

        /* Shift the text after the insertion point, then drop the name in. */
        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
        memmove(pos + filename_len, pos, to_move);
        memcpy(pos, filename, filename_len);

        ttrace->filename.ptr = 0;
        ttrace->filename.entry_str_pos = 0;
out_put:
        thread__put(thread);
out:
        return 0;
}
1917
1918 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1919                                      union perf_event *event __maybe_unused,
1920                                      struct perf_sample *sample)
1921 {
1922         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1923         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1924         struct thread *thread = machine__findnew_thread(trace->host,
1925                                                         sample->pid,
1926                                                         sample->tid);
1927         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1928
1929         if (ttrace == NULL)
1930                 goto out_dump;
1931
1932         ttrace->runtime_ms += runtime_ms;
1933         trace->runtime_ms += runtime_ms;
1934 out_put:
1935         thread__put(thread);
1936         return 0;
1937
1938 out_dump:
1939         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1940                evsel->name,
1941                perf_evsel__strval(evsel, sample, "comm"),
1942                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1943                runtime,
1944                perf_evsel__intval(evsel, sample, "vruntime"));
1945         goto out_put;
1946 }
1947
1948 static int bpf_output__printer(enum binary_printer_ops op,
1949                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1950 {
1951         unsigned char ch = (unsigned char)val;
1952
1953         switch (op) {
1954         case BINARY_PRINT_CHAR_DATA:
1955                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1956         case BINARY_PRINT_DATA_BEGIN:
1957         case BINARY_PRINT_LINE_BEGIN:
1958         case BINARY_PRINT_ADDR:
1959         case BINARY_PRINT_NUM_DATA:
1960         case BINARY_PRINT_NUM_PAD:
1961         case BINARY_PRINT_SEP:
1962         case BINARY_PRINT_CHAR_PAD:
1963         case BINARY_PRINT_LINE_END:
1964         case BINARY_PRINT_DATA_END:
1965         default:
1966                 break;
1967         }
1968
1969         return 0;
1970 }
1971
1972 static void bpf_output__fprintf(struct trace *trace,
1973                                 struct perf_sample *sample)
1974 {
1975         binary__fprintf(sample->raw_data, sample->raw_size, 8,
1976                         bpf_output__printer, NULL, trace->output);
1977 }
1978
/*
 * Generic handler for --event tracepoints and BPF output events that have
 * no dedicated handler: print timestamp, event name and payload.  A
 * syscalls:sys_enter_NAME tracepoint gets its args beautified; anything
 * else falls back to libtraceevent's generic field printer.
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                union perf_event *event __maybe_unused,
                                struct perf_sample *sample)
{
        int callchain_ret = 0;

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Respect --min-stack: skip shallow callchains. */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        }

        trace__printf_interrupted_entry(trace);
        trace__fprintf_tstamp(trace, sample->time, trace->output);

        /* Placeholder column to line up with syscall duration output. */
        if (trace->trace_syscalls)
                fprintf(trace->output, "(         ): ");

        fprintf(trace->output, "%s:", evsel->name);

        if (perf_evsel__is_bpf_output(evsel)) {
                bpf_output__fprintf(trace, sample);
        } else if (evsel->tp_format) {
                /*
                 * "sys_enter_" prefix (strncmp != 0 means no match): use the
                 * syscall arg beautifier, else/on failure the generic printer.
                 */
                if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
                    trace__fprintf_sys_enter(trace, evsel, sample)) {
                        event_format__fprintf(evsel->tp_format, sample->cpu,
                                              sample->raw_data, sample->raw_size,
                                              trace->output);
                }
        }

        fprintf(trace->output, "\n");

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        return 0;
}
2022
2023 static void print_location(FILE *f, struct perf_sample *sample,
2024                            struct addr_location *al,
2025                            bool print_dso, bool print_sym)
2026 {
2027
2028         if ((verbose > 0 || print_dso) && al->map)
2029                 fprintf(f, "%s@", al->map->dso->long_name);
2030
2031         if ((verbose > 0 || print_sym) && al->sym)
2032                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2033                         al->addr - al->sym->start);
2034         else if (al->map)
2035                 fprintf(f, "0x%" PRIx64, al->addr);
2036         else
2037                 fprintf(f, "0x%" PRIx64, sample->addr);
2038 }
2039
/*
 * Handle a page fault software event: count it per thread (major/minor),
 * then print "maj/minfault [ip-location] => addr-location (type level)".
 * Returns 0 on success, -1 when per-thread state can't be set up.
 */
static int trace__pgfault(struct trace *trace,
                          struct perf_evsel *evsel,
                          union perf_event *event __maybe_unused,
                          struct perf_sample *sample)
{
        struct thread *thread;
        struct addr_location al;
        char map_type = 'd';
        struct thread_trace *ttrace;
        int err = -1;
        int callchain_ret = 0;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Respect --min-stack: skip shallow callchains. */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out_put;
                        callchain_ret = 1;
                }
        }

        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
                ttrace->pfmaj++;
        else
                ttrace->pfmin++;

        if (trace->summary_only)
                goto out;

        /* Where the faulting instruction is. */
        thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

        fprintf(trace->output, "%sfault [",
                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
                "maj" : "min");

        print_location(trace->output, sample, &al, false, true);

        fprintf(trace->output, "] => ");

        /* Where the faulting data address lives. */
        thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

        if (!al.map) {
                /*
                 * NOTE(review): this repeats the exact lookup above with the
                 * same arguments; historically the second lookup searched a
                 * different map type (executable vs variable), which is
                 * presumably why 'x' is assigned on success — confirm whether
                 * the retry still serves a purpose after the maps unification.
                 */
                thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

                if (al.map)
                        map_type = 'x';
                else
                        map_type = '?';
        }

        print_location(trace->output, sample, &al, true, false);

        fprintf(trace->output, " (%c%c)\n", map_type, al.level);

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
2112
2113 static void trace__set_base_time(struct trace *trace,
2114                                  struct perf_evsel *evsel,
2115                                  struct perf_sample *sample)
2116 {
2117         /*
2118          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2119          * and don't use sample->time unconditionally, we may end up having
2120          * some other event in the future without PERF_SAMPLE_TIME for good
2121          * reason, i.e. we may not be interested in its timestamps, just in
2122          * it taking place, picking some piece of information when it
2123          * appears in our event stream (vfs_getname comes to mind).
2124          */
2125         if (trace->base_time == 0 && !trace->full_time &&
2126             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2127                 trace->base_time = sample->time;
2128 }
2129
2130 static int trace__process_sample(struct perf_tool *tool,
2131                                  union perf_event *event,
2132                                  struct perf_sample *sample,
2133                                  struct perf_evsel *evsel,
2134                                  struct machine *machine __maybe_unused)
2135 {
2136         struct trace *trace = container_of(tool, struct trace, tool);
2137         struct thread *thread;
2138         int err = 0;
2139
2140         tracepoint_handler handler = evsel->handler;
2141
2142         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2143         if (thread && thread__is_filtered(thread))
2144                 goto out;
2145
2146         trace__set_base_time(trace, evsel, sample);
2147
2148         if (handler) {
2149                 ++trace->nr_events;
2150                 handler(trace, evsel, event, sample);
2151         }
2152 out:
2153         thread__put(thread);
2154         return err;
2155 }
2156
2157 static int trace__record(struct trace *trace, int argc, const char **argv)
2158 {
2159         unsigned int rec_argc, i, j;
2160         const char **rec_argv;
2161         const char * const record_args[] = {
2162                 "record",
2163                 "-R",
2164                 "-m", "1024",
2165                 "-c", "1",
2166         };
2167
2168         const char * const sc_args[] = { "-e", };
2169         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2170         const char * const majpf_args[] = { "-e", "major-faults" };
2171         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2172         const char * const minpf_args[] = { "-e", "minor-faults" };
2173         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2174
2175         /* +1 is for the event string below */
2176         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2177                 majpf_args_nr + minpf_args_nr + argc;
2178         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2179
2180         if (rec_argv == NULL)
2181                 return -ENOMEM;
2182
2183         j = 0;
2184         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2185                 rec_argv[j++] = record_args[i];
2186
2187         if (trace->trace_syscalls) {
2188                 for (i = 0; i < sc_args_nr; i++)
2189                         rec_argv[j++] = sc_args[i];
2190
2191                 /* event string may be different for older kernels - e.g., RHEL6 */
2192                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2193                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2194                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2195                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2196                 else {
2197                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2198                         free(rec_argv);
2199                         return -1;
2200                 }
2201         }
2202
2203         if (trace->trace_pgfaults & TRACE_PFMAJ)
2204                 for (i = 0; i < majpf_args_nr; i++)
2205                         rec_argv[j++] = majpf_args[i];
2206
2207         if (trace->trace_pgfaults & TRACE_PFMIN)
2208                 for (i = 0; i < minpf_args_nr; i++)
2209                         rec_argv[j++] = minpf_args[i];
2210
2211         for (i = 0; i < (unsigned int)argc; i++)
2212                 rec_argv[j++] = argv[i];
2213
2214         return cmd_record(j, rec_argv);
2215 }
2216
2217 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2218
2219 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2220 {
2221         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2222
2223         if (IS_ERR(evsel))
2224                 return false;
2225
2226         if (perf_evsel__field(evsel, "pathname") == NULL) {
2227                 perf_evsel__delete(evsel);
2228                 return false;
2229         }
2230
2231         evsel->handler = trace__vfs_getname;
2232         perf_evlist__add(evlist, evsel);
2233         return true;
2234 }
2235
2236 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2237 {
2238         struct perf_evsel *evsel;
2239         struct perf_event_attr attr = {
2240                 .type = PERF_TYPE_SOFTWARE,
2241                 .mmap_data = 1,
2242         };
2243
2244         attr.config = config;
2245         attr.sample_period = 1;
2246
2247         event_attr_init(&attr);
2248
2249         evsel = perf_evsel__new(&attr);
2250         if (evsel)
2251                 evsel->handler = trace__pgfault;
2252
2253         return evsel;
2254 }
2255
2256 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2257 {
2258         const u32 type = event->header.type;
2259         struct perf_evsel *evsel;
2260
2261         if (type != PERF_RECORD_SAMPLE) {
2262                 trace__process_event(trace, trace->host, event, sample);
2263                 return;
2264         }
2265
2266         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2267         if (evsel == NULL) {
2268                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2269                 return;
2270         }
2271
2272         trace__set_base_time(trace, evsel, sample);
2273
2274         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2275             sample->raw_data == NULL) {
2276                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2277                        perf_evsel__name(evsel), sample->tid,
2278                        sample->cpu, sample->raw_size);
2279         } else {
2280                 tracepoint_handler handler = evsel->handler;
2281                 handler(trace, evsel, event, sample);
2282         }
2283 }
2284
2285 static int trace__add_syscall_newtp(struct trace *trace)
2286 {
2287         int ret = -1;
2288         struct perf_evlist *evlist = trace->evlist;
2289         struct perf_evsel *sys_enter, *sys_exit;
2290
2291         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2292         if (sys_enter == NULL)
2293                 goto out;
2294
2295         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2296                 goto out_delete_sys_enter;
2297
2298         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2299         if (sys_exit == NULL)
2300                 goto out_delete_sys_enter;
2301
2302         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2303                 goto out_delete_sys_exit;
2304
2305         perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2306         perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2307
2308         perf_evlist__add(evlist, sys_enter);
2309         perf_evlist__add(evlist, sys_exit);
2310
2311         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2312                 /*
2313                  * We're interested only in the user space callchain
2314                  * leading to the syscall, allow overriding that for
2315                  * debugging reasons using --kernel_syscall_callchains
2316                  */
2317                 sys_exit->attr.exclude_callchain_kernel = 1;
2318         }
2319
2320         trace->syscalls.events.sys_enter = sys_enter;
2321         trace->syscalls.events.sys_exit  = sys_exit;
2322
2323         ret = 0;
2324 out:
2325         return ret;
2326
2327 out_delete_sys_exit:
2328         perf_evsel__delete_priv(sys_exit);
2329 out_delete_sys_enter:
2330         perf_evsel__delete_priv(sys_enter);
2331         goto out;
2332 }
2333
2334 static int trace__set_ev_qualifier_filter(struct trace *trace)
2335 {
2336         int err = -1;
2337         struct perf_evsel *sys_exit;
2338         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2339                                                 trace->ev_qualifier_ids.nr,
2340                                                 trace->ev_qualifier_ids.entries);
2341
2342         if (filter == NULL)
2343                 goto out_enomem;
2344
2345         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2346                                           filter)) {
2347                 sys_exit = trace->syscalls.events.sys_exit;
2348                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2349         }
2350
2351         free(filter);
2352 out:
2353         return err;
2354 out_enomem:
2355         errno = ENOMEM;
2356         goto out;
2357 }
2358
/*
 * Avoid feedback loops when tracing system wide: filter out our own pid
 * and, when running over ssh, the sshd ancestor that relays our output
 * (every byte we print would otherwise generate syscalls we then trace).
 * Walks up the parent chain looking for "sshd", collecting at most
 * ARRAY_SIZE(pids) pids to filter.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
        unsigned int nr = 1;
        pid_t pids[32] = {
                getpid(),
        };
        /*
         * NOTE(review): machine__find_thread() presumably returns a
         * reference-counted thread; neither it nor the parents looked up
         * below are thread__put() here — confirm whether these lookups
         * grab references that should be dropped.
         */
        struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

        while (thread && nr < ARRAY_SIZE(pids)) {
                struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

                if (parent == NULL)
                        break;

                /* Found the relaying sshd: filter it too and stop. */
                if (!strcmp(thread__comm_str(parent), "sshd")) {
                        pids[nr++] = parent->tid;
                        break;
                }
                thread = parent;
        }

        return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2382
/*
 * Live tracing mode: set up the evlist (raw_syscalls tracepoints, optional
 * vfs_getname probe, page fault and sched_stat_runtime events), optionally
 * fork the workload, mmap the ring buffers and then consume/print events
 * until interrupted or the workload finishes, emitting a summary at the end
 * if requested.
 *
 * Returns 0 on success or a negative error; on any path the evlist is
 * deleted and trace->live is cleared.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* remaining argv is the workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, enabling is deferred until after the workload starts. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Main consume loop: drain every mmap'ed buffer, then poll for more. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Workload exited: stop producing, keep draining. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The error labels below live in a block placed after the return so that
 * they can share a single errbuf; each formats a message and jumps back to
 * the common out_delete_evlist cleanup above.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2659
/*
 * Replay mode (perf trace -i file): process a previously recorded perf.data
 * file, wiring up the sample/side-band handlers and the raw_syscalls (or, on
 * older kernels, syscalls) sys_{enter,exit} tracepoints plus page fault
 * software events, then run the session and optionally print a summary.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route all page fault software events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2760
/* Emit the banner that precedes the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2769
/*
 * rb_resort.h machinery: defines a 'syscall_stats' resorted rb tree ordered
 * by total msecs spent, descending, with the extra per-entry fields listed
 * in the macro arguments.  The braced body is the entry initializer, run for
 * each node 'nd' of the source intlist (syscall id -> struct stats).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr calls * average; converted from ns to ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2783
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * in msecs, stddev as a percentage of the average), most expensive syscall
 * first.  Returns the number of characters printed, 0 if the resort tree
 * could not be built.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in ns, the table is in msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2826
/*
 * Print the summary line for one thread (comm, tid, event count and share of
 * all events, fault counts, runtime when --sched is on) followed by its
 * per-syscall statistics table.  Threads without an attached thread_trace
 * area print nothing.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events seen, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2854
2855 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2856 {
2857         return ttrace ? ttrace->nr_events : 0;
2858 }
2859
/*
 * rb_resort.h machinery: defines a 'threads' resorted rb tree ordered by
 * number of events, busiest thread first; each entry just carries the
 * thread pointer extracted from the machine's thread rb tree node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2866
/*
 * Print the end-of-run summary: one section per thread, busiest first,
 * walking every bucket of the machine's threads hash table.
 *
 * NOTE(review): on a resort failure mid-way this returns 0, discarding the
 * count of characters already printed for earlier buckets -- confirm callers
 * only use the return value informationally.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2888
/*
 * --duration option parser: syscalls taking less than this many msecs
 * (fractions allowed) are filtered from the output.
 *
 * NOTE(review): atof() reports no parse errors, so malformed input silently
 * becomes 0.0 (no filtering) -- confirm this leniency is intended.
 */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}
2897
2898 static int trace__set_filter_pids(const struct option *opt, const char *str,
2899                                   int unset __maybe_unused)
2900 {
2901         int ret = -1;
2902         size_t i;
2903         struct trace *trace = opt->value;
2904         /*
2905          * FIXME: introduce a intarray class, plain parse csv and create a
2906          * { int nr, int entries[] } struct...
2907          */
2908         struct intlist *list = intlist__new(str);
2909
2910         if (list == NULL)
2911                 return -1;
2912
2913         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2914         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2915
2916         if (trace->filter_pids.entries == NULL)
2917                 goto out;
2918
2919         trace->filter_pids.entries[0] = getpid();
2920
2921         for (i = 1; i < trace->filter_pids.nr; ++i)
2922                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2923
2924         intlist__delete(list);
2925         ret = 0;
2926 out:
2927         return ret;
2928 }
2929
2930 static int trace__open_output(struct trace *trace, const char *filename)
2931 {
2932         struct stat st;
2933
2934         if (!stat(filename, &st) && st.st_size) {
2935                 char oldname[PATH_MAX];
2936
2937                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2938                 unlink(oldname);
2939                 rename(filename, oldname);
2940         }
2941
2942         trace->output = fopen(filename, "w");
2943
2944         return trace->output == NULL ? -errno : 0;
2945 }
2946
2947 static int parse_pagefaults(const struct option *opt, const char *str,
2948                             int unset __maybe_unused)
2949 {
2950         int *trace_pgfaults = opt->value;
2951
2952         if (strcmp(str, "all") == 0)
2953                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2954         else if (strcmp(str, "maj") == 0)
2955                 *trace_pgfaults |= TRACE_PFMAJ;
2956         else if (strcmp(str, "min") == 0)
2957                 *trace_pgfaults |= TRACE_PFMIN;
2958         else
2959                 return -1;
2960
2961         return 0;
2962 }
2963
/* Point every evsel in the list at the same event handler. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2971
2972 /*
2973  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2974  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2975  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2976  *
2977  * It'd be better to introduce a parse_options() variant that would return a
2978  * list with the terms it didn't match to an event...
2979  */
/*
 * Split the comma separated -e/--event argument into two lists: lists[1]
 * holds syscall names/globs and strace group files (the event qualifier),
 * lists[0] everything else, which is handed to the stock parse_events_option.
 * A leading '!' negates the whole qualifier.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* NUL-terminate the current item in place; restored below */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		/* syscall name, glob matching a syscall, or a strace group file? */
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		/* both lists together can never exceed len, the original size */
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* undo the in-place NUL from the last split, if any */
	if (sep)
		*sep = ',';

	return err;
}
3058
3059 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3060 {
3061         struct trace *trace = opt->value;
3062
3063         if (!list_empty(&trace->evlist->entries))
3064                 return parse_cgroups(opt, str, unset);
3065
3066         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3067
3068         return 0;
3069 }
3070
3071 int cmd_trace(int argc, const char **argv)
3072 {
3073         const char *trace_usage[] = {
3074                 "perf trace [<options>] [<command>]",
3075                 "perf trace [<options>] -- <command> [<options>]",
3076                 "perf trace record [<options>] [<command>]",
3077                 "perf trace record [<options>] -- <command> [<options>]",
3078                 NULL
3079         };
3080         struct trace trace = {
3081                 .syscalls = {
3082                         . max = -1,
3083                 },
3084                 .opts = {
3085                         .target = {
3086                                 .uid       = UINT_MAX,
3087                                 .uses_mmap = true,
3088                         },
3089                         .user_freq     = UINT_MAX,
3090                         .user_interval = ULLONG_MAX,
3091                         .no_buffering  = true,
3092                         .mmap_pages    = UINT_MAX,
3093                         .proc_map_timeout  = 500,
3094                 },
3095                 .output = stderr,
3096                 .show_comm = true,
3097                 .trace_syscalls = false,
3098                 .kernel_syscallchains = false,
3099                 .max_stack = UINT_MAX,
3100         };
3101         const char *output_name = NULL;
3102         const struct option trace_options[] = {
3103         OPT_CALLBACK('e', "event", &trace, "event",
3104                      "event/syscall selector. use 'perf list' to list available events",
3105                      trace__parse_events_option),
3106         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3107                     "show the thread COMM next to its id"),
3108         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3109         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3110                      trace__parse_events_option),
3111         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3112         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3113         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3114                     "trace events on existing process id"),
3115         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3116                     "trace events on existing thread id"),
3117         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3118                      "pids to filter (by the kernel)", trace__set_filter_pids),
3119         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3120                     "system-wide collection from all CPUs"),
3121         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3122                     "list of cpus to monitor"),
3123         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3124                     "child tasks do not inherit counters"),
3125         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3126                      "number of mmap data pages",
3127                      perf_evlist__parse_mmap_pages),
3128         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3129                    "user to profile"),
3130         OPT_CALLBACK(0, "duration", &trace, "float",
3131                      "show only events with duration > N.M ms",
3132                      trace__set_duration),
3133         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3134         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3135         OPT_BOOLEAN('T', "time", &trace.full_time,
3136                     "Show full timestamp, not time relative to first start"),
3137         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3138                     "Show only syscalls that failed"),
3139         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3140                     "Show only syscall summary with statistics"),
3141         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3142                     "Show all syscalls and summary with statistics"),
3143         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3144                      "Trace pagefaults", parse_pagefaults, "maj"),
3145         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3146         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3147         OPT_CALLBACK(0, "call-graph", &trace.opts,
3148                      "record_mode[,record_size]", record_callchain_help,
3149                      &record_parse_callchain_opt),
3150         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3151                     "Show the kernel callchains on the syscall exit path"),
3152         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3153                      "Set the minimum stack depth when parsing the callchain, "
3154                      "anything below the specified depth will be ignored."),
3155         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3156                      "Set the maximum stack depth when parsing the callchain, "
3157                      "anything beyond the specified depth will be ignored. "
3158                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3159         OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3160                         "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3161         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3162                         "per thread proc mmap processing timeout in ms"),
3163         OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3164                      trace__parse_cgroups),
3165         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3166                      "ms to wait before starting measurement after program "
3167                      "start"),
3168         OPT_END()
3169         };
3170         bool __maybe_unused max_stack_user_set = true;
3171         bool mmap_pages_user_set = true;
3172         const char * const trace_subcommands[] = { "record", NULL };
3173         int err;
3174         char bf[BUFSIZ];
3175
3176         signal(SIGSEGV, sighandler_dump_stack);
3177         signal(SIGFPE, sighandler_dump_stack);
3178
3179         trace.evlist = perf_evlist__new();
3180         trace.sctbl = syscalltbl__new();
3181
3182         if (trace.evlist == NULL || trace.sctbl == NULL) {
3183                 pr_err("Not enough memory to run!\n");
3184                 err = -ENOMEM;
3185                 goto out;
3186         }
3187
3188         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3189                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3190
3191         if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3192                 usage_with_options_msg(trace_usage, trace_options,
3193                                        "cgroup monitoring only available in system-wide mode");
3194         }
3195
3196         err = bpf__setup_stdout(trace.evlist);
3197         if (err) {
3198                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3199                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3200                 goto out;
3201         }
3202
3203         err = -1;
3204
3205         if (trace.trace_pgfaults) {
3206                 trace.opts.sample_address = true;
3207                 trace.opts.sample_time = true;
3208         }
3209
3210         if (trace.opts.mmap_pages == UINT_MAX)
3211                 mmap_pages_user_set = false;
3212
3213         if (trace.max_stack == UINT_MAX) {
3214                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3215                 max_stack_user_set = false;
3216         }
3217
3218 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3219         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3220                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3221         }
3222 #endif
3223
3224         if (callchain_param.enabled) {
3225                 if (!mmap_pages_user_set && geteuid() == 0)
3226                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3227
3228                 symbol_conf.use_callchain = true;
3229         }
3230
3231         if (trace.evlist->nr_entries > 0)
3232                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3233
3234         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3235                 return trace__record(&trace, argc-1, &argv[1]);
3236
3237         /* summary_only implies summary option, but don't overwrite summary if set */
3238         if (trace.summary_only)
3239                 trace.summary = trace.summary_only;
3240
3241         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3242             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3243                 trace.trace_syscalls = true;
3244         }
3245
3246         if (output_name != NULL) {
3247                 err = trace__open_output(&trace, output_name);
3248                 if (err < 0) {
3249                         perror("failed to create output file");
3250                         goto out;
3251                 }
3252         }
3253
3254         err = target__validate(&trace.opts.target);
3255         if (err) {
3256                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3257                 fprintf(trace.output, "%s", bf);
3258                 goto out_close;
3259         }
3260
3261         err = target__parse_uid(&trace.opts.target);
3262         if (err) {
3263                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3264                 fprintf(trace.output, "%s", bf);
3265                 goto out_close;
3266         }
3267
3268         if (!argc && target__none(&trace.opts.target))
3269                 trace.opts.target.system_wide = true;
3270
3271         if (input_name)
3272                 err = trace__replay(&trace);
3273         else
3274                 err = trace__run(&trace, argc, argv);
3275
3276 out_close:
3277         if (output_name != NULL)
3278                 fclose(trace.output);
3279 out:
3280         return err;
3281 }