/* tools/perf/builtin-trace.c — snapshot from linux.git (gitweb navigation header removed) */
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC              02000000
66 #endif
67
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE  1024
70 #endif
71
/*
 * Global state for one 'perf trace' session: the perf_tool callbacks,
 * the syscall table, the evlist and every command line knob.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* presumably the highest syscall id in 'table' -- confirm */
		struct syscall	*table;
		struct {	/* sys_enter/sys_exit tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* event names given with -e */
	struct {
		size_t		nr;
		int		*entries;	/* presumably ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* presumably pids to be filtered out */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {	/* tool internal counters, see show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* presumably: ev_qualifier is a deny list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
126
/*
 * Accessor for one tracepoint field: its byte offset into the raw
 * sample payload plus a reader that returns it either as an integer
 * (zero-extended to u64) or as a pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
134
/*
 * Generate tp_field__u{8,16,32,64}() readers: memcpy() the value out of
 * the raw payload (it may be unaligned) and zero-extend it to u64.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, selected in tp_field__init_uint() when the
 * sample needs swapping (cross-endian data).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159
160 static int tp_field__init_uint(struct tp_field *field,
161                                struct format_field *format_field,
162                                bool needs_swap)
163 {
164         field->offset = format_field->offset;
165
166         switch (format_field->size) {
167         case 1:
168                 field->integer = tp_field__u8;
169                 break;
170         case 2:
171                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172                 break;
173         case 4:
174                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175                 break;
176         case 8:
177                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178                 break;
179         default:
180                 return -1;
181         }
182
183         return 0;
184 }
185
/* Reader for pointer-style fields: the address inside the raw payload. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Set up 'field' to be read as a pointer into the raw sample payload. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
197
/*
 * Per-evsel private area for the raw_syscalls tracepoints: the syscall
 * 'id' field plus either the 'args' (sys_enter) or 'ret' (sys_exit)
 * field -- only one is meaningful per evsel, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
204
205 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206                                           struct tp_field *field,
207                                           const char *name)
208 {
209         struct format_field *format_field = perf_evsel__field(evsel, name);
210
211         if (format_field == NULL)
212                 return -1;
213
214         return tp_field__init_uint(field, format_field, evsel->needs_swap);
215 }
216
/*
 * Initialize the same-named integer member of the evsel's private
 * 'struct syscall_tp' from tracepoint field #name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220
/*
 * Look up tracepoint field 'name' in the evsel's format and initialize
 * 'field' as a pointer reader for it.  Returns -1 when the field does
 * not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	return format_field ? tp_field__init_ptr(field, format_field) : -1;
}
232
/*
 * Initialize the same-named pointer member of the evsel's private
 * 'struct syscall_tp' from tracepoint field #name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236
/* Free the evsel's private area (struct syscall_tp) and the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
242
243 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 {
245         evsel->priv = malloc(sizeof(struct syscall_tp));
246         if (evsel->priv != NULL) {
247                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248                         goto out_delete;
249
250                 evsel->handler = handler;
251                 return 0;
252         }
253
254         return -ENOMEM;
255
256 out_delete:
257         zfree(&evsel->priv);
258         return -ENOENT;
259 }
260
/*
 * Create an evsel for the sys_enter/sys_exit tracepoint named by
 * 'direction', attaching 'handler' and a 'struct syscall_tp' private
 * area.  Returns NULL when the tracepoint cannot be found or its 'id'
 * field cannot be set up.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	/* sets up evsel->priv (struct syscall_tp) and evsel->handler */
	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
281
/* Read the 'name' field of the sample as an integer, via the evsel's priv. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read the 'name' field of the sample as a pointer, via the evsel's priv. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289
290 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 {
292         int idx = val - sa->offset;
293
294         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
295                 return scnprintf(bf, size, intfmt, val);
296
297         return scnprintf(bf, size, "%s", sa->entries[idx]);
298 }
299
/* Format arg->val via the strarray in arg->parm, with 'intfmt' fallback. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Same as above with a plain decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
314
/* A sequence of string tables, tried in order (see scnprintf_strarrays). */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a 'struct strarrays strarrays__<array>' wrapping 'array'. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
324
325 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326                                         struct syscall_arg *arg)
327 {
328         struct strarrays *sas = arg->parm;
329         int i;
330
331         for (i = 0; i < sas->nr_entries; ++i) {
332                 struct strarray *sa = sas->entries[i];
333                 int idx = arg->val - sa->offset;
334
335                 if (idx >= 0 && idx < sa->nr_entries) {
336                         if (sa->entries[idx] == NULL)
337                                 break;
338                         return scnprintf(bf, size, "%s", sa->entries[idx]);
339                 }
340         }
341
342         return scnprintf(bf, size, "%d", arg->val);
343 }
344
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

/*
 * Formatter for the dirfd argument of the *at() syscalls: prints "CWD"
 * for the AT_FDCWD sentinel, otherwise defers to the regular fd
 * formatter.
 */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

/* Forward declaration; the definition comes later in this file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366
/* Plain formatters for arguments with no symbolic representation. */

/* Print the argument as a hex number ("%#lx"). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Print the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Print the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
381
/*
 * String tables for enum-like syscall arguments, wired into the
 * syscall_fmts[] table below via STRARRAY().
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* the epoll_ctl op values start at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* cmds starting at F_LINUX_SPECIFIC_BASE; note the [5] gap in the table */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* both fcntl cmd ranges, tried in order by syscall_arg__scnprintf_strarrays() */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462
463 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
464                                                  struct syscall_arg *arg)
465 {
466         size_t printed = 0;
467         int mode = arg->val;
468
469         if (mode == F_OK) /* 0 */
470                 return scnprintf(bf, size, "F");
471 #define P_MODE(n) \
472         if (mode & n##_OK) { \
473                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
474                 mode &= ~n##_OK; \
475         }
476
477         P_MODE(R);
478         P_MODE(W);
479         P_MODE(X);
480 #undef P_MODE
481
482         if (mode)
483                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484
485         return printed;
486 }
487
488 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489
490 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
491                                               struct syscall_arg *arg);
492
493 #define SCA_FILENAME syscall_arg__scnprintf_filename
494
/*
 * Formatter for the pipe2() flags argument: known O_* flag names,
 * "|"-separated, with any leftover unknown bits printed in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* print and clear one known flag, "|"-prefixed after the first one */
#define P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags) /* bits we have no name for */
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517
518 #ifndef GRND_NONBLOCK
519 #define GRND_NONBLOCK   0x0001
520 #endif
521 #ifndef GRND_RANDOM
522 #define GRND_RANDOM     0x0002
523 #endif
524
525 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
526                                                    struct syscall_arg *arg)
527 {
528         int printed = 0, flags = arg->val;
529
530 #define P_FLAG(n) \
531         if (flags & GRND_##n) { \
532                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
533                 flags &= ~GRND_##n; \
534         }
535
536         P_FLAG(RANDOM);
537         P_FLAG(NONBLOCK);
538 #undef P_FLAG
539
540         if (flags)
541                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542
543         return printed;
544 }
545
546 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547
/* Shorthand for a syscall_arg_fmt entry backed by a string table. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
551
552 #include "trace/beauty/arch_errno_names.c"
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/futex_val3.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
/* How to pretty-print one syscall argument. */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;	/* formatter-private data, e.g. a strarray */
	const char *name;
	bool	   show_zero;	/* print the arg even when it is zero */
};
574
575 static struct syscall_fmt {
576         const char *name;
577         const char *alias;
578         struct syscall_arg_fmt arg[6];
579         u8         nr_args;
580         bool       errpid;
581         bool       timeout;
582         bool       hexret;
583 } syscall_fmts[] = {
584         { .name     = "access",
585           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
586         { .name     = "bpf",
587           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588         { .name     = "brk",        .hexret = true,
589           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590         { .name     = "clock_gettime",
591           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592         { .name     = "clone",      .errpid = true, .nr_args = 5,
593           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
594                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
595                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
597                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
598         { .name     = "close",
599           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600         { .name     = "epoll_ctl",
601           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602         { .name     = "eventfd2",
603           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604         { .name     = "fchmodat",
605           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606         { .name     = "fchownat",
607           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608         { .name     = "fcntl",
609           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610                            .parm      = &strarrays__fcntl_cmds_arrays,
611                            .show_zero = true, },
612                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
613         { .name     = "flock",
614           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615         { .name     = "fstat", .alias = "newfstat", },
616         { .name     = "fstatat", .alias = "newfstatat", },
617         { .name     = "futex",
618           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620         { .name     = "futimesat",
621           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622         { .name     = "getitimer",
623           .arg = { [0] = STRARRAY(which, itimers), }, },
624         { .name     = "getpid",     .errpid = true, },
625         { .name     = "getpgid",    .errpid = true, },
626         { .name     = "getppid",    .errpid = true, },
627         { .name     = "getrandom",
628           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629         { .name     = "getrlimit",
630           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631         { .name     = "gettid",     .errpid = true, },
632         { .name     = "ioctl",
633           .arg = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 #else
641                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 #endif
643         { .name     = "kcmp",       .nr_args = 5,
644           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
645                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
646                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
647                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
648                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
649         { .name     = "keyctl",
650           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651         { .name     = "kill",
652           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653         { .name     = "linkat",
654           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655         { .name     = "lseek",
656           .arg = { [2] = STRARRAY(whence, whences), }, },
657         { .name     = "lstat", .alias = "newlstat", },
658         { .name     = "madvise",
659           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
660                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661         { .name     = "mkdirat",
662           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663         { .name     = "mknodat",
664           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665         { .name     = "mlock",
666           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667         { .name     = "mlockall",
668           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669         { .name     = "mmap",       .hexret = true,
670 /* The standard mmap maps to old_mmap on s390x */
671 #if defined(__s390x__)
672         .alias = "old_mmap",
673 #endif
674           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
675                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
676                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
677         { .name     = "mprotect",
678           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
679                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
680         { .name     = "mq_unlink",
681           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682         { .name     = "mremap",     .hexret = true,
683           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
684                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
686         { .name     = "munlock",
687           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688         { .name     = "munmap",
689           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690         { .name     = "name_to_handle_at",
691           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692         { .name     = "newfstatat",
693           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694         { .name     = "open",
695           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696         { .name     = "open_by_handle_at",
697           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
698                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699         { .name     = "openat",
700           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
701                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702         { .name     = "perf_event_open",
703           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
704                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
705                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706         { .name     = "pipe2",
707           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708         { .name     = "pkey_alloc",
709           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
710         { .name     = "pkey_free",
711           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
712         { .name     = "pkey_mprotect",
713           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
714                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
715                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
716         { .name     = "poll", .timeout = true, },
717         { .name     = "ppoll", .timeout = true, },
718         { .name     = "prctl", .alias = "arch_prctl",
719           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722         { .name     = "pread", .alias = "pread64", },
723         { .name     = "preadv", .alias = "pread", },
724         { .name     = "prlimit64",
725           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726         { .name     = "pwrite", .alias = "pwrite64", },
727         { .name     = "readlinkat",
728           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729         { .name     = "recvfrom",
730           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731         { .name     = "recvmmsg",
732           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733         { .name     = "recvmsg",
734           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735         { .name     = "renameat",
736           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737         { .name     = "rt_sigaction",
738           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739         { .name     = "rt_sigprocmask",
740           .arg = { [0] = STRARRAY(how, sighow), }, },
741         { .name     = "rt_sigqueueinfo",
742           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743         { .name     = "rt_tgsigqueueinfo",
744           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745         { .name     = "sched_setscheduler",
746           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747         { .name     = "seccomp",
748           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
749                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750         { .name     = "select", .timeout = true, },
751         { .name     = "sendmmsg",
752           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753         { .name     = "sendmsg",
754           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755         { .name     = "sendto",
756           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757         { .name     = "set_tid_address", .errpid = true, },
758         { .name     = "setitimer",
759           .arg = { [0] = STRARRAY(which, itimers), }, },
760         { .name     = "setrlimit",
761           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762         { .name     = "socket",
763           .arg = { [0] = STRARRAY(family, socket_families),
764                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
765                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
766         { .name     = "socketpair",
767           .arg = { [0] = STRARRAY(family, socket_families),
768                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
769                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
770         { .name     = "stat", .alias = "newstat", },
771         { .name     = "statx",
772           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
773                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
774                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
775         { .name     = "swapoff",
776           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777         { .name     = "swapon",
778           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
779         { .name     = "symlinkat",
780           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
781         { .name     = "tgkill",
782           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783         { .name     = "tkill",
784           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
785         { .name     = "uname", .alias = "newuname", },
786         { .name     = "unlinkat",
787           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
788         { .name     = "utimensat",
789           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
790         { .name     = "wait4",      .errpid = true,
791           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792         { .name     = "waitid",     .errpid = true,
793           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
794 };
795
796 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 {
798         const struct syscall_fmt *fmt = fmtp;
799         return strcmp(name, fmt->name);
800 }
801
802 static struct syscall_fmt *syscall_fmt__find(const char *name)
803 {
804         const int nmemb = ARRAY_SIZE(syscall_fmts);
805         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
806 }
807
/*
 * Per-syscall state, built lazily by trace__read_syscall_info() and kept in
 * trace->syscalls.table, indexed by syscall id.
 */
struct syscall {
	struct event_format *tp_format;	/* syscalls:sys_enter_<name> format; may be an ERR_PTR */
	int                 nr_args;	/* argument count, minus the leading syscall-nr field */
	struct format_field *args;	/* tracepoint fields describing the arguments */
	const char          *name;	/* canonical name from the syscall table */
	bool                is_exit;	/* true for exit/exit_group (no matching sys_exit) */
	struct syscall_fmt  *fmt;	/* optional overrides from syscall_fmts[], or NULL */
	struct syscall_arg_fmt *arg_fmt; /* per-argument formatters, nr_args entries */
};
817
818 /*
819  * We need to have this 'calculated' boolean because in some cases we really
820  * don't know what is the duration of a syscall, for instance, when we start
821  * a session and some threads are waiting for a syscall to finish, say 'poll',
822  * in which case all we can do is to print "( ? ) for duration and for the
823  * start timestamp.
824  */
825 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
826 {
827         double duration = (double)t / NSEC_PER_MSEC;
828         size_t printed = fprintf(fp, "(");
829
830         if (!calculated)
831                 printed += fprintf(fp, "         ");
832         else if (duration >= 1.0)
833                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
834         else if (duration >= 0.01)
835                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
836         else
837                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
838         return printed + fprintf(fp, "): ");
839 }
840
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 *
 * Per-thread tracing state, stored in the thread's priv pointer by
 * thread__trace().
 */
struct thread_trace {
	u64               entry_time;	/* timestamp of the last sys_enter */
	bool              entry_pending; /* enter line buffered, awaiting sys_exit */
	unsigned long     nr_events;	/* events seen on this thread */
	unsigned long     pfmaj, pfmin;	/* page fault counters (see TRACE_PFMAJ/PFMIN) */
	char              *entry_str;	/* buffer for the formatted enter line */
	double            runtime_ms;	/* accumulated runtime in ms — presumably fed by a sched runtime event; confirm with the handler */
	size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool          pending_open;
		unsigned int  namelen;
		char          *name;
	} filename;
	struct {
		int       max;		/* highest fd cached so far, -1 when empty */
		char      **table;	/* fd -> pathname strings, lazily grown */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats */
};
870
871 static struct thread_trace *thread_trace__new(void)
872 {
873         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
874
875         if (ttrace)
876                 ttrace->paths.max = -1;
877
878         ttrace->syscall_stats = intlist__new(NULL);
879
880         return ttrace;
881 }
882
/*
 * Get (lazily creating) the thread_trace attached to 'thread', bumping its
 * event count.  On any failure (NULL thread or allocation failure) print a
 * one-line warning to 'fp' and return NULL so the caller drops the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* First event on this thread: attach a fresh thread_trace. */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	/* Still NULL means thread_trace__new() failed. */
	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
905
906
/*
 * Let a syscall argument formatter install a custom return-value formatter
 * for the current invocation (e.g. fcntl's 'cmd' decides whether the return
 * is an fd, file flags, etc. — see the ret_scnprintf note on thread_trace).
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
914
/* Page-fault selection bits — presumably or'ed into a trace option mask to
 * pick major/minor faults; confirm against the option parsing (not visible
 * in this chunk). */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer used to assemble the sys_enter line
 * (ttrace->entry_str is malloc'ed with this size in trace__sys_enter()). */
static const size_t trace__entry_str_size = 2048;
919
/*
 * Cache 'pathname' as the path for 'fd' in this thread's fd->path table,
 * growing the table on demand.  paths.max tracks the highest fd ever stored
 * (-1 while the table is empty).  Returns 0 on success, -1 on allocation
 * failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		/* realloc into a temporary so the old table survives failure */
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* zero only the newly appended slots */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* first allocation: zero the whole table */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/* NOTE(review): a previous string at table[fd] would leak here;
	 * callers appear to only fill empty slots — confirm. */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
945
/*
 * Resolve 'fd' to a path via /proc/<pid>/fd/<fd> (or the per-task variant
 * for non-leader threads) and cache the result with trace__set_fd_pathname().
 * Returns 0 on success, -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* NOTE(review): assumes lstat() reports the link target length in
	 * st_size for procfs fd symlinks — verify on all supported kernels. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';	/* readlink() does not NUL-terminate */
	return trace__set_fd_pathname(thread, fd, pathname);
}
971
972 static const char *thread__fd_path(struct thread *thread, int fd,
973                                    struct trace *trace)
974 {
975         struct thread_trace *ttrace = thread__priv(thread);
976
977         if (ttrace == NULL)
978                 return NULL;
979
980         if (fd < 0)
981                 return NULL;
982
983         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
984                 if (!trace->live)
985                         return NULL;
986                 ++trace->stats.proc_getname;
987                 if (thread__read_fd_path(thread, fd))
988                         return NULL;
989         }
990
991         return ttrace->paths.table[fd];
992 }
993
994 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
995 {
996         int fd = arg->val;
997         size_t printed = scnprintf(bf, size, "%d", fd);
998         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
999
1000         if (path)
1001                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1002
1003         return printed;
1004 }
1005
1006 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1007 {
1008         size_t printed = scnprintf(bf, size, "%d", fd);
1009         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1010
1011         if (thread) {
1012                 const char *path = thread__fd_path(thread, fd, trace);
1013
1014                 if (path)
1015                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1016
1017                 thread__put(thread);
1018         }
1019
1020         return printed;
1021 }
1022
1023 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1024                                               struct syscall_arg *arg)
1025 {
1026         int fd = arg->val;
1027         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1028         struct thread_trace *ttrace = thread__priv(arg->thread);
1029
1030         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1031                 zfree(&ttrace->paths.table[fd]);
1032
1033         return printed;
1034 }
1035
/*
 * Record which user pointer will be resolved by the vfs_getname tracepoint/
 * kprobe and at what offset in entry_str its translation must be spliced in
 * (see the filename.* notes on struct thread_trace).
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1044
1045 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1046                                               struct syscall_arg *arg)
1047 {
1048         unsigned long ptr = arg->val;
1049
1050         if (!arg->trace->vfs_getname)
1051                 return scnprintf(bf, size, "%#x", ptr);
1052
1053         thread__set_filename_pos(arg->thread, bf, ptr);
1054         return 0;
1055 }
1056
1057 static bool trace__filter_duration(struct trace *trace, double t)
1058 {
1059         return t < (trace->duration_filter * NSEC_PER_MSEC);
1060 }
1061
1062 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1063 {
1064         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1065
1066         return fprintf(fp, "%10.3f ", ts);
1067 }
1068
1069 /*
1070  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1071  * using ttrace->entry_time for a thread that receives a sys_exit without
1072  * first having received a sys_enter ("poll" issued before tracing session
1073  * starts, lost sys_enter exit due to ring buffer overflow).
1074  */
1075 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1076 {
1077         if (tstamp > 0)
1078                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1079
1080         return fprintf(fp, "         ? ");
1081 }
1082
/*
 * Async-signal flags: written by sig_handler(), polled from the main loop.
 * 'volatile sig_atomic_t' is the only object type the C standard guarantees
 * may be written from a signal handler and read elsewhere; the original
 * plain (non-volatile, non-atomic) bools were formally a data race the
 * compiler is allowed to cache in a register.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;	/* SIGINT specifically, vs e.g. SIGTERM */

static void sig_handler(int sig)
{
	done = 1;
	interrupted = sig == SIGINT;
}
1091
1092 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1093                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1094 {
1095         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1096         printed += fprintf_duration(duration, duration_calculated, fp);
1097
1098         if (trace->multiple_threads) {
1099                 if (trace->show_comm)
1100                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1101                 printed += fprintf(fp, "%d ", thread->tid);
1102         }
1103
1104         return printed;
1105 }
1106
1107 static int trace__process_event(struct trace *trace, struct machine *machine,
1108                                 union perf_event *event, struct perf_sample *sample)
1109 {
1110         int ret = 0;
1111
1112         switch (event->header.type) {
1113         case PERF_RECORD_LOST:
1114                 color_fprintf(trace->output, PERF_COLOR_RED,
1115                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1116                 ret = machine__process_lost_event(machine, event, sample);
1117                 break;
1118         default:
1119                 ret = machine__process_event(machine, event, sample);
1120                 break;
1121         }
1122
1123         return ret;
1124 }
1125
/*
 * perf_tool callback: recover the enclosing struct trace from the embedded
 * tool member and forward the event to trace__process_event().
 */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1134
/*
 * Kernel address resolver wrapper: while kptr_restrict hides kernel
 * addresses, warn exactly once and resolve nothing; otherwise defer to the
 * machine's regular resolver.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* already warned: stay quiet, keep returning unresolved */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1152
1153 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1154 {
1155         int err = symbol__init(NULL);
1156
1157         if (err)
1158                 return err;
1159
1160         trace->host = machine__new_host();
1161         if (trace->host == NULL)
1162                 return -ENOMEM;
1163
1164         err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1165         if (err < 0)
1166                 goto out;
1167
1168         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1169                                             evlist->threads, trace__tool_process, false,
1170                                             trace->opts.proc_map_timeout, 1);
1171 out:
1172         if (err)
1173                 symbol__exit();
1174
1175         return err;
1176 }
1177
/* Tear down what trace__symbols_init() set up: the host machine first,
 * then the symbol subsystem. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1185
1186 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1187 {
1188         int idx;
1189
1190         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1191                 nr_args = sc->fmt->nr_args;
1192
1193         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1194         if (sc->arg_fmt == NULL)
1195                 return -1;
1196
1197         for (idx = 0; idx < nr_args; ++idx) {
1198                 if (sc->fmt)
1199                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1200         }
1201
1202         sc->nr_args = nr_args;
1203         return 0;
1204 }
1205
/*
 * Pick a default formatter for each argument that doesn't already have an
 * explicit one in the fmt table, based on the tracepoint field's type/name.
 * The checks form a first-match chain, so their order matters.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* an explicit formatter from syscall_fmts[] wins */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Integer fields whose name ends in "fd" are file
			 * descriptors; survey of the tracepoint formats:
			 *
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1244
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed, look
 * up the name and fmt overrides, read the sys_enter tracepoint format (with
 * an alias fallback, e.g. stat -> newstat) and choose argument formatters.
 * Returns 0 on success, -1 on failure (unknown id, OOM, unreadable format).
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		/* grow into a temporary so the old table survives failure */
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* zero only the newly appended entries */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* first allocation: zero the whole table */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* no tracepoint under the canonical name: retry under the alias */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* with no format, fall back to the generic 6-argument layout */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1305
/*
 * Translate the -e/--expr qualifier strings into syscall ids in
 * trace->ev_qualifier_ids.  Each entry is matched first as an exact name,
 * then as a glob that may expand to several ids (growing the array in
 * chunks of 8).  Invalid names are collected into one error message.
 * Returns 0 on success or a negative error with the ids freed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* not an exact name: try it as a glob pattern */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* first bad name opens the error message, later ones append */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		/* NOTE(review): on a failed match 'id' is negative here and is
		 * still stored; presumably discarded by err handling — confirm. */
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* glob matched: collect every remaining id it expands to */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1381
1382 /*
1383  * args is to be interpreted as a series of longs but we need to handle
1384  * 8-byte unaligned accesses. args points to raw_data within the event
1385  * and raw_data is guaranteed to be 8-byte unaligned because it is
1386  * preceded by raw_size which is a u32. So we need to copy args to a temp
1387  * variable to read it. Most notably this avoids extended load instructions
1388  * on unaligned addresses
1389  */
1390 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1391 {
1392         unsigned long val;
1393         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1394
1395         memcpy(&val, p, sizeof(val));
1396         return val;
1397 }
1398
1399 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1400                                       struct syscall_arg *arg)
1401 {
1402         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1403                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1404
1405         return scnprintf(bf, size, "arg%d: ", arg->idx);
1406 }
1407
1408 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1409                                      struct syscall_arg *arg, unsigned long val)
1410 {
1411         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1412                 arg->val = val;
1413                 if (sc->arg_fmt[arg->idx].parm)
1414                         arg->parm = sc->arg_fmt[arg->idx].parm;
1415                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1416         }
1417         return scnprintf(bf, size, "%ld", val);
1418 }
1419
/*
 * Format all arguments of a syscall into 'bf'.  When the tracepoint format
 * was read, walk its fields and print "name: value" pairs, suppressing
 * uninteresting zero values; otherwise fall back to printing the raw
 * argument slots.  Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx as a bitmask, tested against arg.mask */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace	= trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* bit set in arg.mask: skip this argument —
			 * presumably consumed by another formatter; confirm */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1492
/* Signature of the per-evsel tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1496
/*
 * Return the struct syscall for 'id', reading its info on first use.
 * Returns NULL (with diagnostics gated on verbosity) for the spurious id -1
 * sometimes seen in sys_exit events, or when the info cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* table slot empty or beyond the table: read the syscall info now */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* re-check: trace__read_syscall_info() may have succeeded partially */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1539
/*
 * Fold the duration of the syscall that just exited (sample->time minus the
 * recorded entry_time) into this thread's per-syscall-id running stats,
 * lazily allocating the stats node.  Allocation failures are silently
 * ignored: stats are best-effort.
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	/* entry_time == 0 means the enter was never seen; count 0 duration */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
1565
/*
 * Flush the buffered sys_enter line of the current thread, marking it with
 * "...": called when another event interleaves before the matching sys_exit
 * arrived.  Returns the number of characters printed (0 if nothing pending,
 * or in failure-only mode where successful entries are never shown).
 */
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	/* duration 0/false: we don't know it yet, prints the "( ? )" form */
	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1585
1586 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1587                                  struct perf_sample *sample, struct thread *thread)
1588 {
1589         int printed = 0;
1590
1591         if (trace->print_sample) {
1592                 double ts = (double)sample->time / NSEC_PER_MSEC;
1593
1594                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1595                                    perf_evsel__name(evsel), ts,
1596                                    thread__comm_str(thread),
1597                                    sample->pid, sample->tid, sample->cpu);
1598         }
1599
1600         return printed;
1601 }
1602
/*
 * raw_syscalls:sys_enter handler: format the "name(args" part of the
 * syscall line into the per-thread entry_str buffer. For syscalls marked
 * is_exit (e.g. ones that never return) the line is printed immediately;
 * otherwise it stays pending until trace__sys_exit (or an interruption)
 * flushes it.
 *
 * Returns 0 on success, -1 when the syscall id can't be resolved or the
 * per-thread state can't be set up.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the formatted entry is staged in */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Flush any still-pending entry from another thread, unless output
	 * is being filtered/summarized, where partial lines aren't wanted.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No matching sys_exit will come: print the full line now */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for interrupted-entry reporting */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1663
1664 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1665                                     struct perf_sample *sample,
1666                                     struct callchain_cursor *cursor)
1667 {
1668         struct addr_location al;
1669         int max_stack = evsel->attr.sample_max_stack ?
1670                         evsel->attr.sample_max_stack :
1671                         trace->max_stack;
1672
1673         if (machine__resolve(trace->host, &al, sample) < 0 ||
1674             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1675                 return -1;
1676
1677         return 0;
1678 }
1679
1680 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1681 {
1682         /* TODO: user-configurable print_opts */
1683         const unsigned int print_opts = EVSEL__PRINT_SYM |
1684                                         EVSEL__PRINT_DSO |
1685                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1686
1687         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1688 }
1689
/*
 * Map an errno value to its symbolic name (e.g. "ENOENT") using the errno
 * table of the architecture the session's environment reports.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);

	return arch_syscalls__strerrno(perf_env__arch(env), err);
}
1697
/*
 * raw_syscalls:sys_exit handler: completes the line staged by
 * trace__sys_enter, printing the return value (decoded per-syscall where a
 * fmt entry exists), applying duration filtering, summary accounting and
 * optional callchain printing.
 *
 * Returns 0 on success, -1 when the syscall id or per-thread state can't
 * be resolved.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * Successful open(): bind the returned fd to the pathname collected
	 * by the probe:vfs_getname handler (see trace__vfs_getname).
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		/* No entry seen for this exit: can't apply --duration */
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was flushed earlier (or lost): mark as continued */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value decoding. Note the unusual shape: 'errno_print' is a
	 * label inside the second branch, reached via goto from the fmt==NULL
	 * case; 'signed_print' similarly serves as the fallback for fmt
	 * entries with no special return handling.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot custom return printer installed at enter time */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid: show the child's comm if known */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1813
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved for
 * an in-flight open-style syscall, then splice it into the pending
 * entry_str at the position recorded by the pointer-argument beautifier
 * (ttrace->filename.entry_str_pos), truncating from the left if needed.
 * The copy is also kept in ttrace->filename.name so trace__sys_exit can
 * bind it to the returned fd. Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached name buffer if this pathname is longer */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No beautifier left a placeholder to patch: nothing more to do */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the path when it doesn't fit the line buffer */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Shift the rest of the entry right, then drop the pathname in */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1874
/*
 * sched:sched_stat_runtime handler (--sched): accumulate on-CPU runtime,
 * converted to milliseconds, both per thread and globally for the summary.
 * If per-thread state can't be set up, fall back to dumping the raw
 * tracepoint fields. Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* No thread_trace: just print the event's raw fields */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1904
1905 static int bpf_output__printer(enum binary_printer_ops op,
1906                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1907 {
1908         unsigned char ch = (unsigned char)val;
1909
1910         switch (op) {
1911         case BINARY_PRINT_CHAR_DATA:
1912                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1913         case BINARY_PRINT_DATA_BEGIN:
1914         case BINARY_PRINT_LINE_BEGIN:
1915         case BINARY_PRINT_ADDR:
1916         case BINARY_PRINT_NUM_DATA:
1917         case BINARY_PRINT_NUM_PAD:
1918         case BINARY_PRINT_SEP:
1919         case BINARY_PRINT_CHAR_PAD:
1920         case BINARY_PRINT_LINE_END:
1921         case BINARY_PRINT_DATA_END:
1922         default:
1923                 break;
1924         }
1925
1926         return 0;
1927 }
1928
/*
 * Print a BPF output event's raw payload as printable characters/dots via
 * bpf_output__printer. The '8' is the width argument to binary__fprintf()
 * — presumably bytes per line; confirm against binary__fprintf's contract.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1935
/*
 * Generic handler for --event tracepoints and BPF output events: prints a
 * timestamped line with the event name and its formatted payload, plus an
 * optional callchain. Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	/* Flush any pending syscall entry before interleaving this event */
	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Placeholder to keep columns aligned with syscall lines */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1976
1977 static void print_location(FILE *f, struct perf_sample *sample,
1978                            struct addr_location *al,
1979                            bool print_dso, bool print_sym)
1980 {
1981
1982         if ((verbose > 0 || print_dso) && al->map)
1983                 fprintf(f, "%s@", al->map->dso->long_name);
1984
1985         if ((verbose > 0 || print_sym) && al->sym)
1986                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1987                         al->addr - al->sym->start);
1988         else if (al->map)
1989                 fprintf(f, "0x%" PRIx64, al->addr);
1990         else
1991                 fprintf(f, "0x%" PRIx64, sample->addr);
1992 }
1993
1994 static int trace__pgfault(struct trace *trace,
1995                           struct perf_evsel *evsel,
1996                           union perf_event *event __maybe_unused,
1997                           struct perf_sample *sample)
1998 {
1999         struct thread *thread;
2000         struct addr_location al;
2001         char map_type = 'd';
2002         struct thread_trace *ttrace;
2003         int err = -1;
2004         int callchain_ret = 0;
2005
2006         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2007
2008         if (sample->callchain) {
2009                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2010                 if (callchain_ret == 0) {
2011                         if (callchain_cursor.nr < trace->min_stack)
2012                                 goto out_put;
2013                         callchain_ret = 1;
2014                 }
2015         }
2016
2017         ttrace = thread__trace(thread, trace->output);
2018         if (ttrace == NULL)
2019                 goto out_put;
2020
2021         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2022                 ttrace->pfmaj++;
2023         else
2024                 ttrace->pfmin++;
2025
2026         if (trace->summary_only)
2027                 goto out;
2028
2029         thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2030
2031         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2032
2033         fprintf(trace->output, "%sfault [",
2034                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2035                 "maj" : "min");
2036
2037         print_location(trace->output, sample, &al, false, true);
2038
2039         fprintf(trace->output, "] => ");
2040
2041         thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2042
2043         if (!al.map) {
2044                 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2045
2046                 if (al.map)
2047                         map_type = 'x';
2048                 else
2049                         map_type = '?';
2050         }
2051
2052         print_location(trace->output, sample, &al, true, false);
2053
2054         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2055
2056         if (callchain_ret > 0)
2057                 trace__fprintf_callchain(trace, sample);
2058         else if (callchain_ret < 0)
2059                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2060 out:
2061         err = 0;
2062 out_put:
2063         thread__put(thread);
2064         return err;
2065 }
2066
2067 static void trace__set_base_time(struct trace *trace,
2068                                  struct perf_evsel *evsel,
2069                                  struct perf_sample *sample)
2070 {
2071         /*
2072          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2073          * and don't use sample->time unconditionally, we may end up having
2074          * some other event in the future without PERF_SAMPLE_TIME for good
2075          * reason, i.e. we may not be interested in its timestamps, just in
2076          * it taking place, picking some piece of information when it
2077          * appears in our event stream (vfs_getname comes to mind).
2078          */
2079         if (trace->base_time == 0 && !trace->full_time &&
2080             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2081                 trace->base_time = sample->time;
2082 }
2083
/*
 * perf_tool sample callback used in replay (perf.data) mode: dispatches
 * the sample to the evsel's registered handler, skipping filtered-out
 * threads. Always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	thread__put(thread);
	return err;
}
2110
/*
 * 'perf trace record': build an argv for 'perf record' with the raw
 * syscall tracepoints and/or page fault events enabled, append the user's
 * own arguments, and run cmd_record() with it.
 *
 * Returns cmd_record()'s result, -ENOMEM/-1 on setup failure.
 * NOTE(review): rec_argv is freed on the error path but not after
 * cmd_record() returns — it lives until process exit; confirm intentional.
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	/* Pass the user's remaining arguments straight through to record */
	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	return cmd_record(j, rec_argv);
}
2170
2171 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2172
2173 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2174 {
2175         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2176
2177         if (IS_ERR(evsel))
2178                 return false;
2179
2180         if (perf_evsel__field(evsel, "pathname") == NULL) {
2181                 perf_evsel__delete(evsel);
2182                 return false;
2183         }
2184
2185         evsel->handler = trace__vfs_getname;
2186         perf_evlist__add(evlist, evsel);
2187         return true;
2188 }
2189
2190 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2191 {
2192         struct perf_evsel *evsel;
2193         struct perf_event_attr attr = {
2194                 .type = PERF_TYPE_SOFTWARE,
2195                 .mmap_data = 1,
2196         };
2197
2198         attr.config = config;
2199         attr.sample_period = 1;
2200
2201         event_attr_init(&attr);
2202
2203         evsel = perf_evsel__new(&attr);
2204         if (evsel)
2205                 evsel->handler = trace__pgfault;
2206
2207         return evsel;
2208 }
2209
/*
 * Live-mode event dispatcher: side-band (non-sample) events go to
 * trace__process_event; samples are routed to the evsel's handler after
 * sanity-checking that tracepoint samples actually carry a payload.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample without raw data would crash the handlers */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2238
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, wire up
 * their handlers and field accessors, configure callchains, and add them
 * to the evlist. On success the evsels are stashed in
 * trace->syscalls.events for later filtering.
 *
 * Returns 0 on success, -1 on any setup failure (with partial evsels
 * cleaned up via the goto chain).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the 'args' payload field for fast access */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Likewise for the 'ret' field of sys_exit */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2287
/*
 * Build a tracepoint filter expression ("id in (...)" or its negation,
 * per -e/--expr qualifiers) and append it to both the sys_enter and
 * sys_exit evsels so the kernel drops unwanted syscalls early.
 *
 * Returns 0 on success, -1 on failure (errno set to ENOMEM when the
 * filter string couldn't be built).
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only filter sys_exit once sys_enter accepted the expression */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2312
/*
 * Filter out perf's own pid and, to avoid a feedback loop when running
 * over a remote session, the closest "sshd" ancestor found by walking up
 * the parent chain (bounded by the pids[] capacity).
 *
 * NOTE(review): the threads returned by machine__find_thread() are not
 * put here — looks like a reference count leak; confirm against
 * machine__find_thread()'s ownership contract.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		/* Found the ssh daemon: filter it too and stop walking */
		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2336
/*
 * Live tracing: set up all requested events on trace->evlist (raw syscall
 * tracepoints, vfs_getname, page faults, sched_stat_runtime), optionally
 * fork the workload given in argv, then loop consuming the mmap ring
 * buffers, dispatching each record via trace__handle_event(), until the
 * workload exits or the user interrupts us.
 *
 * Returns 0 on success, negative on error; errors print a human readable
 * diagnostic to trace->output via the out_error_* labels at the bottom.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover argv is the workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 *	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, counters start disabled and are enabled below. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every ring buffer once per pass. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Workload done: stop producing, keep draining. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	/*
	 * No new events since the last pass: poll for more, with a short
	 * timeout once 'done' is set so whatever is left can be drained.
	 */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The error labels below share a scratch buffer for building the
 * diagnostic message; this block is only reachable via goto from above.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2613
2614 static int trace__replay(struct trace *trace)
2615 {
2616         const struct perf_evsel_str_handler handlers[] = {
2617                 { "probe:vfs_getname",       trace__vfs_getname, },
2618         };
2619         struct perf_data data = {
2620                 .file      = {
2621                         .path = input_name,
2622                 },
2623                 .mode      = PERF_DATA_MODE_READ,
2624                 .force     = trace->force,
2625         };
2626         struct perf_session *session;
2627         struct perf_evsel *evsel;
2628         int err = -1;
2629
2630         trace->tool.sample        = trace__process_sample;
2631         trace->tool.mmap          = perf_event__process_mmap;
2632         trace->tool.mmap2         = perf_event__process_mmap2;
2633         trace->tool.comm          = perf_event__process_comm;
2634         trace->tool.exit          = perf_event__process_exit;
2635         trace->tool.fork          = perf_event__process_fork;
2636         trace->tool.attr          = perf_event__process_attr;
2637         trace->tool.tracing_data  = perf_event__process_tracing_data;
2638         trace->tool.build_id      = perf_event__process_build_id;
2639         trace->tool.namespaces    = perf_event__process_namespaces;
2640
2641         trace->tool.ordered_events = true;
2642         trace->tool.ordering_requires_timestamps = true;
2643
2644         /* add tid to output */
2645         trace->multiple_threads = true;
2646
2647         session = perf_session__new(&data, false, &trace->tool);
2648         if (session == NULL)
2649                 return -1;
2650
2651         if (trace->opts.target.pid)
2652                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2653
2654         if (trace->opts.target.tid)
2655                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2656
2657         if (symbol__init(&session->header.env) < 0)
2658                 goto out;
2659
2660         trace->host = &session->machines.host;
2661
2662         err = perf_session__set_tracepoints_handlers(session, handlers);
2663         if (err)
2664                 goto out;
2665
2666         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2667                                                      "raw_syscalls:sys_enter");
2668         /* older kernels have syscalls tp versus raw_syscalls */
2669         if (evsel == NULL)
2670                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2671                                                              "syscalls:sys_enter");
2672
2673         if (evsel &&
2674             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2675             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2676                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2677                 goto out;
2678         }
2679
2680         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2681                                                      "raw_syscalls:sys_exit");
2682         if (evsel == NULL)
2683                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2684                                                              "syscalls:sys_exit");
2685         if (evsel &&
2686             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2687             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2688                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2689                 goto out;
2690         }
2691
2692         evlist__for_each_entry(session->evlist, evsel) {
2693                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2694                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2695                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2696                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2697                         evsel->handler = trace__pgfault;
2698         }
2699
2700         setup_pager();
2701
2702         err = perf_session__process_events(session);
2703         if (err)
2704                 pr_err("Failed to process events, error %d", err);
2705
2706         else if (trace->summary)
2707                 trace__fprintf_thread_summary(trace, trace->output);
2708
2709 out:
2710         perf_session__delete(session);
2711
2712         return err;
2713 }
2714
/* Banner emitted once before the per-thread summary sections. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2723
/*
 * Resort the per-thread syscall stats intlist by total time spent
 * (descending msecs).  The comparison and the entry-filling body below
 * are stitched together by the DEFINE_RESORT_RB() machinery in
 * rb_resort.h: 'nd' is the source rb_node, 'entry' the resorted node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr of calls * average duration, scaled ns -> msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2737
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/
 * max time in msecs, relative stddev), ordered by total time, using the
 * resorted tree built by DEFINE_RESORT_RB(syscall_stats) above.
 * Returns the number of characters printed to fp.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* declares and fills the resorted 'syscall_stats' tree, see rb_resort.h */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats accumulate in nanoseconds, report in msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative standard deviation, in percent of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2780
2781 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2782 {
2783         size_t printed = 0;
2784         struct thread_trace *ttrace = thread__priv(thread);
2785         double ratio;
2786
2787         if (ttrace == NULL)
2788                 return 0;
2789
2790         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2791
2792         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2793         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2794         printed += fprintf(fp, "%.1f%%", ratio);
2795         if (ttrace->pfmaj)
2796                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2797         if (ttrace->pfmin)
2798                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2799         if (trace->sched)
2800                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2801         else if (fputc('\n', fp) != EOF)
2802                 ++printed;
2803
2804         printed += thread__dump_stats(ttrace, trace, fp);
2805
2806         return printed;
2807 }
2808
2809 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2810 {
2811         return ttrace ? ttrace->nr_events : 0;
2812 }
2813
/*
 * Resort a machine threads bucket keyed by thread__nr_events() so the
 * summary comes out ordered by event count; 'nd' and 'entry' are
 * supplied by the DEFINE_RESORT_RB() machinery in rb_resort.h.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2820
/*
 * Print the end-of-run summary: one header, then a trace__fprintf_thread()
 * section per thread, walking each bucket of the host machine's threads
 * table resorted via DEFINE_RESORT_RB(threads) above.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* declares 'threads' / 'threads_entry' for this bucket */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2842
2843 static int trace__set_duration(const struct option *opt, const char *str,
2844                                int unset __maybe_unused)
2845 {
2846         struct trace *trace = opt->value;
2847
2848         trace->duration_filter = atof(str);
2849         return 0;
2850 }
2851
2852 static int trace__set_filter_pids(const struct option *opt, const char *str,
2853                                   int unset __maybe_unused)
2854 {
2855         int ret = -1;
2856         size_t i;
2857         struct trace *trace = opt->value;
2858         /*
2859          * FIXME: introduce a intarray class, plain parse csv and create a
2860          * { int nr, int entries[] } struct...
2861          */
2862         struct intlist *list = intlist__new(str);
2863
2864         if (list == NULL)
2865                 return -1;
2866
2867         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2868         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2869
2870         if (trace->filter_pids.entries == NULL)
2871                 goto out;
2872
2873         trace->filter_pids.entries[0] = getpid();
2874
2875         for (i = 1; i < trace->filter_pids.nr; ++i)
2876                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2877
2878         intlist__delete(list);
2879         ret = 0;
2880 out:
2881         return ret;
2882 }
2883
2884 static int trace__open_output(struct trace *trace, const char *filename)
2885 {
2886         struct stat st;
2887
2888         if (!stat(filename, &st) && st.st_size) {
2889                 char oldname[PATH_MAX];
2890
2891                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2892                 unlink(oldname);
2893                 rename(filename, oldname);
2894         }
2895
2896         trace->output = fopen(filename, "w");
2897
2898         return trace->output == NULL ? -errno : 0;
2899 }
2900
2901 static int parse_pagefaults(const struct option *opt, const char *str,
2902                             int unset __maybe_unused)
2903 {
2904         int *trace_pgfaults = opt->value;
2905
2906         if (strcmp(str, "all") == 0)
2907                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2908         else if (strcmp(str, "maj") == 0)
2909                 *trace_pgfaults |= TRACE_PFMAJ;
2910         else if (strcmp(str, "min") == 0)
2911                 *trace_pgfaults |= TRACE_PFMIN;
2912         else
2913                 return -1;
2914
2915         return 0;
2916 }
2917
2918 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2919 {
2920         struct perf_evsel *evsel;
2921
2922         evlist__for_each_entry(evlist, evsel)
2923                 evsel->handler = handler;
2924 }
2925
/*
 * XXX: Hackish, just splitting the combined -e/--event argument into syscalls
 * (raw_syscalls:sys_{enter,exit}) and other events (tracepoints, HW, SW, etc.)
 * so as to use the existing facilities unchanged (trace->ev_qualifier +
 * parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
/*
 * Split the combined -e/--event argument: terms that resolve to syscalls
 * (by exact name, glob, or as a strace group file under STRACE_GROUPS_DIR)
 * go to lists[1] and become the syscall qualifier; everything else goes to
 * lists[0] and is handed to the regular parse_events_option().
 * Returns 0 on success, -1 on error.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	/* len bounds either list: each term lands in exactly one of them */
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* a leading '!' negates the whole syscall qualifier */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* temporarily terminate the current comma-separated term */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			/* not a syscall name: maybe a readable strace group file? */
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* restore the ',' and advance to the next term */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/*
	 * NOTE(review): lists[0]/lists[1] and strace_groups_dir appear to
	 * never be freed here — verify whether strlist__new()/
	 * parse_events_option() take ownership before adding free()s.
	 */
	if (sep)
		*sep = ',';

	return err;
}
3012
3013 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014 {
3015         struct trace *trace = opt->value;
3016
3017         if (!list_empty(&trace->evlist->entries))
3018                 return parse_cgroups(opt, str, unset);
3019
3020         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021
3022         return 0;
3023 }
3024
3025 int cmd_trace(int argc, const char **argv)
3026 {
3027         const char *trace_usage[] = {
3028                 "perf trace [<options>] [<command>]",
3029                 "perf trace [<options>] -- <command> [<options>]",
3030                 "perf trace record [<options>] [<command>]",
3031                 "perf trace record [<options>] -- <command> [<options>]",
3032                 NULL
3033         };
3034         struct trace trace = {
3035                 .syscalls = {
3036                         . max = -1,
3037                 },
3038                 .opts = {
3039                         .target = {
3040                                 .uid       = UINT_MAX,
3041                                 .uses_mmap = true,
3042                         },
3043                         .user_freq     = UINT_MAX,
3044                         .user_interval = ULLONG_MAX,
3045                         .no_buffering  = true,
3046                         .mmap_pages    = UINT_MAX,
3047                         .proc_map_timeout  = 500,
3048                 },
3049                 .output = stderr,
3050                 .show_comm = true,
3051                 .trace_syscalls = false,
3052                 .kernel_syscallchains = false,
3053                 .max_stack = UINT_MAX,
3054         };
3055         const char *output_name = NULL;
3056         const struct option trace_options[] = {
3057         OPT_CALLBACK('e', "event", &trace, "event",
3058                      "event/syscall selector. use 'perf list' to list available events",
3059                      trace__parse_events_option),
3060         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061                     "show the thread COMM next to its id"),
3062         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064                      trace__parse_events_option),
3065         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068                     "trace events on existing process id"),
3069         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070                     "trace events on existing thread id"),
3071         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072                      "pids to filter (by the kernel)", trace__set_filter_pids),
3073         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074                     "system-wide collection from all CPUs"),
3075         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076                     "list of cpus to monitor"),
3077         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078                     "child tasks do not inherit counters"),
3079         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080                      "number of mmap data pages",
3081                      perf_evlist__parse_mmap_pages),
3082         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083                    "user to profile"),
3084         OPT_CALLBACK(0, "duration", &trace, "float",
3085                      "show only events with duration > N.M ms",
3086                      trace__set_duration),
3087         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089         OPT_BOOLEAN('T', "time", &trace.full_time,
3090                     "Show full timestamp, not time relative to first start"),
3091         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092                     "Show only syscalls that failed"),
3093         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094                     "Show only syscall summary with statistics"),
3095         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096                     "Show all syscalls and summary with statistics"),
3097         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098                      "Trace pagefaults", parse_pagefaults, "maj"),
3099         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101         OPT_CALLBACK(0, "call-graph", &trace.opts,
3102                      "record_mode[,record_size]", record_callchain_help,
3103                      &record_parse_callchain_opt),
3104         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105                     "Show the kernel callchains on the syscall exit path"),
3106         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107                      "Set the minimum stack depth when parsing the callchain, "
3108                      "anything below the specified depth will be ignored."),
3109         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110                      "Set the maximum stack depth when parsing the callchain, "
3111                      "anything beyond the specified depth will be ignored. "
3112                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113         OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114                         "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116                         "per thread proc mmap processing timeout in ms"),
3117         OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118                      trace__parse_cgroups),
3119         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120                      "ms to wait before starting measurement after program "
3121                      "start"),
3122         OPT_END()
3123         };
3124         bool __maybe_unused max_stack_user_set = true;
3125         bool mmap_pages_user_set = true;
3126         const char * const trace_subcommands[] = { "record", NULL };
3127         int err;
3128         char bf[BUFSIZ];
3129
3130         signal(SIGSEGV, sighandler_dump_stack);
3131         signal(SIGFPE, sighandler_dump_stack);
3132
3133         trace.evlist = perf_evlist__new();
3134         trace.sctbl = syscalltbl__new();
3135
3136         if (trace.evlist == NULL || trace.sctbl == NULL) {
3137                 pr_err("Not enough memory to run!\n");
3138                 err = -ENOMEM;
3139                 goto out;
3140         }
3141
3142         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144
3145         if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146                 usage_with_options_msg(trace_usage, trace_options,
3147                                        "cgroup monitoring only available in system-wide mode");
3148         }
3149
3150         err = bpf__setup_stdout(trace.evlist);
3151         if (err) {
3152                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154                 goto out;
3155         }
3156
3157         err = -1;
3158
3159         if (trace.trace_pgfaults) {
3160                 trace.opts.sample_address = true;
3161                 trace.opts.sample_time = true;
3162         }
3163
3164         if (trace.opts.mmap_pages == UINT_MAX)
3165                 mmap_pages_user_set = false;
3166
3167         if (trace.max_stack == UINT_MAX) {
3168                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3169                 max_stack_user_set = false;
3170         }
3171
3172 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3173         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175         }
3176 #endif
3177
3178         if (callchain_param.enabled) {
3179                 if (!mmap_pages_user_set && geteuid() == 0)
3180                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181
3182                 symbol_conf.use_callchain = true;
3183         }
3184
3185         if (trace.evlist->nr_entries > 0)
3186                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187
3188         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189                 return trace__record(&trace, argc-1, &argv[1]);
3190
3191         /* summary_only implies summary option, but don't overwrite summary if set */
3192         if (trace.summary_only)
3193                 trace.summary = trace.summary_only;
3194
3195         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3197                 trace.trace_syscalls = true;
3198         }
3199
3200         if (output_name != NULL) {
3201                 err = trace__open_output(&trace, output_name);
3202                 if (err < 0) {
3203                         perror("failed to create output file");
3204                         goto out;
3205                 }
3206         }
3207
3208         trace.open_id = syscalltbl__id(trace.sctbl, "open");
3209
3210         err = target__validate(&trace.opts.target);
3211         if (err) {
3212                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3213                 fprintf(trace.output, "%s", bf);
3214                 goto out_close;
3215         }
3216
3217         err = target__parse_uid(&trace.opts.target);
3218         if (err) {
3219                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3220                 fprintf(trace.output, "%s", bf);
3221                 goto out_close;
3222         }
3223
3224         if (!argc && target__none(&trace.opts.target))
3225                 trace.opts.target.system_wide = true;
3226
3227         if (input_name)
3228                 err = trace__replay(&trace);
3229         else
3230                 err = trace__run(&trace, argc, argv);
3231
3232 out_close:
3233         if (output_name != NULL)
3234                 fclose(trace.output);
3235 out:
3236         return err;
3237 }