4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
62 #include "sane_ctype.h"
/*
 * Fallback definitions for older toolchain headers.
 * NOTE(review): the surrounding #ifndef O_CLOEXEC guard and the matching
 * #endif lines are elided from this excerpt.
 */
# define O_CLOEXEC 02000000
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE 1024

/*
 * Members of the per-session 'struct trace' state.
 * NOTE(review): the "struct trace {" opener and several members are elided
 * from this excerpt; field groupings below follow the visible lines only.
 */
	struct perf_tool tool;			/* event-processing callbacks (see trace__tool_process) */
	struct syscalltbl *sctbl;		/* syscall id <-> name table (see syscalltbl__name use below) */
	struct syscall *table;			/* per-syscall descriptors, grown on demand in trace__read_syscall_info */
	struct perf_evsel *sys_enter,		/* raw_syscalls:sys_enter evsel (see perf_evsel__syscall_newtp) */
	struct record_opts opts;
	struct perf_evlist *evlist;
	struct thread *current;
	struct cgroup *cgroup;
	unsigned long nr_events;
	struct strlist *ev_qualifier;		/* event qualifier names; negated when not_ev_qualifier is set */
	double duration_filter;			/* in ms: compared against NSEC-scaled durations in trace__filter_duration */
	unsigned int max_stack;
	unsigned int min_stack;
	bool not_ev_qualifier;
	bool multiple_threads;			/* when set, per-line comm/tid prefixes are printed */
	bool show_tool_stats;
	bool kernel_syscallchains;
/*
 * Accessors for a single tracepoint field: fetch it as a u64 integer or as
 * a pointer into the raw sample payload.
 * NOTE(review): these are members of 'struct tp_field'; the struct opener
 * and its offset member are elided from this excerpt.
 */
	u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
	void *(*pointer)(struct tp_field *field, struct perf_sample *sample);

/*
 * Generate tp_field__u<bits>(): memcpy the field out of the raw payload at
 * its recorded offset (memcpy avoids unaligned/strict-aliasing issues).
 * NOTE(review): the local 'value' declaration, the return statement and the
 * closing lines of this macro are elided from this excerpt.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \

/*
 * Byte-swapping variants for samples recorded on a host of the opposite
 * endianness. NOTE(review): macro body lines are partially elided here too.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

/*
 * Bind the integer accessor matching the field's size, picking the
 * byte-swapping variant when needs_swap is set.
 * NOTE(review): the case labels, break/default and return lines of the
 * switch are elided from this excerpt.
 */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
	field->offset = format_field->offset;

	switch (format_field->size) {
		field->integer = tp_field__u8;
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
185 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
187 return sample->raw_data + field->offset;
/*
 * Bind the pointer accessor for a tracepoint field.
 * NOTE(review): the function's braces and return statement are elided from
 * this excerpt.
 */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;

/*
 * Members of 'struct syscall_tp' (the per-evsel priv payload): the syscall
 * args blob and the return value fields. NOTE(review): the struct opener
 * and the 'id' member line are elided from this excerpt.
 */
	struct tp_field args, ret;

/*
 * Look up a named field in the evsel's tracepoint format and wire up an
 * integer accessor for it. Returns non-zero when the field is absent.
 * NOTE(review): the 'const char *name' parameter line, braces and the
 * error-return line are elided from this excerpt.
 */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)

	return tp_field__init_uint(field, format_field, evsel->needs_swap);

/* Convenience wrapper: the struct syscall_tp member and the format field share a name. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Pointer-accessor counterpart of perf_evsel__init_tp_uint_field(). */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)

	return tp_field__init_ptr(field, format_field);

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

/*
 * Delete an evsel together with its priv payload.
 * NOTE(review): the zfree(&evsel->priv) line is elided from this excerpt.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
	perf_evsel__delete(evsel);

/*
 * Allocate the syscall_tp priv payload, bind the 'id' field accessor and
 * install the sample handler. NOTE(review): the error paths, args/ret field
 * initialization and return lines are elided from this excerpt.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))

		evsel->handler = handler;

/*
 * Create a raw_syscalls:{sys_enter,sys_exit} tracepoint evsel, falling back
 * to the legacy "syscalls" subsystem name, then initialize its syscall_tp
 * payload. NOTE(review): the IS_ERR checks, gotos and out labels are elided
 * from this excerpt.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);

	if (perf_evsel__init_syscall_tp(evsel, handler))

	perf_evsel__delete_priv(evsel);

/* Fetch a named syscall_tp field from a sample as an integer... */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* ...or as a pointer into the raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 int idx = val - sa->offset;
293 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
294 return scnprintf(bf, size, intfmt, val);
296 return scnprintf(bf, size, "%s", sa->entries[idx]);
/*
 * Print the syscall arg via the strarray attached to arg->parm.
 * NOTE(review): the 'const char *intfmt' parameter line and the function
 * braces are elided from this excerpt.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						struct syscall_arg *arg)
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);

/* Default numeric fallback is plain decimal ("%d"). */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

/*
 * A set of strarrays searched in order (used e.g. for fcntl cmds that live
 * in two disjoint numeric ranges). NOTE(review): the struct opener and
 * nr_entries member are elided from this excerpt.
 */
	struct strarray **entries;

/* NOTE(review): the .entries initializer line of this macro is elided. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \

/*
 * Try each strarray in turn; print the symbolic name on the first in-range,
 * non-NULL hit, otherwise fall back to decimal.
 * NOTE(review): braces, the 'int i' declaration and the 'break' after a
 * NULL in-range entry are elided from this excerpt.
 */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
	struct strarrays *sas = arg->parm;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
			return scnprintf(bf, size, "%s", sa->entries[idx]);

	return scnprintf(bf, size, "%d", arg->val);

/* NOTE(review): the #ifndef AT_FDCWD guard line is elided from this excerpt. */
#define AT_FDCWD	-100

/*
 * Print *at() dirfd arguments: AT_FDCWD becomes "CWD", everything else is
 * delegated to the generic fd printer. NOTE(review): the braces and the
 * 'if (fd == AT_FDCWD)' test line are elided from this excerpt.
 */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);

#define SCA_FDAT syscall_arg__scnprintf_fd_at

/* Forward declaration: defined after thread fd-path bookkeeping below. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
368 return scnprintf(bf, size, "%#lx", arg->val);
371 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
373 return scnprintf(bf, size, "%d", arg->val);
376 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
378 return scnprintf(bf, size, "%ld", arg->val);
/*
 * Symbolic-name tables for enum-like syscall arguments, wrapped into
 * strarray descriptors by DEFINE_STRARRAY{,_OFFSET}.
 * NOTE(review): the closing "};" lines of several array initializers are
 * elided from this excerpt.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl ops start at 1 (EPOLL_CTL_ADD), hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
static DEFINE_STRARRAY(keyctl_options);

/* NOTE(review): the remaining whence names (DATA, HOLE, ...) are elided. */
static const char *whences[] = { "SET", "CUR", "END",
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
static DEFINE_STRARRAY(fcntl_cmds);

/* F_SETLEASE and friends start at F_LINUX_SPECIFIC_BASE (1024). */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, searched in order by syscall_arg__scnprintf_strarrays(). */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
static DEFINE_STRARRAY(socket_families);

/*
 * Decode access(2) mode bits as "R|W|X" (or "F" for F_OK == 0), appending
 * any unrecognized leftover bits in hex.
 * NOTE(review): the braces, local declarations, the P_MODE macro invocations
 * and #undef, and the trailing 'if (mode)' test are elided from this excerpt.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

/* Forward declaration: needs the vfs_getname machinery defined below. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/*
 * Decode pipe2(2) flags (O_CLOEXEC, O_NONBLOCK, ...) as a '|'-joined list,
 * with leftover bits in hex. NOTE(review): the P_FLAG macro invocations,
 * #undef and the trailing 'if (flags)' test are elided from this excerpt.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
	int printed = 0, flags = arg->val;

	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

/* getrandom(2) flag fallbacks for older uapi headers. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#define GRND_RANDOM	0x0002

/*
 * Decode getrandom(2) flags. NOTE(review): the P_FLAG invocations, #undef
 * and the trailing 'if (flags)' test are elided from this excerpt.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
	int printed = 0, flags = arg->val;

	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Initializer shorthand used by the syscall_fmts table below. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
551 #include "trace/beauty/arch_errno_names.c"
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/futex_op.c"
554 #include "trace/beauty/futex_val3.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
/*
 * Per-argument pretty-printer descriptor.
 * NOTE(review): the name/parm/show_zero members and closing brace are
 * elided from this excerpt.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);

/*
 * Table of per-syscall formatting overrides, bsearch'ed by name in
 * syscall_fmt__find(), so entries MUST stay alphabetically sorted.
 * NOTE(review): many "{ .name = ..." entry-opener lines, the member list of
 * struct syscall_fmt, and the closing "};" are elided from this excerpt;
 * orphaned ".arg = {...}" lines below belong to those elided entries.
 */
static struct syscall_fmt {
	struct syscall_arg_fmt arg[6];
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,	/* mode */ }, }, },
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "gettid",	    .errpid = true, },
/* ioctl cmd decoding is x86-only for now; other arches fall back to hex. */
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
795 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 const struct syscall_fmt *fmt = fmtp;
798 return strcmp(name, fmt->name);
801 static struct syscall_fmt *syscall_fmt__find(const char *name)
803 const int nmemb = ARRAY_SIZE(syscall_fmts);
804 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/*
 * Per-syscall descriptor fields (members of 'struct syscall'; the struct
 * opener and several members are elided from this excerpt).
 *
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 */
	struct event_format *tp_format;
	struct format_field *args;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? ) for duration and for the
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	/* NOTE(review): the 'if (!calculated)' guard line is elided here. */
		printed += fprintf(fp, " ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");

/*
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;			/* major/minor page fault counters */
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
		short int     entry_str_pos;
		unsigned int  namelen;
	struct intlist	  *syscall_stats;

/*
 * Allocate a zeroed thread_trace; paths.max == -1 means "no fd path table
 * yet" (see trace__set_fd_pathname). NOTE(review): braces, the NULL check
 * and return lines are elided from this excerpt.
 */
static struct thread_trace *thread_trace__new(void)
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

		ttrace->paths.max = -1;

	ttrace->syscall_stats = intlist__new(NULL);

/*
 * Get (lazily creating) the thread's thread_trace, warning once on FILE fp
 * when allocation fails. NOTE(review): braces, the NULL thread check, the
 * event accounting and the fail path lines are elided from this excerpt.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
	struct thread_trace *ttrace;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)

	ttrace = thread__priv(thread);

	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");

/*
 * Let an argument formatter override how this syscall's return value will
 * be printed. NOTE(review): braces and the ttrace NULL guard are elided.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;

/* Page-fault trace selection bits. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

/*
 * Remember 'pathname' for fd 'fd', growing the per-thread fd->path table as
 * needed (new slots are zeroed). NOTE(review): braces, the realloc NULL
 * check and the else branch structure are elided from this excerpt.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
			memset(npath, 0, (fd + 1) * sizeof(char *));

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;

/*
 * Resolve an fd to its path via /proc/<pid>[/task/<tid>]/fd/<fd> readlink
 * and cache it. The lstat size check guards the readlink buffer.
 * NOTE(review): braces, local 'ret'/'struct stat st' declarations and
 * error-return lines are elided from this excerpt.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
	char linkname[PATH_MAX], pathname[PATH_MAX];

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);

/*
 * Cached fd->path lookup, reading from /proc on a miss (and counting the
 * lookup in trace->stats). NOTE(review): braces, guards for negative fd /
 * NULL ttrace and error returns are elided from this excerpt.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
	struct thread_trace *ttrace = thread__priv(thread);

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {

		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))

	return ttrace->paths.table[fd];

/*
 * Print an fd argument as "<num><path>" when the path can be resolved.
 * NOTE(review): braces, the 'int fd = arg->val' line and the path NULL
 * check are elided from this excerpt.
 */
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

/*
 * Like syscall_arg__scnprintf_fd() but resolves the thread from a pid,
 * dropping the thread reference afterwards. NOTE(review): braces, NULL
 * checks and the return line are elided from this excerpt.
 */
size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

		const char *path = thread__fd_path(thread, fd, trace);

			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);

/*
 * close(2)'s fd printer: print the fd (with path), then drop the cached
 * path since the fd is going away. NOTE(review): braces, the 'int fd'
 * declaration and the return line are elided from this excerpt.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

/*
 * Remember where in the entry string a filename pointer was printed, so the
 * later vfs_getname event can splice the resolved name in.
 * NOTE(review): braces and the 'unsigned long ptr' parameter are elided.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;

/*
 * Print a filename argument: raw pointer when vfs_getname isn't available,
 * otherwise record the position for later splicing.
 * NOTE(review): braces and trailing lines are elided from this excerpt.
 * NOTE(review): "%#x" is given an unsigned long 'ptr' — looks like a
 * format/argument width mismatch on LP64; confirm against upstream.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
1061 static bool trace__filter_duration(struct trace *trace, double t)
1063 return t < (trace->duration_filter * NSEC_PER_MSEC);
1066 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1068 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1070 return fprintf(fp, "%10.3f ", ts);
/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 * NOTE(review): braces and the 'if (tstamp)' guard line are elided below.
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");

/* Main-loop termination flags, flipped from the signal handler. */
static bool done = false;
static bool interrupted = false;

/* NOTE(review): braces and the 'done = true' line are elided here. */
static void sig_handler(int sig)
	interrupted = sig == SIGINT;

/*
 * Print the per-event line prefix: timestamp, duration and, with multiple
 * threads, "comm/tid ". NOTE(review): braces and the 'return printed' line
 * are elided from this excerpt.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);

/*
 * Handle side-band events: report lost events loudly, delegate everything
 * else to the generic machine handler. NOTE(review): braces, the 'int ret'
 * declaration, break/default and return lines are elided from this excerpt.
 */
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		ret = machine__process_event(machine, event, sample);
1130 static int trace__tool_process(struct perf_tool *tool,
1131 union perf_event *event,
1132 struct perf_sample *sample,
1133 struct machine *machine)
1135 struct trace *trace = container_of(tool, struct trace, tool);
1136 return trace__process_event(trace, machine, event, sample);
/*
 * Kernel address resolver that warns once when kptr_restrict hides
 * /proc/kallsyms, then falls back to the generic resolver.
 * NOTE(review): braces, the 'return NULL' after the warning and the
 * early-return body are elided from this excerpt.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;

	return machine__resolve_kernel_addr(vmachine, addrp, modp);

/*
 * Set up symbol resolution for the live host machine and synthesize
 * existing threads so samples can be attributed.
 * NOTE(review): braces, error-check/goto lines and the return are elided
 * from this excerpt.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
	int err = symbol__init(NULL);

	trace->host = machine__new_host();
	if (trace->host == NULL)

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);

/* Teardown counterpart of trace__symbols_init(). NOTE(review): braces and
 * the symbol__exit()/host NULLing lines are elided from this excerpt. */
static void trace__symbols__exit(struct trace *trace)
	machine__exit(trace->host);

/*
 * Allocate the per-argument formatter array for a syscall, copying any
 * static overrides from its syscall_fmt entry. When the tracepoint format
 * is unavailable, callers pass nr_args == 6 (the syscall ABI maximum) and
 * the fmt's own nr_args wins if set.
 * NOTE(review): braces, the 'int idx' declaration, the fmt NULL check
 * inside the loop and the return lines are elided from this excerpt.
 */
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)

	for (idx = 0; idx < nr_args; ++idx) {
			sc->arg_fmt[idx] = sc->fmt->arg[idx];

	sc->nr_args = nr_args;

/*
 * Pick default pretty-printers for each argument from its tracepoint field
 * type/name: known filename args, pointers, pid_t, umode_t, and integer
 * fields whose name ends in "fd".
 * NOTE(review): braces, 'int idx/len' declarations, 'continue' after the
 * static-override check and the return line are elided from this excerpt.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
	struct format_field *field;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* Respect an explicit per-syscall override from syscall_fmts[]. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed,
 * look up the syscall name and static format, read the tracepoint
 * /format file (trying the fmt->alias name as a fallback), allocate the
 * per-argument formatters and pick defaults for them.
 */
1249 static int trace__read_syscall_info(struct trace *trace, int id)
1253 const char *name = syscalltbl__name(trace->sctbl, id);
1258 if (id > trace->syscalls.max) {
1259 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1261 if (nsyscalls == NULL)
/* Zero only the newly grown tail (or everything on first allocation). */
1264 if (trace->syscalls.max != -1) {
1265 memset(nsyscalls + trace->syscalls.max + 1, 0,
1266 (id - trace->syscalls.max) * sizeof(*sc));
1268 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1271 trace->syscalls.table = nsyscalls;
1272 trace->syscalls.max = id;
1275 sc = trace->syscalls.table + id;
1278 sc->fmt = syscall_fmt__find(sc->name);
1280 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1281 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Some syscalls are exposed under a different tracepoint name (alias). */
1283 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1284 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1285 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Without a /format file, fall back to the generic 6-arg maximum. */
1288 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1291 if (IS_ERR(sc->tp_format))
1294 sc->args = sc->tp_format->format.fields;
1296 * We need to check and discard the first variable '__syscall_nr'
1297 * or 'nr' that mean the syscall number. It is needless here.
1298 * So drop the '__syscall_nr' or 'nr' field; note it does not exist on older kernels.
1300 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1301 sc->args = sc->args->next;
1305 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1306 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1308 return syscall__set_arg_fmts(sc);
/*
 * Turn the -e/--expr syscall-name qualifier strlist into an array of
 * syscall ids (trace->ev_qualifier_ids). Entries may be globs, in which
 * case all matches are collected, growing the array as needed. On any
 * invalid name, print the offenders plus a hint and free everything.
 */
1311 static int trace__validate_ev_qualifier(struct trace *trace)
1314 size_t nr_allocated;
1315 struct str_node *pos;
1317 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1318 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1319 sizeof(trace->ev_qualifier_ids.entries[0]));
1321 if (trace->ev_qualifier_ids.entries == NULL) {
1322 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1328 nr_allocated = trace->ev_qualifier_ids.nr;
1331 strlist__for_each_entry(pos, trace->ev_qualifier) {
1332 const char *sc = pos->s;
/* Exact lookup first; on miss, try glob matching. */
1333 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1336 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1341 fputs("Error:\tInvalid syscall ", trace->output);
1344 fputs(", ", trace->output);
1347 fputs(sc, trace->output);
1350 trace->ev_qualifier_ids.entries[i++] = id;
1351 if (match_next == -1)
/* Collect every additional glob match, growing the id array on demand. */
1355 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1358 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1362 entries = realloc(trace->ev_qualifier_ids.entries,
1363 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1364 if (entries == NULL) {
1366 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1369 trace->ev_qualifier_ids.entries = entries;
1371 trace->ev_qualifier_ids.nr++;
1372 trace->ev_qualifier_ids.entries[i++] = id;
1377 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1378 "\nHint:\tand: 'man syscalls'\n", trace->output);
1380 zfree(&trace->ev_qualifier_ids.entries);
1381 trace->ev_qualifier_ids.nr = 0;
1388 * args is to be interpreted as a series of longs but we need to handle
1389 * 8-byte unaligned accesses. args points to raw_data within the event
1390 * and raw_data is not guaranteed to be 8-byte aligned because it is
1391 * preceded by raw_size which is a u32. So we need to copy args to a temp
1392 * variable to read it. Most notably this avoids extended load instructions
1393 * on unaligned addresses
/* Fetch the idx-th raw syscall argument (see alignment note above). */
1395 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1398 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1400 memcpy(&val, p, sizeof(val));
/* Print the argument's name ("name: "), or "argN: " when it is unknown. */
1404 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1405 struct syscall_arg *arg)
1407 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1408 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1410 return scnprintf(bf, size, "arg%d: ", arg->idx);
/*
 * Print one argument value: use the per-argument beautifier when one was
 * selected (passing along its parm cookie), otherwise a plain "%ld".
 */
1413 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1414 struct syscall_arg *arg, unsigned long val)
1416 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1418 if (sc->arg_fmt[arg->idx].parm)
1419 arg->parm = sc->arg_fmt[arg->idx].parm;
1420 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1422 return scnprintf(bf, size, "%ld", val);
/*
 * Format all arguments of a syscall entry into bf. Uses the tracepoint
 * field list when it was read (pretty names + type-aware beautifiers,
 * suppressing most zero-valued args), and falls back to printing raw
 * "argN: value" pairs when the /format file could not be read.
 */
1425 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1426 unsigned char *args, struct trace *trace,
1427 struct thread *thread)
1432 struct syscall_arg arg = {
1439 struct thread_trace *ttrace = thread__priv(thread);
1442 * Things like fcntl will set this in its 'cmd' formatter to pick the
1443 * right formatter for the return value (an fd? file flags?), which is
1444 * not needed for syscalls that always return a given type, say an fd.
1446 ttrace->ret_scnprintf = NULL;
1448 if (sc->args != NULL) {
1449 struct format_field *field;
1451 for (field = sc->args; field;
1452 field = field->next, ++arg.idx, bit <<= 1) {
1456 val = syscall_arg__val(&arg, arg.idx);
1459 * Suppress this argument if its value is zero and
1460 * and we don't have a string associated in an
/* show_zero or a strarray mapping forces zero values to be printed. */
1465 (sc->arg_fmt[arg.idx].show_zero ||
1466 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1467 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1468 sc->arg_fmt[arg.idx].parm))
1471 printed += scnprintf(bf + printed, size - printed,
1472 "%s%s: ", printed ? ", " : "", field->name);
1473 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1475 } else if (IS_ERR(sc->tp_format)) {
1477 * If we managed to read the tracepoint /format file, then we
1478 * may end up not having any args, like with gettid(), so only
1479 * print the raw args when we didn't manage to read it.
1481 while (arg.idx < sc->nr_args) {
1484 val = syscall_arg__val(&arg, arg.idx);
1486 printed += scnprintf(bf + printed, size - printed, ", ");
1487 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1488 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
/* Signature of the per-tracepoint sample handlers stored in evsel->handler. */
1498 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1499 union perf_event *event,
1500 struct perf_sample *sample);
/*
 * Return the struct syscall for 'id', lazily reading its info on first
 * use. Returns NULL (after printing a diagnostic) for invalid ids or
 * when the syscall information could not be read.
 */
1502 static struct syscall *trace__syscall_info(struct trace *trace,
1503 struct perf_evsel *evsel, int id)
1509 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1510 * before that, leaving at a higher verbosity level till that is
1511 * explained. Reproduced with plain ftrace with:
1513 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1514 * grep "NR -1 " /t/trace_pipe
1516 * After generating some load on the machine.
1520 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1521 id, perf_evsel__name(evsel), ++n);
/* First time this id is seen: read its name/format/beautifiers. */
1526 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1527 trace__read_syscall_info(trace, id))
1530 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1533 return &trace->syscalls.table[id];
1537 fprintf(trace->output, "Problems reading syscall %d", id);
1538 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1539 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1540 fputs(" information\n", trace->output);
/*
 * Update the per-thread, per-syscall-id duration statistics used by the
 * --summary output. Stats are kept in an intlist keyed by syscall id,
 * allocated lazily on first sample for that id.
 */
1545 static void thread__update_stats(struct thread_trace *ttrace,
1546 int id, struct perf_sample *sample)
1548 struct int_node *inode;
1549 struct stats *stats;
1552 inode = intlist__findnew(ttrace->syscall_stats, id);
1556 stats = inode->priv;
1557 if (stats == NULL) {
1558 stats = malloc(sizeof(struct stats));
1562 inode->priv = stats;
/* Only count a duration when we saw the matching sys_enter. */
1565 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1566 duration = sample->time - ttrace->entry_time;
1568 update_stats(stats, duration);
/*
 * If the current thread has a sys_enter line pending (no sys_exit seen
 * yet) when another event arrives, flush it with a trailing ") ..." so
 * output stays readable; the exit side will print "continued" later.
 */
1571 static int trace__printf_interrupted_entry(struct trace *trace)
1573 struct thread_trace *ttrace;
1576 if (trace->failure_only || trace->current == NULL)
1579 ttrace = thread__priv(trace->current);
1581 if (!ttrace->entry_pending)
1584 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1585 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1586 ttrace->entry_pending = false;
/* Debug helper: with --print-sample, dump raw sample metadata (evsel,
 * timestamp in ms, comm, pid/tid, cpu) before the formatted line. */
1591 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1592 struct perf_sample *sample, struct thread *thread)
1596 if (trace->print_sample) {
1597 double ts = (double)sample->time / NSEC_PER_MSEC;
1599 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1600 perf_evsel__name(evsel), ts,
1601 thread__comm_str(thread),
1602 sample->pid, sample->tid, sample->cpu);
/*
 * Handler for raw_syscalls:sys_enter: format the syscall name and args
 * into the thread's entry_str. For most syscalls the line is held
 * pending until sys_exit (so duration can be printed); it is printed
 * immediately only when no duration/summary/stack filtering applies.
 */
1608 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1609 union perf_event *event __maybe_unused,
1610 struct perf_sample *sample)
1615 struct thread *thread;
1616 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1617 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1618 struct thread_trace *ttrace;
1623 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1624 ttrace = thread__trace(thread, trace->output);
1628 trace__fprintf_sample(trace, evsel, sample, thread);
1630 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the per-thread buffer holding the pending entry line. */
1632 if (ttrace->entry_str == NULL) {
1633 ttrace->entry_str = malloc(trace__entry_str_size);
1634 if (!ttrace->entry_str)
1638 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1639 trace__printf_interrupted_entry(trace);
1641 ttrace->entry_time = sample->time;
1642 msg = ttrace->entry_str;
1643 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1645 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1646 args, trace, thread);
1649 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1650 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1651 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1654 ttrace->entry_pending = true;
1655 /* See trace__vfs_getname & trace__sys_exit */
1656 ttrace->filename.pending_open = false;
/* Remember which thread is mid-syscall so interruptions can be flagged. */
1659 if (trace->current != thread) {
1660 thread__put(trace->current);
1661 trace->current = thread__get(thread);
1665 thread__put(thread);
/*
 * Pretty-print the arguments of a syscalls:sys_enter_NAME tracepoint
 * (used via --event rather than the raw_syscalls path), reusing the
 * same beautifiers as strace-style output.
 */
1669 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1670 struct perf_sample *sample)
1672 struct format_field *field = perf_evsel__field(evsel, "__syscall_nr");
1673 struct thread_trace *ttrace;
1674 struct thread *thread;
1683 id = format_field__intval(field, sample, evsel->needs_swap);
1684 sc = trace__syscall_info(trace, evsel, id);
1689 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1690 ttrace = thread__trace(thread, trace->output);
1692 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1693 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1698 args = sample->raw_data + field->offset + sizeof(u64); /* skip __syscall_nr, there is where args are */
1699 syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1700 fprintf(trace->output, "%s", msg);
1703 thread__put(thread);
/*
 * Resolve the sample's callchain into 'cursor'. The per-event
 * sample_max_stack (when set) overrides the global trace->max_stack.
 */
1707 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1708 struct perf_sample *sample,
1709 struct callchain_cursor *cursor)
1711 struct addr_location al;
1712 int max_stack = evsel->attr.sample_max_stack ?
1713 evsel->attr.sample_max_stack :
1716 if (machine__resolve(trace->host, &al, sample) < 0 ||
1717 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
/* Print the already-resolved callchain; 38 is the left-pad column used
 * to align frames under the event line. */
1723 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1725 /* TODO: user-configurable print_opts */
1726 const unsigned int print_opts = EVSEL__PRINT_SYM |
1728 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1730 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/* Map a (positive) errno value to its symbolic name for the arch the
 * perf.data/session was recorded on, not necessarily the host arch. */
1733 static const char *errno_to_name(struct perf_evsel *evsel, int err)
1735 struct perf_env *env = perf_evsel__env(evsel);
1736 const char *arch_name = perf_env__arch(env);
1738 return arch_syscalls__strerrno(arch_name, err);
/*
 * Handler for raw_syscalls:sys_exit: compute the syscall duration,
 * update summary stats, resolve fd<->pathname for open/openat, print
 * the (possibly pending) entry plus the formatted return value, and
 * optionally the callchain.
 */
1741 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1742 union perf_event *event __maybe_unused,
1743 struct perf_sample *sample)
1747 bool duration_calculated = false;
1748 struct thread *thread;
1749 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1750 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1751 struct thread_trace *ttrace;
1756 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1757 ttrace = thread__trace(thread, trace->output);
1761 trace__fprintf_sample(trace, evsel, sample, thread);
1764 thread__update_stats(ttrace, id, sample);
1766 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* A successful open/openat: associate the returned fd with the pathname
 * captured by the vfs_getname probe (see trace__vfs_getname). */
1768 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1769 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1770 ttrace->filename.pending_open = false;
1771 ++trace->stats.vfs_getname;
1774 if (ttrace->entry_time) {
1775 duration = sample->time - ttrace->entry_time;
1776 if (trace__filter_duration(trace, duration))
1778 duration_calculated = true;
1779 } else if (trace->duration_filter)
1782 if (sample->callchain) {
1783 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1784 if (callchain_ret == 0) {
1785 if (callchain_cursor.nr < trace->min_stack)
1791 if (trace->summary_only || (ret >= 0 && trace->failure_only))
1794 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1796 if (ttrace->entry_pending) {
1797 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* The entry line was already flushed (interrupted): mark continuation. */
1799 fprintf(trace->output, " ... [");
1800 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1801 fprintf(trace->output, "]: %s()", sc->name);
1804 if (sc->fmt == NULL) {
1808 fprintf(trace->output, ") = %ld", ret);
1809 } else if (ret < 0) {
/* Negative return: decode as -1 <ERRNAME> <strerror text>. */
1811 char bf[STRERR_BUFSIZE];
1812 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1813 *e = errno_to_name(evsel, -ret);
1815 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1817 } else if (ret == 0 && sc->fmt->timeout)
1818 fprintf(trace->output, ") = 0 Timeout");
1819 else if (ttrace->ret_scnprintf) {
/* One-shot return-value formatter installed by an arg beautifier
 * (e.g. fcntl's 'cmd'); consumed here and reset. */
1821 struct syscall_arg arg = {
1826 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1827 ttrace->ret_scnprintf = NULL;
1828 fprintf(trace->output, ") = %s", bf);
1829 } else if (sc->fmt->hexret)
1830 fprintf(trace->output, ") = %#lx", ret);
1831 else if (sc->fmt->errpid) {
/* Return value is a pid (fork/clone/wait*): show the child's comm. */
1832 struct thread *child = machine__find_thread(trace->host, ret, ret);
1834 if (child != NULL) {
1835 fprintf(trace->output, ") = %ld", ret);
1836 if (child->comm_set)
1837 fprintf(trace->output, " (%s)", thread__comm_str(child));
1843 fputc('\n', trace->output);
1845 if (callchain_ret > 0)
1846 trace__fprintf_callchain(trace, sample);
1847 else if (callchain_ret < 0)
1848 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1850 ttrace->entry_pending = false;
1853 thread__put(thread);
/*
 * Handler for the probe:vfs_getname probe: stash the pathname resolved
 * inside the kernel so it can be (a) attached to the fd returned by
 * open/openat at sys_exit time and (b) spliced into the pending
 * sys_enter line in place of the raw pointer value.
 */
1857 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1858 union perf_event *event __maybe_unused,
1859 struct perf_sample *sample)
1861 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1862 struct thread_trace *ttrace;
1863 size_t filename_len, entry_str_len, to_move;
1864 ssize_t remaining_space;
1866 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1871 ttrace = thread__priv(thread);
1875 filename_len = strlen(filename);
1876 if (filename_len == 0)
/* Grow the per-thread filename buffer on demand. */
1879 if (ttrace->filename.namelen < filename_len) {
1880 char *f = realloc(ttrace->filename.name, filename_len + 1);
1885 ttrace->filename.namelen = filename_len;
1886 ttrace->filename.name = f;
1889 strcpy(ttrace->filename.name, filename);
1890 ttrace->filename.pending_open = true;
/* No pointer recorded at sys_enter: nothing to splice into entry_str. */
1892 if (!ttrace->filename.ptr)
1895 entry_str_len = strlen(ttrace->entry_str);
1896 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1897 if (remaining_space <= 0)
/* Keep the tail of the path when it doesn't fit whole. */
1900 if (filename_len > (size_t)remaining_space) {
1901 filename += filename_len - remaining_space;
1902 filename_len = remaining_space;
/* Open a gap at entry_str_pos and insert the filename there. */
1905 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1906 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1907 memmove(pos + filename_len, pos, to_move);
1908 memcpy(pos, filename, filename_len);
1910 ttrace->filename.ptr = 0;
1911 ttrace->filename.entry_str_pos = 0;
1913 thread__put(thread);
/*
 * Handler for sched:sched_stat_runtime: accumulate per-thread and
 * global CPU runtime (ms) for the summary. The trailing fprintf is a
 * fallback/debug path — presumably reached when the thread can't be
 * tracked (elided lines); confirm against full file.
 */
1918 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1919 union perf_event *event __maybe_unused,
1920 struct perf_sample *sample)
1922 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1923 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1924 struct thread *thread = machine__findnew_thread(trace->host,
1927 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1932 ttrace->runtime_ms += runtime_ms;
1933 trace->runtime_ms += runtime_ms;
1935 thread__put(thread);
1939 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1941 perf_evsel__strval(evsel, sample, "comm"),
1942 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1944 perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * binary__fprintf() callback for BPF output events: print printable
 * bytes as-is and everything else as '.'; all other printer ops are
 * no-ops here.
 */
1948 static int bpf_output__printer(enum binary_printer_ops op,
1949 unsigned int val, void *extra __maybe_unused, FILE *fp)
1951 unsigned char ch = (unsigned char)val;
1954 case BINARY_PRINT_CHAR_DATA:
1955 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1956 case BINARY_PRINT_DATA_BEGIN:
1957 case BINARY_PRINT_LINE_BEGIN:
1958 case BINARY_PRINT_ADDR:
1959 case BINARY_PRINT_NUM_DATA:
1960 case BINARY_PRINT_NUM_PAD:
1961 case BINARY_PRINT_SEP:
1962 case BINARY_PRINT_CHAR_PAD:
1963 case BINARY_PRINT_LINE_END:
1964 case BINARY_PRINT_DATA_END:
/* Hexdump-style print of a BPF output event's raw payload, 8 bytes/line. */
1972 static void bpf_output__fprintf(struct trace *trace,
1973 struct perf_sample *sample)
1975 binary__fprintf(sample->raw_data, sample->raw_size, 8,
1976 bpf_output__printer, NULL, trace->output);
/*
 * Generic handler for --event tracepoints/software events that have no
 * dedicated handler: prints timestamp, event name and either the BPF
 * payload, a beautified sys_enter_* line, or the libtraceevent-formatted
 * fields, plus an optional callchain.
 */
1979 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1980 union perf_event *event __maybe_unused,
1981 struct perf_sample *sample)
1983 int callchain_ret = 0;
1985 if (sample->callchain) {
1986 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1987 if (callchain_ret == 0) {
1988 if (callchain_cursor.nr < trace->min_stack)
1994 trace__printf_interrupted_entry(trace);
1995 trace__fprintf_tstamp(trace, sample->time, trace->output);
/* Placeholder "( ): " keeps columns aligned with syscall lines. */
1997 if (trace->trace_syscalls)
1998 fprintf(trace->output, "( ): ");
2000 fprintf(trace->output, "%s:", evsel->name);
2002 if (perf_evsel__is_bpf_output(evsel)) {
2003 bpf_output__fprintf(trace, sample);
2004 } else if (evsel->tp_format) {
/* sys_enter_* events get the beautified path; anything else (or a
 * beautify failure) falls back to libtraceevent's generic printer. */
2005 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2006 trace__fprintf_sys_enter(trace, evsel, sample)) {
2007 event_format__fprintf(evsel->tp_format, sample->cpu,
2008 sample->raw_data, sample->raw_size,
2013 fprintf(trace->output, "\n");
2015 if (callchain_ret > 0)
2016 trace__fprintf_callchain(trace, sample);
2017 else if (callchain_ret < 0)
2018 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved address location for page-fault lines: optionally
 * "dso@" and "symbol+0xoff", falling back to the raw map address or,
 * lacking even a map, the raw sample address.
 */
2023 static void print_location(FILE *f, struct perf_sample *sample,
2024 struct addr_location *al,
2025 bool print_dso, bool print_sym)
2028 if ((verbose > 0 || print_dso) && al->map)
2029 fprintf(f, "%s@", al->map->dso->long_name);
2031 if ((verbose > 0 || print_sym) && al->sym)
2032 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2033 al->addr - al->sym->start);
2035 fprintf(f, "0x%" PRIx64, al->addr);
2037 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Handler for page-fault software events (major/minor): print where the
 * fault happened (ip) and the faulting address, resolved to symbols
 * where possible, plus an optional callchain.
 */
2040 static int trace__pgfault(struct trace *trace,
2041 struct perf_evsel *evsel,
2042 union perf_event *event __maybe_unused,
2043 struct perf_sample *sample)
2045 struct thread *thread;
2046 struct addr_location al;
2047 char map_type = 'd';
2048 struct thread_trace *ttrace;
2050 int callchain_ret = 0;
2052 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2054 if (sample->callchain) {
2055 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2056 if (callchain_ret == 0) {
2057 if (callchain_cursor.nr < trace->min_stack)
2063 ttrace = thread__trace(thread, trace->output);
/* Counter updates for major vs minor faults — target lines elided here. */
2067 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2072 if (trace->summary_only)
2075 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2077 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2079 fprintf(trace->output, "%sfault [",
2080 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2083 print_location(trace->output, sample, &al, false, true);
2085 fprintf(trace->output, "] => ");
2087 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2090 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2098 print_location(trace->output, sample, &al, true, false);
2100 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2102 if (callchain_ret > 0)
2103 trace__fprintf_callchain(trace, sample);
2104 else if (callchain_ret < 0)
2105 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2109 thread__put(thread);
/*
 * Latch the first usable sample timestamp as the session's base time
 * (relative timestamps are printed against it), unless --time (full_time)
 * was requested.
 */
2113 static void trace__set_base_time(struct trace *trace,
2114 struct perf_evsel *evsel,
2115 struct perf_sample *sample)
2118 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2119 * and don't use sample->time unconditionally, we may end up having
2120 * some other event in the future without PERF_SAMPLE_TIME for good
2121 * reason, i.e. we may not be interested in its timestamps, just in
2122 * it taking place, picking some piece of information when it
2123 * appears in our event stream (vfs_getname comes to mind).
2125 if (trace->base_time == 0 && !trace->full_time &&
2126 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2127 trace->base_time = sample->time;
/*
 * perf_tool sample callback used when replaying a perf.data session:
 * skip filtered threads, latch the base time, then dispatch to the
 * evsel's tracepoint handler.
 */
2130 static int trace__process_sample(struct perf_tool *tool,
2131 union perf_event *event,
2132 struct perf_sample *sample,
2133 struct perf_evsel *evsel,
2134 struct machine *machine __maybe_unused)
2136 struct trace *trace = container_of(tool, struct trace, tool);
2137 struct thread *thread;
2140 tracepoint_handler handler = evsel->handler;
2142 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2143 if (thread && thread__is_filtered(thread))
2146 trace__set_base_time(trace, evsel, sample);
2150 handler(trace, evsel, event, sample);
2153 thread__put(thread);
/*
 * 'perf trace record': build an argv for 'perf record' that adds the
 * raw_syscalls (or legacy syscalls) enter/exit tracepoints and the
 * requested page-fault events, then hand off to cmd_record().
 */
2157 static int trace__record(struct trace *trace, int argc, const char **argv)
2159 unsigned int rec_argc, i, j;
2160 const char **rec_argv;
2161 const char * const record_args[] = {
2168 const char * const sc_args[] = { "-e", };
2169 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2170 const char * const majpf_args[] = { "-e", "major-faults" };
2171 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2172 const char * const minpf_args[] = { "-e", "minor-faults" };
2173 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2175 /* +1 is for the event string below */
2176 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2177 majpf_args_nr + minpf_args_nr + argc;
2178 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2180 if (rec_argv == NULL)
2184 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2185 rec_argv[j++] = record_args[i];
2187 if (trace->trace_syscalls) {
2188 for (i = 0; i < sc_args_nr; i++)
2189 rec_argv[j++] = sc_args[i];
2191 /* event string may be different for older kernels - e.g., RHEL6 */
2192 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2193 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2194 else if (is_valid_tracepoint("syscalls:sys_enter"))
2195 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2197 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2203 if (trace->trace_pgfaults & TRACE_PFMAJ)
2204 for (i = 0; i < majpf_args_nr; i++)
2205 rec_argv[j++] = majpf_args[i];
2207 if (trace->trace_pgfaults & TRACE_PFMIN)
2208 for (i = 0; i < minpf_args_nr; i++)
2209 rec_argv[j++] = minpf_args[i];
2211 for (i = 0; i < (unsigned int)argc; i++)
2212 rec_argv[j++] = argv[i];
2214 return cmd_record(j, rec_argv);
/* Forward declaration: defined later, needed by trace__run()'s summary path. */
2217 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname probe event (must have been set up
 * beforehand, e.g. via 'perf probe'); rejects probes lacking the
 * 'pathname' field. Returns whether the evsel was added.
 */
2219 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2221 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2226 if (perf_evsel__field(evsel, "pathname") == NULL) {
2227 perf_evsel__delete(evsel);
2231 evsel->handler = trace__vfs_getname;
2232 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config is MAJ or MIN) sampling
 * every fault (period 1), with the pgfault handler installed.
 */
2236 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2238 struct perf_evsel *evsel;
2239 struct perf_event_attr attr = {
2240 .type = PERF_TYPE_SOFTWARE,
2244 attr.config = config;
2245 attr.sample_period = 1;
2247 event_attr_init(&attr);
2249 evsel = perf_evsel__new(&attr);
2251 evsel->handler = trace__pgfault;
/*
 * Dispatch one mmap'ed event: non-sample records go to the generic
 * event processor; samples are mapped to their evsel by id, sanity
 * checked (tracepoints must carry a payload) and handed to the evsel's
 * handler.
 */
2256 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2258 const u32 type = event->header.type;
2259 struct perf_evsel *evsel;
2261 if (type != PERF_RECORD_SAMPLE) {
2262 trace__process_event(trace, trace->host, event, sample);
2266 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2267 if (evsel == NULL) {
2268 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2272 trace__set_base_time(trace, evsel, sample);
2274 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2275 sample->raw_data == NULL) {
2276 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2277 perf_evsel__name(evsel), sample->tid,
2278 sample->cpu, sample->raw_size);
2280 tracepoint_handler handler = evsel->handler;
2281 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wire up their handlers and cached field offsets, configure callchains,
 * and stash them in trace->syscalls.events. Uses goto-unwind cleanup on
 * failure.
 */
2285 static int trace__add_syscall_newtp(struct trace *trace)
2288 struct perf_evlist *evlist = trace->evlist;
2289 struct perf_evsel *sys_enter, *sys_exit;
2291 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2292 if (sys_enter == NULL)
2295 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2296 goto out_delete_sys_enter;
2298 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2299 if (sys_exit == NULL)
2300 goto out_delete_sys_enter;
2302 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2303 goto out_delete_sys_exit;
2305 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2306 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2308 perf_evlist__add(evlist, sys_enter);
2309 perf_evlist__add(evlist, sys_exit);
2311 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2313 * We're interested only in the user space callchain
2314 * leading to the syscall, allow overriding that for
2315 * debugging reasons using --kernel_syscall_callchains
2317 sys_exit->attr.exclude_callchain_kernel = 1;
2320 trace->syscalls.events.sys_enter = sys_enter;
2321 trace->syscalls.events.sys_exit = sys_exit;
2327 out_delete_sys_exit:
2328 perf_evsel__delete_priv(sys_exit);
2329 out_delete_sys_enter:
2330 perf_evsel__delete_priv(sys_enter);
/*
 * Translate the validated ev_qualifier id list into an "id in (...)"
 * (or "id not in (...)" when negated) tracepoint filter and append it
 * to both the sys_enter and sys_exit evsels.
 */
2334 static int trace__set_ev_qualifier_filter(struct trace *trace)
2337 struct perf_evsel *sys_exit;
2338 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2339 trace->ev_qualifier_ids.nr,
2340 trace->ev_qualifier_ids.entries);
2345 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2347 sys_exit = trace->syscalls.events.sys_exit;
2348 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Filter out pids that would create a feedback loop when tracing
 * system-wide: perf itself (pids[0], set before this excerpt) plus any
 * sshd ancestor of the current session, since tracing the terminal's
 * transport would generate events for every line of output.
 */
2359 static int trace__set_filter_loop_pids(struct trace *trace)
2361 unsigned int nr = 1;
2365 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2367 while (thread && nr < ARRAY_SIZE(pids)) {
2368 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2373 if (!strcmp(thread__comm_str(parent), "sshd")) {
2374 pids[nr++] = parent->tid;
2380 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2383 static int trace__run(struct trace *trace, int argc, const char **argv)
2385 struct perf_evlist *evlist = trace->evlist;
2386 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2388 unsigned long before;
2389 const bool forks = argc > 0;
2390 bool draining = false;
2394 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2395 goto out_error_raw_syscalls;
2397 if (trace->trace_syscalls)
2398 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2400 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2401 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2402 if (pgfault_maj == NULL)
2404 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2405 perf_evlist__add(evlist, pgfault_maj);
2408 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2409 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2410 if (pgfault_min == NULL)
2412 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2413 perf_evlist__add(evlist, pgfault_min);
2417 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2418 trace__sched_stat_runtime))
2419 goto out_error_sched_stat_runtime;
2422 * If a global cgroup was set, apply it to all the events without an
2423 * explicit cgroup. I.e.:
2425 * trace -G A -e sched:*switch
2427 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2428 * _and_ sched:sched_switch to the 'A' cgroup, while:
2430 * trace -e sched:*switch -G A
2432 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2433 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2434 * a cgroup (on the root cgroup, sys wide, etc).
2438 * trace -G A -e sched:*switch -G B
2440 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2441 * to the 'B' cgroup.
2443 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2444 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2447 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2449 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2451 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2452 goto out_delete_evlist;
2455 err = trace__symbols_init(trace, evlist);
2457 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2458 goto out_delete_evlist;
2461 perf_evlist__config(evlist, &trace->opts, &callchain_param);
2463 signal(SIGCHLD, sig_handler);
2464 signal(SIGINT, sig_handler);
2467 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2470 fprintf(trace->output, "Couldn't run the workload!\n");
2471 goto out_delete_evlist;
2475 err = perf_evlist__open(evlist);
2477 goto out_error_open;
2479 err = bpf__apply_obj_config();
2481 char errbuf[BUFSIZ];
2483 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2484 pr_err("ERROR: Apply config to BPF failed: %s\n",
2486 goto out_error_open;
2490 * Better not use !target__has_task() here because we need to cover the
2491 * case where no threads were specified in the command line, but a
2492 * workload was, and in that case we will fill in the thread_map when
2493 * we fork the workload in perf_evlist__prepare_workload.
2495 if (trace->filter_pids.nr > 0)
2496 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2497 else if (thread_map__pid(evlist->threads, 0) == -1)
2498 err = trace__set_filter_loop_pids(trace);
2503 if (trace->ev_qualifier_ids.nr > 0) {
2504 err = trace__set_ev_qualifier_filter(trace);
2508 pr_debug("event qualifier tracepoint filter: %s\n",
2509 trace->syscalls.events.sys_exit->filter);
2512 err = perf_evlist__apply_filters(evlist, &evsel);
2514 goto out_error_apply_filters;
2516 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2518 goto out_error_mmap;
2520 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2521 perf_evlist__enable(evlist);
2524 perf_evlist__start_workload(evlist);
2526 if (trace->opts.initial_delay) {
2527 usleep(trace->opts.initial_delay * 1000);
2528 perf_evlist__enable(evlist);
2531 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2532 evlist->threads->nr > 1 ||
2533 perf_evlist__first(evlist)->attr.inherit;
2536 * Now that we already used evsel->attr to ask the kernel to setup the
2537 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2538 * trace__resolve_callchain(), allowing per-event max-stack settings
2539 * to override an explicitely set --max-stack global setting.
2541 evlist__for_each_entry(evlist, evsel) {
2542 if (evsel__has_callchain(evsel) &&
2543 evsel->attr.sample_max_stack == 0)
2544 evsel->attr.sample_max_stack = trace->max_stack;
2547 before = trace->nr_events;
2549 for (i = 0; i < evlist->nr_mmaps; i++) {
2550 union perf_event *event;
2551 struct perf_mmap *md;
2553 md = &evlist->mmap[i];
2554 if (perf_mmap__read_init(md) < 0)
2557 while ((event = perf_mmap__read_event(md)) != NULL) {
2558 struct perf_sample sample;
2562 err = perf_evlist__parse_sample(evlist, event, &sample);
2564 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2568 trace__handle_event(trace, event, &sample);
2570 perf_mmap__consume(md);
2575 if (done && !draining) {
2576 perf_evlist__disable(evlist);
2580 perf_mmap__read_done(md);
2583 if (trace->nr_events == before) {
2584 int timeout = done ? 100 : -1;
2586 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2587 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2597 thread__zput(trace->current);
2599 perf_evlist__disable(evlist);
2603 trace__fprintf_thread_summary(trace, trace->output);
2605 if (trace->show_tool_stats) {
2606 fprintf(trace->output, "Stats:\n "
2607 " vfs_getname : %" PRIu64 "\n"
2608 " proc_getname: %" PRIu64 "\n",
2609 trace->stats.vfs_getname,
2610 trace->stats.proc_getname);
2615 trace__symbols__exit(trace);
2617 perf_evlist__delete(evlist);
2618 cgroup__put(trace->cgroup);
2619 trace->evlist = NULL;
2620 trace->live = false;
2623 char errbuf[BUFSIZ];
2625 out_error_sched_stat_runtime:
2626 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2629 out_error_raw_syscalls:
2630 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2634 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2638 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2641 fprintf(trace->output, "%s\n", errbuf);
2642 goto out_delete_evlist;
2644 out_error_apply_filters:
2645 fprintf(trace->output,
2646 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2647 evsel->filter, perf_evsel__name(evsel), errno,
2648 str_error_r(errno, errbuf, sizeof(errbuf)));
2649 goto out_delete_evlist;
2652 fprintf(trace->output, "Not enough memory to run!\n");
2653 goto out_delete_evlist;
2656 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2657 goto out_delete_evlist;
/*
 * Replay mode ('perf trace -i perf.data'): process a previously recorded
 * session instead of tracing live, reusing the same sample pretty-printers.
 *
 * Returns 0 on success, negative on error.
 */
2660 static int trace__replay(struct trace *trace)
/* Tracepoints that get dedicated handlers when present in the file. */
2662 const struct perf_evsel_str_handler handlers[] = {
2663 { "probe:vfs_getname", trace__vfs_getname, },
2665 struct perf_data data = {
2669 .mode = PERF_DATA_MODE_READ,
2670 .force = trace->force,
2672 struct perf_session *session;
2673 struct perf_evsel *evsel;
/* Wire up the generic perf_tool callbacks used by session processing. */
2676 trace->tool.sample = trace__process_sample;
2677 trace->tool.mmap = perf_event__process_mmap;
2678 trace->tool.mmap2 = perf_event__process_mmap2;
2679 trace->tool.comm = perf_event__process_comm;
2680 trace->tool.exit = perf_event__process_exit;
2681 trace->tool.fork = perf_event__process_fork;
2682 trace->tool.attr = perf_event__process_attr;
2683 trace->tool.tracing_data = perf_event__process_tracing_data;
2684 trace->tool.build_id = perf_event__process_build_id;
2685 trace->tool.namespaces = perf_event__process_namespaces;
/* Deliver samples in timestamp order so sys_enter/sys_exit pair up. */
2687 trace->tool.ordered_events = true;
2688 trace->tool.ordering_requires_timestamps = true;
2690 /* add tid to output */
2691 trace->multiple_threads = true;
2693 session = perf_session__new(&data, false, &trace->tool);
2694 if (session == NULL)
/* Restrict symbol resolution to the requested pid/tid lists, if any. */
2697 if (trace->opts.target.pid)
2698 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2700 if (trace->opts.target.tid)
2701 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2703 if (symbol__init(&session->header.env) < 0)
2706 trace->host = &session->machines.host;
2708 err = perf_session__set_tracepoints_handlers(session, handlers);
2712 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2713 "raw_syscalls:sys_enter");
2714 /* older kernels have syscalls tp versus raw_syscalls */
2716 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2717 "syscalls:sys_enter");
2720 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2721 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2722 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
/* Same dance for the exit side of the raw syscall tracepoints. */
2726 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2727 "raw_syscalls:sys_exit");
2729 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2730 "syscalls:sys_exit");
2732 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2733 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2734 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* Route page fault software events to the pgfault pretty-printer. */
2738 evlist__for_each_entry(session->evlist, evsel) {
2739 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2740 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2741 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2742 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2743 evsel->handler = trace__pgfault;
2748 err = perf_session__process_events(session);
2750 pr_err("Failed to process events, error %d", err);
2752 else if (trace->summary)
2753 trace__fprintf_thread_summary(trace, trace->output);
2756 perf_session__delete(session);
/*
 * Print the banner that precedes the per-thread summary table.
 * Returns the number of characters printed.
 */
2761 static size_t trace__fprintf_threads_header(FILE *fp)
2765 printed = fprintf(fp, "\n Summary of events:\n\n");
/*
 * rb_resort.h helper: a re-sorted red-black tree view over a thread's
 * per-syscall stats intlist, ordered by descending total time (msecs).
 * Each resorted entry caches the syscall id, its stats pointer and the
 * precomputed total.
 */
2770 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2771 struct stats *stats;
2776 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2777 struct stats *stats = source->priv;
2779 entry->syscall = source->i;
2780 entry->stats = stats;
/* total = nr_calls * average, converted from ns to msecs; 0 if no stats */
2781 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print the per-syscall statistics table (calls, total, min/avg/max,
 * stddev) for one thread, ordered by total elapsed time.
 * Returns the number of characters printed.
 */
2784 static size_t thread__dump_stats(struct thread_trace *ttrace,
2785 struct trace *trace, FILE *fp)
2790 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2792 if (syscall_stats == NULL)
2795 printed += fprintf(fp, "\n");
2797 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2798 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2799 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2801 resort_rb__for_each_entry(nd, syscall_stats) {
2802 struct stats *stats = syscall_stats_entry->stats;
/* stats are kept in nanoseconds; the table is printed in msecs */
2804 double min = (double)(stats->min) / NSEC_PER_MSEC;
2805 double max = (double)(stats->max) / NSEC_PER_MSEC;
2806 double avg = avg_stats(stats);
2808 u64 n = (u64) stats->n;
/* relative standard deviation, as a percentage of the average */
2810 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2811 avg /= NSEC_PER_MSEC;
2813 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2814 printed += fprintf(fp, " %-15s", sc->name);
2815 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2816 n, syscall_stats_entry->msecs, min, avg);
2817 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2821 resort_rb__delete(syscall_stats);
2822 printed += fprintf(fp, "\n\n");
/*
 * Print one thread's summary line (comm, tid, event count, share of all
 * events, fault counts, runtime) followed by its syscall stats table.
 * Returns the number of characters printed.
 */
2827 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2830 struct thread_trace *ttrace = thread__priv(thread);
/* this thread's share of all events seen, in percent */
2836 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2838 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2839 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2840 printed += fprintf(fp, "%.1f%%", ratio);
2842 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2844 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2846 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2847 else if (fputc('\n', fp) != EOF)
2850 printed += thread__dump_stats(ttrace, trace, fp);
2855 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2857 return ttrace ? ttrace->nr_events : 0;
/*
 * rb_resort.h helper: a re-sorted view over a machine's threads rbtree,
 * ordered by descending number of traced events per thread.
 */
2860 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2861 struct thread *thread;
2864 entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: one block per traced thread, busiest
 * threads first, walking every bucket of the host machine's thread table.
 * Returns the number of characters printed.
 */
2867 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2869 size_t printed = trace__fprintf_threads_header(fp);
2873 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
/* sort this bucket's threads by nr_events (see DEFINE_RESORT_RB above) */
2874 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2876 if (threads == NULL) {
2877 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2881 resort_rb__for_each_entry(nd, threads)
2882 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2884 resort_rb__delete(threads);
2889 static int trace__set_duration(const struct option *opt, const char *str,
2890 int unset __maybe_unused)
2892 struct trace *trace = opt->value;
2894 trace->duration_filter = atof(str);
/*
 * parse_options() callback for --filter-pids: parse a CSV list of pids
 * into trace->filter_pids, reserving slot 0 for our own pid so that
 * perf trace never traces itself.
 */
2898 static int trace__set_filter_pids(const struct option *opt, const char *str,
2899 int unset __maybe_unused)
2903 struct trace *trace = opt->value;
2905 * FIXME: introduce a intarray class, plain parse csv and create a
2906 * { int nr, int entries[] } struct...
2908 struct intlist *list = intlist__new(str);
/* +1 for our own pid in entries[0] */
2913 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2914 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2916 if (trace->filter_pids.entries == NULL)
2919 trace->filter_pids.entries[0] = getpid();
2921 for (i = 1; i < trace->filter_pids.nr; ++i)
2922 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2924 intlist__delete(list);
/*
 * Open the -o/--output file for writing, first rotating a pre-existing
 * non-empty file of that name to "<name>.old".
 * Returns 0 on success, -errno if the file could not be opened.
 *
 * NOTE(review): the rename() return value is ignored here, so a failed
 * rotation silently overwrites the old file — consider checking it.
 */
2930 static int trace__open_output(struct trace *trace, const char *filename)
2934 if (!stat(filename, &st) && st.st_size) {
2935 char oldname[PATH_MAX];
2937 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2939 rename(filename, oldname);
2942 trace->output = fopen(filename, "w");
2944 return trace->output == NULL ? -errno : 0;
/*
 * parse_options() callback for -F/--pf: translate "all"/"maj"/"min"
 * into the TRACE_PFMAJ/TRACE_PFMIN bits, OR-ing into the existing mask
 * so repeated options accumulate.
 */
2947 static int parse_pagefaults(const struct option *opt, const char *str,
2948 int unset __maybe_unused)
2950 int *trace_pgfaults = opt->value;
2952 if (strcmp(str, "all") == 0)
2953 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2954 else if (strcmp(str, "maj") == 0)
2955 *trace_pgfaults |= TRACE_PFMAJ;
2956 else if (strcmp(str, "min") == 0)
2957 *trace_pgfaults |= TRACE_PFMIN;
2964 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2966 struct perf_evsel *evsel;
2968 evlist__for_each_entry(evlist, evsel)
2969 evsel->handler = handler;
2973 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2974 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2975 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2977 * It'd be better to introduce a parse_options() variant that would return a
2978 * list with the terms it didn't match to an event...
/*
 * parse_options() callback for -e/--event and --expr: split a mixed CSV
 * of syscall names/globs/strace groups (-> lists[1], becomes the syscall
 * qualifier) and regular perf events (-> lists[0], handed off to
 * parse_events_option()).  See the XXX comment above for why this hack
 * exists.  Returns 0 on success, negative on error.
 */
2980 static int trace__parse_events_option(const struct option *opt, const char *str,
2981 int unset __maybe_unused)
2983 struct trace *trace = (struct trace *)opt->value;
2984 const char *s = str;
2985 char *sep = NULL, *lists[2] = { NULL, NULL, };
/* len bounds either list: no entry can be longer than the whole string */
2986 int len = strlen(str) + 1, err = -1, list, idx;
2987 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2988 char group_name[PATH_MAX];
2990 if (strace_groups_dir == NULL)
2995 trace->not_ev_qualifier = true;
2999 if ((sep = strchr(s, ',')) != NULL)
/* a syscall name or a glob matching at least one syscall? */
3003 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3004 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
/* not a syscall: maybe an strace-style group file on disk */
3007 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3008 if (access(group_name, R_OK) == 0)
3013 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3015 lists[list] = malloc(len);
3016 if (lists[list] == NULL)
3018 strcpy(lists[list], s);
/* lists[1] holds the syscall names/groups: build the qualifier strlist */
3028 if (lists[1] != NULL) {
3029 struct strlist_config slist_config = {
3030 .dirname = strace_groups_dir,
3033 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3034 if (trace->ev_qualifier == NULL) {
3035 fputs("Not enough memory to parse event qualifier", trace->output);
3039 if (trace__validate_ev_qualifier(trace))
3041 trace->trace_syscalls = true;
/* lists[0] holds regular events: feed them to the stock -e parser */
3047 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3048 "event selector. use 'perf list' to list available events",
3049 parse_events_option);
3050 err = parse_events_option(&o, lists[0], 0);
/*
 * parse_options() callback for -G/--cgroup.  Ordering matters (see the
 * big comment in trace__run()): if events were already parsed, the
 * cgroup applies only to those via parse_cgroups(); otherwise remember
 * it in trace->cgroup as the default for later-created events.
 */
3059 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3061 struct trace *trace = opt->value;
3063 if (!list_empty(&trace->evlist->entries))
3064 return parse_cgroups(opt, str, unset);
3066 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3071 int cmd_trace(int argc, const char **argv)
3073 const char *trace_usage[] = {
3074 "perf trace [<options>] [<command>]",
3075 "perf trace [<options>] -- <command> [<options>]",
3076 "perf trace record [<options>] [<command>]",
3077 "perf trace record [<options>] -- <command> [<options>]",
3080 struct trace trace = {
3089 .user_freq = UINT_MAX,
3090 .user_interval = ULLONG_MAX,
3091 .no_buffering = true,
3092 .mmap_pages = UINT_MAX,
3093 .proc_map_timeout = 500,
3097 .trace_syscalls = false,
3098 .kernel_syscallchains = false,
3099 .max_stack = UINT_MAX,
3101 const char *output_name = NULL;
3102 const struct option trace_options[] = {
3103 OPT_CALLBACK('e', "event", &trace, "event",
3104 "event/syscall selector. use 'perf list' to list available events",
3105 trace__parse_events_option),
3106 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3107 "show the thread COMM next to its id"),
3108 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3109 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3110 trace__parse_events_option),
3111 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3112 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3113 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3114 "trace events on existing process id"),
3115 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3116 "trace events on existing thread id"),
3117 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3118 "pids to filter (by the kernel)", trace__set_filter_pids),
3119 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3120 "system-wide collection from all CPUs"),
3121 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3122 "list of cpus to monitor"),
3123 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3124 "child tasks do not inherit counters"),
3125 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3126 "number of mmap data pages",
3127 perf_evlist__parse_mmap_pages),
3128 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3130 OPT_CALLBACK(0, "duration", &trace, "float",
3131 "show only events with duration > N.M ms",
3132 trace__set_duration),
3133 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3134 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3135 OPT_BOOLEAN('T', "time", &trace.full_time,
3136 "Show full timestamp, not time relative to first start"),
3137 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3138 "Show only syscalls that failed"),
3139 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3140 "Show only syscall summary with statistics"),
3141 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3142 "Show all syscalls and summary with statistics"),
3143 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3144 "Trace pagefaults", parse_pagefaults, "maj"),
3145 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3146 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3147 OPT_CALLBACK(0, "call-graph", &trace.opts,
3148 "record_mode[,record_size]", record_callchain_help,
3149 &record_parse_callchain_opt),
3150 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3151 "Show the kernel callchains on the syscall exit path"),
3152 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3153 "Set the minimum stack depth when parsing the callchain, "
3154 "anything below the specified depth will be ignored."),
3155 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3156 "Set the maximum stack depth when parsing the callchain, "
3157 "anything beyond the specified depth will be ignored. "
3158 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3159 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3160 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3161 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3162 "per thread proc mmap processing timeout in ms"),
3163 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3164 trace__parse_cgroups),
3165 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3166 "ms to wait before starting measurement after program "
3170 bool __maybe_unused max_stack_user_set = true;
3171 bool mmap_pages_user_set = true;
3172 const char * const trace_subcommands[] = { "record", NULL };
3176 signal(SIGSEGV, sighandler_dump_stack);
3177 signal(SIGFPE, sighandler_dump_stack);
3179 trace.evlist = perf_evlist__new();
3180 trace.sctbl = syscalltbl__new();
3182 if (trace.evlist == NULL || trace.sctbl == NULL) {
3183 pr_err("Not enough memory to run!\n");
3188 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3189 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3191 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3192 usage_with_options_msg(trace_usage, trace_options,
3193 "cgroup monitoring only available in system-wide mode");
3196 err = bpf__setup_stdout(trace.evlist);
3198 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3199 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3205 if (trace.trace_pgfaults) {
3206 trace.opts.sample_address = true;
3207 trace.opts.sample_time = true;
3210 if (trace.opts.mmap_pages == UINT_MAX)
3211 mmap_pages_user_set = false;
3213 if (trace.max_stack == UINT_MAX) {
3214 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3215 max_stack_user_set = false;
3218 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3219 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3220 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3224 if (callchain_param.enabled) {
3225 if (!mmap_pages_user_set && geteuid() == 0)
3226 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3228 symbol_conf.use_callchain = true;
3231 if (trace.evlist->nr_entries > 0)
3232 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3234 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3235 return trace__record(&trace, argc-1, &argv[1]);
3237 /* summary_only implies summary option, but don't overwrite summary if set */
3238 if (trace.summary_only)
3239 trace.summary = trace.summary_only;
3241 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3242 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3243 trace.trace_syscalls = true;
3246 if (output_name != NULL) {
3247 err = trace__open_output(&trace, output_name);
3249 perror("failed to create output file");
3254 err = target__validate(&trace.opts.target);
3256 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3257 fprintf(trace.output, "%s", bf);
3261 err = target__parse_uid(&trace.opts.target);
3263 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3264 fprintf(trace.output, "%s", bf);
3268 if (!argc && target__none(&trace.opts.target))
3269 trace.opts.target.system_wide = true;
3272 err = trace__replay(&trace);
3274 err = trace__run(&trace, argc, argv);
3277 if (output_name != NULL)
3278 fclose(trace.output);