// SPDX-License-Identifier: GPL-2.0
/*
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"

#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"

#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		**filenames;
	int		 num_files;
	int		 cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	cpu_set_t		affinity_mask;
};
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}
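
/*
 * Note: the three helpers above map to the three --switch-output modes:
 * on SIGUSR2 (signal), when bytes_written crosses a size threshold (size),
 * or when an alarm(2) period expires (time).  All of them additionally
 * require that switch_output_trigger has been armed.
 */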
static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);
#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}
static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;
	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push(), so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(md);
		rc = 1;
	} else {
		/*
		 * The aio write request may require a restart with the
		 * remainder if the kernel didn't write the whole chunk
		 * at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}
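
/*
 * Example: if a 64KiB chunk was queued and the kernel completed only
 * 40KiB of it, rem_size is 24KiB and the same control block is
 * re-queued at aio_offset + 40KiB for the remaining bytes.
 */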
static int record__aio_sync(struct perf_mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * A started aio write is not complete yet,
				 * so it has to be waited on before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of the data from map->start till the upper bound and then the
	 * remainder from the beginning of the kernel buffer till the end of
	 * the data chunk.
	 */
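
	/*
	 * Example of the two-step copy: with a 64KiB kernel buffer, a 12KiB
	 * chunk that starts 8KiB below the upper bound is copied as 8KiB
	 * from map->start up to the buffer end, then 4KiB from the buffer
	 * start.
	 */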
	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     perf_mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard the map->aio.data[] buffer
		 * from premature deallocation, because the map object can be
		 * released earlier than the aio write request started on
		 * the map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(map);
	}

	aio->size += size;

	return size;
}
static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till the map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement the map->refcount incremented in record__aio_pushfn()
		 * back if the record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after the
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(map);
	}

	return ret;
}
static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
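
/*
 * Note: aio writes are positioned (pwrite-like) and do not advance the
 * file offset, so the trace file position is tracked explicitly with
 * record__aio_get_pos()/record__aio_set_pos() around each flush.
 */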
static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct perf_mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &maps[i];

		if (map->base)
			record__aio_sync(map, true);
	}
}

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}
#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}
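
/*
 * Example: --mmap-flush=48K requests a 48KiB minimum before data is moved
 * off an mmap buffer and --mmap-flush=1024 a 1024-byte one; the parsed
 * value is clamped to flush_max derived from the mmap buffer size above.
 */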
#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);

	return record__write(rec, NULL, event, event->header.size);
}

static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
		bf   = map->data;
	}

	rec->samples++;
	return record__write(rec, map, bf, size);
}
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}
#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct perf_mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;
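
	/*
	 * Example: len1 + len2 == 13 gives padding == 3, so 16 bytes of
	 * payload are written in total and the stream stays 8-byte aligned.
	 */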
	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}
static int record__auxtrace_mmap_read(struct record *rec,
				      struct perf_mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct perf_mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}

static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}
#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct perf_mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked for by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		/* Disable tracking of mmaps on the lead event. */
		pos = perf_evlist__first(evlist);
		pos->tracking = 0;
		/* Set up the dummy event. */
		pos = perf_evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	rec->samples++;
	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load kernel maps and replace the
	 * dso->long_name with a real pathname it found. In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk all samples to get the timestamps of the
	 * first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	/*
	 * As for the guest kernel when processing the subcommands record and
	 * report, we arrange the module mmap prior to the guest kernel mmap
	 * and trigger a preload dso, because default guest module symbols
	 * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX.
	 * This method is used to avoid missing symbols when the first
	 * address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
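
/*
 * A FINISHED_ROUND event is emitted after each pass over all mmap buffers
 * (see record__mmap_read_evlist() below); on the report side it bounds how
 * long events must be buffered before they can be sorted by timestamp and
 * flushed.
 */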
static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
	}
}
static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed  += compressed;

	return compressed;
}
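
/*
 * bytes_transferred accumulates the uncompressed input and bytes_compressed
 * the compressed output; their quotient is reported as the compression
 * ratio at the end of the session (see __cmd_record() below).
 */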
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct perf_mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->nr_mmaps; i++) {
		u64 flush = 0;
		struct perf_mmap *map = &maps[i];

		if (map->base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->flush;
				map->flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}
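
/*
 * Note: reading is done in two passes: the regular mmaps first, then the
 * overwrite (backward) mmaps, which record__mmap_read_evlist() only drains
 * when their state is BKW_MMAP_DATA_PENDING.
 */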
static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}
static void
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return;
}
static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						 process_synthesized_event,
						 &rec->session->machines.host,
						 rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}
static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist, which means the newly created perf.data
		 * won't contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
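
/*
 * Note: the failing child is assumed to report its errno through the
 * signal's si_value.sival_int (e.g. via sigqueue()), which is why the
 * handler reads it back here instead of using the plain signal number.
 */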
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

int __weak
perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
			    struct perf_tool *tool __maybe_unused,
			    perf_event__handler_t process __maybe_unused,
			    struct machine *machine __maybe_unused)
{
	return 0;
}

static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].base)
			return evlist->mmap[0].base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
			return evlist->overwrite_mmap[0].base;
	}
	return NULL;
}
static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						 NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    process_synthesized_event, opts->sample_address,
					    1);
out:
	return err;
}
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}
	session = perf_session__new(data, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;
	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->mmap_len;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;
	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize a COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize a NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}
	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap() ensures we never convert
		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}
		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}
		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from the
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}
		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate the error only if there is one. Ignore a
			 * positive number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}
		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);
	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}
	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;
	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}
	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
				rec->session->bytes_transferred / 1024.0 / 1024.0,
				ratio);
		}
		fprintf(stderr, " ]\n");
	}
out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
	return status;
}
static void callchain_debug(struct callchain_param *callchain)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);
}
int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
{
	int ret;
	callchain->enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, callchain);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
	}

	return ret;
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}
int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
	return 0;
}
static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
	}
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
	}
#endif

	return 0;
}
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }

/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
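
/*
 * Example: 'perf record -k monotonic_raw' and 'perf record -k
 * CLOCK_MONOTONIC_RAW' both resolve to CLOCK_MONOTONIC_RAW through the
 * table above (see parse_clockid() below).
 */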
static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}
static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}
static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}
static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s), "
			   "expect bigger perf.data sizes\n", buf);
	}
}
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled              = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}
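
/*
 * Example: --switch-output=signal rotates the output file on SIGUSR2,
 * --switch-output=100M after roughly 100MiB of recorded data, and
 * --switch-output=30s every 30 seconds (driven by alarm(), see
 * cmd_record() below).
 */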
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		      record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		  "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
			  "signal"),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_END()
};

struct option *record_options = __record_options;
int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	CPU_ZERO(&rec->affinity_mask);
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");
	}

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(sizeof(char *),
						      rec->switch_output.num_files);
		if (!rec->switch_output.filenames)
			return -EINVAL;
	}

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
			 errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}

	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace, because it would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}
static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}

static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}