// SPDX-License-Identifier: GPL-2.0
/*
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"
#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/zalloc.h>
struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	cpu_set_t		affinity_mask;
};
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
static bool switch_output_signal(struct record *rec)
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);

static bool switch_output_size(struct record *rec)
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);

static bool switch_output_time(struct record *rec)
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
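
/*
 * Editor's sketch of how the three predicates above map onto the
 * --switch-output modes (the values are illustrative, not defaults):
 *
 *   perf record --switch-output          rotate the output on SIGUSR2
 *   perf record --switch-output=1G       rotate after ~1GB has been written
 *   perf record --switch-output=30s      rotate every 30 seconds
 *
 * See switch_output_setup() below for how the mode is selected.
 */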
static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
			 void *bf, size_t size)
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");

	rec->bytes_written += size;

	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);
#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	rc = aio_write(cblock);
	} else if (errno != EAGAIN) {
		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)

	written = aio_ret = aio_return(cblock);
	if (aio_errno != EINTR)
		pr_err("failed to write perf data, error: %m\n");

	rem_size = cblock->aio_nbytes - written;

	cblock->aio_fildes = -1;
	/*
	 * md->refcount is incremented in record__aio_pushfn() for
	 * every aio write request started in record__aio_push(), so
	 * decrement it because the request is now complete.
	 */
	/*
	 * The aio write request may need to be restarted with the
	 * remainder if the kernel didn't write the whole chunk at once.
	 */
	rem_off = cblock->aio_offset + written;
	rem_buf = (void *)(cblock->aio_buf + written);
	record__aio_write(cblock, cblock->aio_fildes,
			  rem_buf, rem_size, rem_off);
static int record__aio_sync(struct perf_mmap *md, bool sync_all)
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */

	for (i = 0; i < md->aio.nr_cblocks; ++i) {
		if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
			/*
			 * The started aio write is not complete yet,
			 * so it has to be waited on before the
			 * next allocation.
			 */
			aiocb[i] = &cblocks[i];

	while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
		if (!(errno == EAGAIN || errno == EINTR))
			pr_err("failed to sync perf data, error: %m\n");
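
/*
 * Editor's summary of the AIO write path visible in this file:
 * record__aio_push() picks a free map->aio.data[] buffer via
 * record__aio_sync(), record__aio_pushfn() copies (and optionally
 * compresses) the mmap chunk into it, record__aio_write() queues the
 * POSIX aio request, and record__aio_complete() reaps it, restarting
 * partial writes with the remainder.
 */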
static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
	struct record_aio *aio = to;

	/*
	 * The map->base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, via perf_mmap__consume() called from
	 * perf_mmap__push().
	 *
	 * That lets the kernel proceed with storing more profiling data in
	 * its buffer earlier than the other per-cpu kernel buffers are handled.
	 *
	 * Copying is done in two steps when the chunk of profiling data
	 * crosses the upper bound of the kernel buffer: first the part from
	 * map->start up to the upper bound, then the remainder from the
	 * beginning of the kernel buffer up to the end of the data chunk.
	 */
	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     perf_mmap__mmap_len(map) - aio->size,
		memcpy(aio->data + aio->size, buf, size);

	/*
	 * Increment map->refcount to guard the map->aio.data[] buffer from
	 * premature deallocation, because the map object can be released
	 * before the aio write request started on the map->aio.data[]
	 * buffer completes.
	 *
	 * perf_mmap__put() is done at record__aio_complete() after the
	 * started aio request completes, or at record__aio_push() if the
	 * request failed to start.
	 */
static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */
	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */

	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	/*
	 * Decrement the map->refcount taken in record__aio_pushfn() if the
	 * record__aio_write() operation failed to start; otherwise
	 * map->refcount is decremented in record__aio_complete() after the
	 * aio write operation finishes successfully.
	 */
static off_t record__aio_get_pos(int trace_fd)
	return lseek(trace_fd, 0, SEEK_CUR);

static void record__aio_set_pos(int trace_fd, off_t pos)
	lseek(trace_fd, pos, SEEK_SET);
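
/*
 * Editor's note: POSIX aio writes carry an explicit aio_offset instead of
 * advancing the file position, so the trace file offset is sampled with
 * record__aio_get_pos() before a batch of pushes and restored with
 * record__aio_set_pos() afterwards (see record__mmap_read_evlist()).
 */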
static void record__aio_mmap_read_sync(struct record *rec)
	struct evlist *evlist = rec->evlist;
	struct perf_mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))

	for (i = 0; i < evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &maps[i];

			record__aio_sync(map, true);

static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
	struct record_opts *opts = (struct record_opts *)opt->value;

		opts->nr_cblocks = 0;
		opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
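
/*
 * Example (editor's sketch): "perf record --aio" uses nr_cblocks_default
 * (one control block), while "perf record --aio=4" uses the maximum of
 * four in-flight aio write requests per mmap buffer; larger values are
 * capped to nr_cblocks_max in cmd_record().
 */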
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
			    off_t *off __maybe_unused)

static off_t record__aio_get_pos(int trace_fd __maybe_unused)

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)

static int record__aio_enabled(struct record *rec)
	return rec->opts.nr_cblocks > 0;
#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },

	opts->mmap_flush = parse_tag_value(str, tags);
	if (opts->mmap_flush == (int)-1)
		opts->mmap_flush = strtol(str, NULL, 0);

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;
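
/*
 * Example (editor's sketch): "perf record --mmap-flush=512K" makes a mmap
 * buffer eligible for flushing only once at least 512KiB have accumulated;
 * values above the mmap buffer size are capped to it, and 0 falls back to
 * MMAP_FLUSH_DEFAULT.
 */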
#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
	struct record_opts *opts = opt->value;

		opts->comp_level = 0;
		opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;

static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
	return rec->opts.comp_level > 0;
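
/*
 * Example (editor's sketch): "perf record -z" compresses trace data with
 * the default zstd level 1; "perf record -z 22" trades speed for the
 * greatest compression. comp_level == 0 disables compression entirely.
 */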
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
	struct record *rec = container_of(tool, struct record, tool);

	return record__write(rec, NULL, event, event->header.size);

static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);

	return record__write(rec, map, bf, size);
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)

static void sigsegv_handler(int sig)
	perf_hooks__recover();
	sighandler_dump_stack(sig);

static void record__sig_exit(void)
	signal(signr, SIG_DFL);
#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct perf_mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;

	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
		int fd = perf_data__fd(data);

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);
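
/*
 * Worked example of the padding math above (editor's): for
 * len1 + len2 == 13, (13 & 7) == 5, so padding == 8 - 5 == 3 bytes,
 * keeping AUX data records 8-byte aligned in the output file.
 */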
static int record__auxtrace_mmap_read(struct record *rec,
				      struct perf_mmap *map)
	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct perf_mmap *map)
	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);

static int record__auxtrace_read_snapshot_all(struct record *rec)
	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		trigger_ready(&auxtrace_snapshot_trigger);

static int record__auxtrace_snapshot_exit(struct record *rec)
	if (trigger_is_error(&auxtrace_snapshot_trigger))

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))

static int record__auxtrace_init(struct record *rec)
	rec->itr = auxtrace_record__init(rec->evlist, &err);

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);

	return auxtrace_parse_filters(rec->evlist);
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct perf_mmap *map __maybe_unused)

void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)

int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)

int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)

static int record__auxtrace_init(struct record *rec __maybe_unused)
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
	struct record_opts *opts = &rec->opts;

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
		pr_err("failed to mmap with %d (%s)\n", errno,
		       str_error_r(errno, msg, sizeof(msg)));
static int record__mmap(struct record *rec)
	return record__mmap_evlist(rec, rec->evlist);

static int record__open(struct record *rec)
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked for by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))

		pos = perf_evlist__first(evlist);
		pos = perf_evlist__last(evlist);
		pos->core.attr.enable_on_exec = 1;

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				ui__warning("%s\n", msg);
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
				pos = perf_evlist__reset_weak_group(evlist, pos);
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
		pos->supported = true;

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));

	rc = record__mmap(rec);

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)

	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);

static int process_buildids(struct record *rec)
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)

	/*
	 * During this process, it'll load the kernel map and replace the
	 * dso->long_name with the real pathname it found. In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so there is no need to process samples. But if timestamp_boundary
	 * is enabled, it still needs to walk all samples to get the
	 * timestamps of the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
	struct perf_tool *tool = data;

	/*
	 * As for the guest kernel, when processing the record & report
	 * subcommands we arrange the module mmaps prior to the guest kernel
	 * mmap and trigger a preload of the dso, because by default guest
	 * module symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
	 * first address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
static size_t process_comp_header(void *record, size_t increment)
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

		event->header.size += increment;

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed  += compressed;
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
	u64 bytes_written = rec->bytes_written;
	struct perf_mmap *maps;
	int trace_fd = rec->data.file.fd;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &maps[i];

			record__adjust_affinity(rec, map);

			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
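
/*
 * Editor's note: PERF_RECORD_FINISHED_ROUND is a synthetic event that tells
 * the report side it may sort and flush all events buffered so far; writing
 * it only when bytes_written advanced avoids emitting empty rounds.
 */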
static int record__mmap_read_all(struct record *rec, bool synch)
	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);

static void record__init_features(struct record *rec)
	struct perf_session *session = rec->session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
static void
record__finish_output(struct record *rec)
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	perf_session__write_header(rec->session, rec->evlist, fd, true);

static int record__synthesize_workload(struct record *rec, bool tail)
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						process_synthesized_event,
						&rec->session->machines.host,
						rec->opts.sample_address);
	perf_thread_map__put(thread_map);
static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
	struct perf_data *data = &rec->data;

	/* Same size as "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
		pr_err("Failed to get current timestamp\n");

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;

		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		rec->switch_output.filenames[n] = new_filename;

	/* Output tracking events */
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events, because there's no thread_map
		 * in the evlist. That causes the newly created perf.data to
		 * lack map and comm information.
		 * Create a fake thread_map and call
		 * perf_event__synthesize_thread_map() directly for those
		 * events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
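
/*
 * Example (editor's sketch): with --switch-output or --timestamp-filename
 * the data file is renamed using a 16-character timestamp, e.g.
 * perf.data.2015122520103046, matching the "InvalidTimestamp" placeholder
 * size above.
 */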
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload() will send a SIGUSR1 if the fork fails,
 * since we asked for it by setting want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
	workload_exec_errno = info->si_value.sival_int;

static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
	if (evlist->mmap && evlist->mmap[0].base)
		return evlist->mmap[0].base;
	if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
		return evlist->overwrite_mmap[0].base;

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
static int record__synthesize(struct record *rec, bool tail)
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);

	if (rec->opts.tail_synthesize != tail)

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
			pr_err("Couldn't synthesize attrs.\n");

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
			pr_err("Couldn't synthesize features.\n");

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME: err <= 0 here actually means that there
			 * were no tracepoints, so it's not really an error,
			 * just that we don't need to synthesize anything.
			 * We really have to return this more properly and
			 * also propagate errors that currently call die().
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
				pr_err("Couldn't record tracing data.\n");
			rec->bytes_written += err;

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
							   session, process_synthesized_event);

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
			  "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
			  "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
			  "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
			  "Check /proc/modules permission or run as root.\n");

		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);

	err = perf_event__synthesize_extra_attr(&rec->tool,
						process_synthesized_event,

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
		pr_err("Couldn't synthesize thread map.\n");

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
		pr_err("Couldn't synthesize cpu map.\n");

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
		pr_warning("Couldn't synthesize bpf events.\n");

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    process_synthesized_event, opts->sample_address,
static int __cmd_record(struct record *rec, int argc, const char **argv)
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
		signal(SIGUSR2, SIG_IGN);

	session = perf_session__new(data, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
					    argv, data->is_pipe,
					    workload_exec_failed_signal);
		pr_err("Couldn't run the workload!\n");
		goto out_delete_session;

	/*
	 * If we have just a single event and are sending data through a
	 * pipe, we need to force ID allocation, because we synthesize the
	 * event name through the pipe and need the ID for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {

	session->header.env.comp_mmap_len = session->evlist->mmap_len;

	err = bpf__apply_obj_config();
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",

	/*
	 * Normally perf_session__new() would do this, but it doesn't have
	 * the evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		err = perf_session__write_header(session, rec->evlist, fd, false);

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;

	err = record__synthesize(rec, false);

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

		struct machine *machine = &session->machines.host;
		union perf_event *event;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {

		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script cannot
		 * see a correct process name for those events.
		 * Synthesize a COMM event to prevent that.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {

		/*
		 * Synthesize a NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,

		perf_evlist__start_workload(rec->evlist);

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
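	/*
	 * Illustrative usage (editor's sketch, values are examples rather
	 * than defaults):
	 *   perf record -D 500 -- ./workload
	 * waits 500ms after the workload starts before enabling the real
	 * events; the dummy event added in record__open() tracks
	 * PERF_RECORD_MMAP during that window.
	 */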
	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap() ensures we never convert
		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been
			 * collected, so bkw_mmap_state should be set to
			 * BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 was raised after or during
			 * record__mmap_read_all(), it didn't collect data
			 * from the overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",

			fd = record__switch_output(rec, false);
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);

		if (hits == rec->samples) {
			if (done || draining)
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate the error only if there is one; ignore
			 * a positive number of returned events and
			 * interrupt errors.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)

	/*
	 * When perf is starting the traced process, at the end the events
	 * die with the process and we wait for that. Thus there is no need
	 * to disable the events in this case.
	 */
	if (done && !disabled && !target__none(&opts->target)) {
		trigger_off(&auxtrace_snapshot_trigger);
		evlist__disable(rec->evlist);

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		pr_err("Workload failed: %s\n", emsg);
	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred / (float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;

	if (!child_finished)
		kill(rec->evlist->workload.pid, SIGTERM);

	else if (WIFEXITED(exit_status))
		status = WEXITSTATUS(exit_status);
	else if (WIFSIGNALED(exit_status))
		signr = WTERMSIG(exit_status);

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */

	if (!rec->timestamp_filename) {
		record__finish_output(rec);
		fd = record__switch_output(rec, true);
			goto out_delete_session;

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
			fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
				rec->session->bytes_transferred / 1024.0 / 1024.0,
		fprintf(stderr, " ]\n");

	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
static void callchain_debug(struct callchain_param *callchain)
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);

int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
	callchain->enabled = !unset;

	/* --no-call-graph */
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");

	ret = parse_callchain_record_opt(arg, callchain);
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
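
/*
 * Example (editor's sketch): "perf record --call-graph dwarf,8192" selects
 * DWARF unwinding with an 8192-byte stack dump per sample and, per the
 * code above, implies sample_address (what -d/--data enables);
 * "--call-graph fp" is the default frame-pointer mode.
 */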
int record_parse_callchain_opt(const struct option *opt,
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
static int perf_record_config(const char *var, const char *value, void *cb)
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;

	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
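
/*
 * Example ~/.perfconfig snippet handled above (editor's sketch):
 *
 *   [record]
 *	build-id = cache      # or no-cache / skip
 *	call-graph = dwarf    # forwarded to call-graph.record-mode
 *	aio = 4               # nr_cblocks, HAVE_AIO_SUPPORT only
 */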
struct clockid_map {

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }

/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#define CLOCK_TAI 11

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
	struct timespec res;

	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

static int parse_clockid(const struct option *opt, const char *str, int unset)
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

		opts->use_clockid = 0;

	/* no setting it twice */
	if (opts->use_clockid)

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
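
/*
 * Example (editor's sketch): "-k monotonic_raw", "-k CLOCK_MONOTONIC_RAW"
 * (the CLOCK_ prefix is stripped above) and "-k 4" (the raw clockid
 * number) all select CLOCK_MONOTONIC_RAW.
 */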
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

static int record__parse_mmap_pages(const struct option *opt,
				    int unset __maybe_unused)
	struct record_opts *opts = opt->value;
	unsigned int mmap_pages;

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
	opts->mmap_pages = mmap_pages;

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);

	opts->auxtrace_mmap_pages = mmap_pages;
static void switch_output_size_warn(struct record *rec)
	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	if (s->size < wakeup_size) {

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size is lower than "
			   "the wakeup kernel buffer size (%s); "
			   "expect bigger perf.data sizes\n", buf);

static int switch_output_setup(struct record *rec)
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },

	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },

	if (!strcmp(s->str, "signal")) {
		pr_debug("switch-output with SIGUSR2 signal\n");

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		pr_debug("switch-output with %s size threshold\n", s->str);

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",

	rec->timestamp_filename = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",

const char * const *record_usage = __record_usage;

/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * record__new(), because we need access to it in record__exit(), which is
 * called after cmd_record() exits; but since record_options needs to be
 * accessible to builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.target		     = {
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop
 * messing with it and switch to using the library functions in perf_evlist
 * that came from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc., instead of fork+exec'ing
 * 'perf record', using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),
	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
			    "sample selected machine registers on interrupt,"
			    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
			    "sample selected machine registers in user space,"
			    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespace events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			      &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			      "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		    "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
			    &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),

struct option *record_options = __record_options;
int cmd_record(int argc, const char **argv)
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);

	CPU_ZERO(&rec->affinity_mask);
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)

	err = perf_config(perf_record_config, rec);

	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);

	perf_quiet_option();
	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
						      sizeof(char *));
		if (!rec->switch_output.filenames)

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;
	err = record__auxtrace_init(rec);

	err = bpf__setup_stdout(rec->evlist);
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable build-id
		 * generation by default to reduce the data-file-switching
		 * overhead. Still generate build-ids if they are explicitly
		 * required using:
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *		--no-no-buildid-cache
		 *
		 * The following code is equivalent to:
		 *
		 *  if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *      (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *		disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}
	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);

	err = target__parse_uid(&rec->opts.target);
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

	/* Enable ignoring missing threads when the -u/-p option is given. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);

	/*
	 * We take all buildids when the file contains AUX area tracing data
	 * because we do not decode the trace, as that would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);

	evlist__delete(rec->evlist);
	auxtrace_record__free(rec->itr);
static void snapshot_sig_handler(int sig __maybe_unused)
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);

static void alarm_sig_handler(int sig __maybe_unused)
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);