// SPDX-License-Identifier: GPL-2.0
/*
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "util/build-id.h"
#include <subcmd/parse-options.h>
#include "util/parse-events.h"
#include "util/config.h"
#include "util/callchain.h"
#include "util/cgroup.h"
#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
#include "util/time-utils.h"
#include "util/units.h"
#include "util/bpf-event.h"
#include <linux/time64.h>
#include <linux/zalloc.h>
struct switch_output {
	bool		 enabled;
	bool		 signal;
	unsigned long	 size;
	unsigned long	 time;
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};

struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist		*evlist;
	struct perf_session	*session;
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	struct switch_output	switch_output;
	unsigned long long	samples;
	cpu_set_t		affinity_mask;
};
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
static bool switch_output_signal(struct record *rec)
{
	return rec->switch_output.signal &&
	       trigger_is_ready(&switch_output_trigger);
}

static bool switch_output_size(struct record *rec)
{
	return rec->switch_output.size &&
	       trigger_is_ready(&switch_output_trigger) &&
	       (rec->bytes_written >= rec->switch_output.size);
}

static bool switch_output_time(struct record *rec)
{
	return rec->switch_output.time &&
	       trigger_is_ready(&switch_output_trigger);
}
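/*
 * Usage sketch (illustrative command lines) for the three predicates
 * above, matching the three forms of --switch-output:
 *
 *   perf record --switch-output          # rotate on SIGUSR2 (signal)
 *   perf record --switch-output=1G       # rotate once 1GB is written (size)
 *   perf record --switch-output=30s      # rotate every 30 seconds (time)
 *
 * All three are additionally gated on switch_output_trigger being ready.
 */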
static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
			 void *bf, size_t size)
{
	struct perf_data_file *file = &rec->session->data->file;

	if (perf_data_file__write(file, bf, size) < 0) {
		pr_err("failed to write perf data, error: %m\n");
		return -1;
	}

	rec->bytes_written += size;
	if (switch_output_size(rec))
		trigger_hit(&switch_output_trigger);

	return 0;
}
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size);
#ifdef HAVE_AIO_SUPPORT
static int record__aio_write(struct aiocb *cblock, int trace_fd,
			     void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}
static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;
	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push(), so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(md);
	} else {
		/*
		 * An aio write request may require a restart with the
		 * remainder if the kernel didn't write the whole chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
	}
}
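/*
 * Worked example for the remainder path above (illustrative numbers):
 * a request queued with aio_nbytes = 512 at aio_offset = 4096 for which
 * the kernel reports written = 384 leaves rem_size = 512 - 384 = 128,
 * so the write is restarted at rem_off = 4096 + 384 = 4480 with rem_buf
 * pointing 384 bytes into the original buffer.
 */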
static int record__aio_sync(struct perf_mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * The started aio write is not complete yet,
				 * so it has to be waited on before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
struct record_aio {
	struct record	*rec;
	void		*data;
	size_t		size;
};

static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * The map->base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of the data from map->start till the upper bound and then the
	 * remainder from the beginning of the kernel buffer till the end of
	 * the data chunk.
	 */
	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     perf_mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	/*
	 * Increment map->refcount to guard the map->aio.data[] buffer
	 * from premature deallocation, because the map object can be
	 * released earlier than the aio write request started on the
	 * map->aio.data[] buffer is complete.
	 *
	 * perf_mmap__put() is done at record__aio_complete()
	 * after the started aio request completes, or at record__aio_push()
	 * if the request failed to start.
	 */
	perf_mmap__get(map);

	aio->size += size;

	return size;
}
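/*
 * Illustrative sketch (not part of perf) of the two-step copy described
 * above: a chunk of 'chunk' bytes starting at offset 'head' in a ring
 * buffer of 'mmap_len' bytes may wrap, so it is copied in up to two
 * steps. All names here are hypothetical.
 */
static void __maybe_unused example_two_step_copy(void *dst, const void *ring,
						 size_t mmap_len, size_t head,
						 size_t chunk)
{
	size_t first = chunk;

	if (head + chunk > mmap_len)
		first = mmap_len - head;	/* step one: up to the upper bound */

	memcpy(dst, (const char *)ring + head, first);
	if (first != chunk)			/* step two: the remainder from the start */
		memcpy((char *)dst + first, ring, chunk - first);
}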
static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */
	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];

	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount, incremented in record__aio_pushfn(),
		 * back if the record__aio_write() operation failed to start;
		 * otherwise map->refcount is decremented in record__aio_complete()
		 * after the aio write operation finishes successfully.
		 */
		perf_mmap__put(map);
	}

	return ret;
}
static off_t record__aio_get_pos(int trace_fd)
{
	return lseek(trace_fd, 0, SEEK_CUR);
}

static void record__aio_set_pos(int trace_fd, off_t pos)
{
	lseek(trace_fd, pos, SEEK_SET);
}
static void record__aio_mmap_read_sync(struct record *rec)
{
	int i;
	struct evlist *evlist = rec->evlist;
	struct perf_mmap *maps = evlist->mmap;

	if (!record__aio_enabled(rec))
		return;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &maps[i];

		if (map->base)
			record__aio_sync(map, true);
	}
}
static int nr_cblocks_default = 1;
static int nr_cblocks_max = 4;

static int record__aio_parse(const struct option *opt,
			     const char *str,
			     int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset) {
		opts->nr_cblocks = 0;
	} else {
		if (str)
			opts->nr_cblocks = strtol(str, NULL, 0);
		if (!opts->nr_cblocks)
			opts->nr_cblocks = nr_cblocks_default;
	}

	return 0;
}
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif

static int record__aio_enabled(struct record *rec)
{
	return rec->opts.nr_cblocks > 0;
}
#define MMAP_FLUSH_DEFAULT 1
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}
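/*
 * Example (illustrative): --mmap-flush=16M parses through the 'M' tag to
 * 16 * (1 << 20) = 16777216 bytes, while a plain --mmap-flush=1024 falls
 * through parse_tag_value() to strtol() and yields 1024 bytes. The result
 * is then capped at flush_max, which is derived from the mmap buffer size.
 */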
#ifdef HAVE_ZSTD_SUPPORT
static unsigned int comp_level_default = 1;

static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
static unsigned int comp_level_max = 22;

static int record__comp_enabled(struct record *rec)
{
	return rec->opts.comp_level > 0;
}
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);

	return record__write(rec, NULL, event, event->header.size);
}
static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
		bf   = map->data;
	}

	return record__write(rec, map, bf, size);
}
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
}

static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}

static void record__sig_exit(void)
{
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	raise(signr);
}
#ifdef HAVE_AUXTRACE_SUPPORT

static int record__process_auxtrace(struct perf_tool *tool,
				    struct perf_mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}
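/*
 * Worked example for the padding math above: for len1 + len2 = 13 bytes
 * of AUX data, (13 & 7) = 5, so padding = 8 - 5 = 3 and the payload is
 * written out as 16 bytes, keeping records 8-byte aligned.
 */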
static int record__auxtrace_mmap_read(struct record *rec,
				      struct perf_mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
				  record__process_auxtrace);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}

static int record__auxtrace_mmap_read_snapshot(struct record *rec,
					       struct perf_mmap *map)
{
	int ret;

	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
					   record__process_auxtrace,
					   rec->opts.auxtrace_snapshot_size);
	if (ret < 0)
		return ret;

	if (ret)
		rec->samples++;

	return 0;
}
static int record__auxtrace_read_snapshot_all(struct record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		struct perf_mmap *map = &rec->evlist->mmap[i];

		if (!map->auxtrace_mmap.base)
			continue;

		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}
out:
	return rc;
}

static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}
static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}

static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}
#else

static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct perf_mmap *map __maybe_unused)
{
	return 0;
}

static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}

static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked for by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		pos = perf_evlist__first(evlist);
		pos->tracking = 0;
		pos = perf_evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}
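	/*
	 * Illustrative usage (hypothetical numbers): with
	 * 'perf record --delay=500 -- ./workload', the dummy event added
	 * above tracks PERF_RECORD_MMAP from exec() onwards, while the real
	 * events are only enabled after the 500ms delay in __cmd_record().
	 */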
	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
static int process_sample_event(struct perf_tool *tool,
				union perf_event *event,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine)
{
	struct record *rec = container_of(tool, struct record, tool);

	if (rec->evlist->first_sample_time == 0)
		rec->evlist->first_sample_time = sample->time;

	rec->evlist->last_sample_time = sample->time;

	if (rec->buildid_all)
		return 0;

	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
}
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load the kernel map and replace the
	 * dso->long_name with a real pathname it found. In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk all samples to get the timestamps of the
	 * first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	/*
	 * As for the guest kernel when processing the record & report
	 * subcommands, we arrange the module mmap prior to the guest kernel
	 * mmap and trigger a preload of the dso, because default guest module
	 * symbols are loaded from the guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This method is used to avoid missing symbols
	 * when the first address is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
	}
}
static size_t process_comp_header(void *record, size_t increment)
{
	struct perf_record_compressed *event = record;
	size_t size = sizeof(*event);

	if (increment) {
		event->header.size += increment;
		return increment;
	}

	event->header.type = PERF_RECORD_COMPRESSED;
	event->header.size = size;

	return size;
}

static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
			    void *src, size_t src_size)
{
	size_t compressed;
	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;

	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
						     max_record_size, process_comp_header);

	session->bytes_transferred += src_size;
	session->bytes_compressed  += compressed;

	return compressed;
}
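/*
 * Ratio sketch (illustrative numbers): if 64KB of raw records compress
 * into PERF_RECORD_COMPRESSED payloads totalling 16KB, bytes_transferred
 * grows by 65536 and bytes_compressed by 16384, and __cmd_record() later
 * reports a compression ratio of 65536 / 16384 = 4.0.
 */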
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct perf_mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->nr_mmaps; i++) {
		u64 flush = 0;
		struct perf_mmap *map = &maps[i];

		if (map->base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				flush = map->flush;
				map->flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
}
static void record__init_features(struct record *rec)
{
	struct perf_session *session = rec->session;
	int feat;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&rec->evlist->core.entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->opts.full_auxtrace)
		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);

	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
		perf_header__clear_feat(&session->header, HEADER_CLOCKID);

	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
	if (!record__comp_enabled(rec))
		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);

	perf_header__clear_feat(&session->header, HEADER_STAT);
}
static int
record__finish_output(struct record *rec)
{
	struct perf_data *data = &rec->data;
	int fd = perf_data__fd(data);

	if (data->is_pipe)
		return 0;

	rec->session->header.data_size += rec->bytes_written;
	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);

	if (!rec->no_buildid) {
		process_buildids(rec);

		if (rec->buildid_all)
			dsos__hit_all(rec->session);
	}
	perf_session__write_header(rec->session, rec->evlist, fd, true);

	return 0;
}
static int record__synthesize_workload(struct record *rec, bool tail)
{
	int err;
	struct perf_thread_map *thread_map;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
	if (thread_map == NULL)
		return -1;

	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
						 process_synthesized_event,
						 &rec->session->machines.host,
						 rec->opts.sample_address);
	perf_thread_map__put(thread_map);
	return err;
}
static int record__synthesize(struct record *rec, bool tail);

static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}
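	/*
	 * Rotation sketch (illustrative): with --switch-output --switch-max-files=3,
	 * cur_file cycles 0 -> 1 -> 2 -> 0. Once all three slots have been
	 * used, the oldest perf.data.<timestamp> file is removed and its
	 * slot reused, so at most three output files are kept on disk.
	 */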
	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist, which causes the newly created perf.data
		 * to contain no map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked for it by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);

int __weak
perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
			    struct perf_tool *tool __maybe_unused,
			    perf_event__handler_t process __maybe_unused,
			    struct machine *machine __maybe_unused)
{
	return 0;
}
static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist *evlist)
{
	if (evlist) {
		if (evlist->mmap && evlist->mmap[0].base)
			return evlist->mmap[0].base;
		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
			return evlist->overwrite_mmap[0].base;
	}
	return NULL;
}

static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
	const struct perf_event_mmap_page *pc;

	pc = perf_evlist__pick_pc(rec->evlist);
	if (pc)
		return pc;
	return NULL;
}
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int fd = perf_data__fd(data);
	int err = 0;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		/*
		 * We need to synthesize events first, because some
		 * features work on top of them (on the report side).
		 */
		err = perf_event__synthesize_attrs(tool, rec->evlist,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out;
		}

		err = perf_event__synthesize_features(tool, session, rec->evlist,
						      process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize features.\n");
			return err;
		}

		if (have_tracepoints(&rec->evlist->core.entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything. We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out;
			}
			rec->bytes_written += err;
		}
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!perf_evlist__exclude_kernel(rec->evlist)) {
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0)
		pr_warning("Couldn't synthesize bpf events.\n");

	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
					    process_synthesized_event, opts->sample_address,
					    1);
out:
	return err;
}
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}
	session = perf_session__new(data, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;
	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->mmap_len;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}
	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize a COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}
	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state may be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in the previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensures we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * the overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from the
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in the overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}
		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);
	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred / (float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else {
		status = err;
	}
	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}
	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
				rec->session->bytes_transferred / 1024.0 / 1024.0,
				ratio);
		}
		fprintf(stderr, " ]\n");
	}
out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
	return status;
}
static void callchain_debug(struct callchain_param *callchain)
{
	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };

	pr_debug("callchain: type %s\n", str[callchain->record_mode]);

	if (callchain->record_mode == CALLCHAIN_DWARF)
		pr_debug("callchain: stack dump size %d\n",
			 callchain->dump_size);
}

int record_opts__parse_callchain(struct record_opts *record,
				 struct callchain_param *callchain,
				 const char *arg, bool unset)
{
	int ret;

	callchain->enabled = !unset;

	/* --no-call-graph */
	if (unset) {
		callchain->record_mode = CALLCHAIN_NONE;
		pr_debug("callchain: disabled\n");
		return 0;
	}

	ret = parse_callchain_record_opt(arg, callchain);
	if (!ret) {
		/* Enable data address sampling for DWARF unwind. */
		if (callchain->record_mode == CALLCHAIN_DWARF)
			record->sample_address = true;
		callchain_debug(callchain);
	}

	return ret;
}

int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}

int record_callchain_opt(const struct option *opt,
			 const char *arg __maybe_unused,
			 int unset __maybe_unused)
{
	struct callchain_param *callchain = opt->value;

	callchain->enabled = true;

	if (callchain->record_mode == CALLCHAIN_NONE)
		callchain->record_mode = CALLCHAIN_FP;

	callchain_debug(callchain);
	return 0;
}
static int perf_record_config(const char *var, const char *value, void *cb)
{
	struct record *rec = cb;

	if (!strcmp(var, "record.build-id")) {
		if (!strcmp(value, "cache"))
			rec->no_buildid_cache = false;
		else if (!strcmp(value, "no-cache"))
			rec->no_buildid_cache = true;
		else if (!strcmp(value, "skip"))
			rec->no_buildid = true;
		else
			return -1;
		return 0;
	}
	if (!strcmp(var, "record.call-graph")) {
		var = "call-graph.record-mode";
		return perf_default_config(var, value, cb);
	}
#ifdef HAVE_AIO_SUPPORT
	if (!strcmp(var, "record.aio")) {
		rec->opts.nr_cblocks = strtol(value, NULL, 0);
		if (!rec->opts.nr_cblocks)
			rec->opts.nr_cblocks = nr_cblocks_default;
	}
#endif

	return 0;
}
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }

/*
 * Add the missing ones; we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
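/*
 * Example (illustrative): 'perf record -k monotonic_raw ...' maps through
 * the table above to CLOCK_MONOTONIC_RAW (4). A "CLOCK_" prefix is also
 * accepted ('-k CLOCK_BOOTTIME'), as is a raw numeric id such as '-k 4';
 * both paths are handled in parse_clockid() below.
 */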
static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
{
	struct timespec res;

	*res_ns = 0;
	if (!clock_getres(clk_id, &res))
		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
	else
		pr_warning("WARNING: Failed to determine specified clock resolution.\n");

	return 0;
}
static int parse_clockid(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;
	const struct clockid_map *cm;
	const char *ostr = str;

	if (unset) {
		opts->use_clockid = 0;
		return 0;
	}

	/* no arg passed */
	if (!str)
		return 0;

	/* no setting it twice */
	if (opts->use_clockid)
		return -1;

	opts->use_clockid = true;

	/* if it's a number, we're done */
	if (sscanf(str, "%d", &opts->clockid) == 1)
		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);

	/* allow a "CLOCK_" prefix to the name */
	if (!strncasecmp(str, "CLOCK_", 6))
		str += 6;

	for (cm = clockids; cm->name; cm++) {
		if (!strcasecmp(str, cm->name)) {
			opts->clockid = cm->clockid;
			return get_clockid_res(opts->clockid,
					       &opts->clockid_res_ns);
		}
	}

	opts->use_clockid = false;
	ui__warning("unknown clockid %s, check man page\n", ostr);
	return -1;
}
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = (struct record_opts *)opt->value;

	if (unset || !str)
		return 0;

	if (!strcasecmp(str, "node"))
		opts->affinity = PERF_AFFINITY_NODE;
	else if (!strcasecmp(str, "cpu"))
		opts->affinity = PERF_AFFINITY_CPU;

	return 0;
}
static int record__parse_mmap_pages(const struct option *opt,
				    const char *str,
				    int unset __maybe_unused)
{
	struct record_opts *opts = opt->value;
	char *s, *p;
	unsigned int mmap_pages;
	int ret;

	if (!str)
		return -EINVAL;

	s = strdup(str);
	if (!s)
		return -ENOMEM;

	p = strchr(s, ',');
	if (p)
		*p = '\0';

	if (*s) {
		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
		if (ret)
			goto out_free;
		opts->mmap_pages = mmap_pages;
	}

	if (!p) {
		ret = 0;
		goto out_free;
	}

	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
	if (ret)
		goto out_free;

	opts->auxtrace_mmap_pages = mmap_pages;

out_free:
	free(s);
	return ret;
}
static void switch_output_size_warn(struct record *rec)
{
	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
	struct switch_output *s = &rec->switch_output;

	wakeup_size /= 2;

	if (s->size < wakeup_size) {
		char buf[100];

		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
		pr_warning("WARNING: switch-output data size lower than "
			   "wakeup kernel buffer size (%s); "
			   "expect bigger perf.data sizes\n", buf);
	}
}
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	if (!s->set)
		return 0;

	if (!strcmp(s->str, "signal")) {
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled = true;

	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}
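/*
 * Parsing examples (illustrative values):
 *
 *   --switch-output=signal -> s->signal = true, rotate on SIGUSR2
 *   --switch-output=512M   -> s->size = 512 * (1 << 20) bytes
 *   --switch-output=2h     -> s->time = 2 * 60 * 60 = 7200 seconds
 *
 * A string matching neither table makes switch_output_setup() return -1.
 */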
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

static bool dry_run;
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to using the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording",
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),
	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
		     parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			  "signal"),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_END()
};

struct option *record_options = __record_options;
int cmd_record(int argc, const char **argv)
{
	int err;
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);
# undef set_nobuild
#endif

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
# else
#  define REASON  "this architecture doesn't support BPF prologue"
# endif
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);
# undef set_nobuild
# undef REASON
#endif

	CPU_ZERO(&rec->affinity_mask);
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)
		return -ENOMEM;

	err = perf_config(perf_record_config, rec);
	if (err)
		return err;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (quiet)
		perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;
	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");
	}

	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;
	}

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);
		return -EINVAL;
	}

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);
		return -EINVAL;
	}

	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);
	}

	if (rec->switch_output.num_files) {
		rec->switch_output.filenames = calloc(sizeof(char *),
						      rec->switch_output.num_files);
		if (!rec->switch_output.filenames)
			return -EINVAL;
	}
	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	symbol__init(NULL);

	err = record__auxtrace_init(rec);
	if (err)
		goto out;

	if (dry_run)
		goto out;

	err = bpf__setup_stdout(rec->evlist);
	if (err) {
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",
			errbuf);
		goto out;
	}

	err = -ENOMEM;

	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * Following code equals to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
			disable = false;
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			disable = false;
		if (disable) {
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();
		}
	}
	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out;
	}

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
	if (err) {
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);
	}

	err = target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out;
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	err = -ENOMEM;
	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);
	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
	if (err)
		goto out;

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace, as that would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {
		err = -EINVAL;
		goto out;
	}

	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	err = __cmd_record(&record, argc, argv);
out:
	evlist__delete(rec->evlist);
	symbol__exit();
	auxtrace_record__free(rec->itr);
	return err;
}
static void snapshot_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
		trigger_hit(&auxtrace_snapshot_trigger);
		auxtrace_record__snapshot_started = 1;
		if (auxtrace_record__snapshot_start(record.itr))
			trigger_error(&auxtrace_snapshot_trigger);
	}

	if (switch_output_signal(rec))
		trigger_hit(&switch_output_trigger);
}
static void alarm_sig_handler(int sig __maybe_unused)
{
	struct record *rec = &record;

	if (switch_output_time(rec))
		trigger_hit(&switch_output_trigger);
}