1 // SPDX-License-Identifier: GPL-2.0
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/synthetic-events.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
57 #include <linux/err.h>
58 #include <linux/string.h>
59 #include <linux/time64.h>
60 #include <linux/zalloc.h>
/*
 * NOTE(review): many lines are elided in this excerpt.  The body of
 * struct switch_output is not visible, and the fields from "tool"
 * down to "affinity_mask" appear to belong to an enclosing record
 * state struct whose opening/closing lines are not part of this view.
 */
62 struct switch_output {
/* Core state carried through a single 'perf record' session. */
75 struct perf_tool tool;
76 struct record_opts opts;
78 struct perf_data data;
79 struct auxtrace_record *itr;
80 struct evlist *evlist;
81 struct perf_session *session;
85 bool no_buildid_cache;
86 bool no_buildid_cache_set;
88 bool timestamp_filename;
89 bool timestamp_boundary;
90 struct switch_output switch_output;
91 unsigned long long samples;
92 cpu_set_t affinity_mask;
/* Set from the SIGUSR2 path to request an auxtrace snapshot. */
95 static volatile int auxtrace_record__snapshot_started;
/* Triggers gating auxtrace snapshotting and perf.data rotation. */
96 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
97 static DEFINE_TRIGGER(switch_output_trigger);
/* Human-readable names for the --affinity modes (initializers elided). */
99 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
103 static bool switch_output_signal(struct record *rec)
105 return rec->switch_output.signal &&
106 trigger_is_ready(&switch_output_trigger);
109 static bool switch_output_size(struct record *rec)
111 return rec->switch_output.size &&
112 trigger_is_ready(&switch_output_trigger) &&
113 (rec->bytes_written >= rec->switch_output.size);
116 static bool switch_output_time(struct record *rec)
118 return rec->switch_output.time &&
119 trigger_is_ready(&switch_output_trigger);
/*
 * record__write - append @size bytes at @bf to the perf.data output.
 * On success bumps rec->bytes_written and, once the configured
 * rotation size is crossed, hits the switch-output trigger.
 * NOTE(review): the error-return path after pr_err() and the final
 * return statement are elided in this excerpt.
 */
122 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
123 void *bf, size_t size)
125 struct perf_data_file *file = &rec->session->data->file;
127 if (perf_data_file__write(file, bf, size) < 0) {
128 pr_err("failed to write perf data, error: %m\n");
132 rec->bytes_written += size;
134 if (switch_output_size(rec))
135 trigger_hit(&switch_output_trigger);
/* Forward declarations: real definitions depend on HAVE_AIO_SUPPORT
 * and HAVE_ZSTD_SUPPORT further down in the file. */
140 static int record__aio_enabled(struct record *rec);
141 static int record__comp_enabled(struct record *rec);
142 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
143 void *src, size_t src_size);
145 #ifdef HAVE_AIO_SUPPORT
/*
 * record__aio_write - fill in @cblock and queue an asynchronous write
 * of @size bytes from @buf at file offset @off on @trace_fd.  A hard
 * failure (anything but EAGAIN) invalidates the cblock by setting
 * aio_fildes to -1.
 * NOTE(review): the declaration of rc, the retry loop around EAGAIN
 * and the return statement are elided in this excerpt.
 */
146 static int record__aio_write(struct aiocb *cblock, int trace_fd,
147 void *buf, size_t size, off_t off)
151 cblock->aio_fildes = trace_fd;
152 cblock->aio_buf = buf;
153 cblock->aio_nbytes = size;
154 cblock->aio_offset = off;
/* No completion notification needed; completion is polled via aio_error(). */
155 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
158 rc = aio_write(cblock);
161 } else if (errno != EAGAIN) {
162 cblock->aio_fildes = -1;
163 pr_err("failed to queue perf data, error: %m\n");
/*
 * record__aio_complete - poll the aio request tracked by @cblock.
 * Returns non-zero once the request is fully done; a short write is
 * restarted for the remaining bytes at the adjusted offset.
 * NOTE(review): several declarations and control-flow lines are
 * elided in this excerpt.
 */
171 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
177 ssize_t aio_ret, written;
179 aio_errno = aio_error(cblock);
180 if (aio_errno == EINPROGRESS)
183 written = aio_ret = aio_return(cblock);
185 if (aio_errno != EINTR)
186 pr_err("failed to write perf data, error: %m\n");
190 rem_size = cblock->aio_nbytes - written;
/* Whole request done: release the cblock slot. */
193 cblock->aio_fildes = -1;
195 * md->refcount is incremented in record__aio_pushfn() for
196 * every aio write request started in record__aio_push() so
197 * decrement it because the request is now complete.
203 * aio write request may require restart with the
204 * reminder if the kernel didn't write whole
/* Short write: requeue the remainder from where the kernel stopped. */
207 rem_off = cblock->aio_offset + written;
208 rem_buf = (void *)(cblock->aio_buf + written);
209 record__aio_write(cblock, cblock->aio_fildes,
210 rem_buf, rem_size, rem_off);
/*
 * record__aio_sync - wait for in-flight aio writes on @md.  With
 * sync_all it drains everything; otherwise it appears to return the
 * index of the first free cblock slot (surrounding control flow is
 * elided in this excerpt — confirm against the full source).
 */
217 static int record__aio_sync(struct mmap *md, bool sync_all)
219 struct aiocb **aiocb = md->aio.aiocb;
220 struct aiocb *cblocks = md->aio.cblocks;
221 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
226 for (i = 0; i < md->aio.nr_cblocks; ++i) {
227 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
234 * Started aio write is not complete yet
235 * so it has to be waited before the
238 aiocb[i] = &cblocks[i];
/* Block (1ms at a time) until at least one request completes. */
245 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
246 if (!(errno == EAGAIN || errno == EINTR))
247 pr_err("failed to sync perf data, error: %m\n");
/*
 * record__aio_pushfn - perf_mmap__push() callback: copy (or compress)
 * the chunk described by @buf/@size out of the kernel ring buffer into
 * the per-map aio staging buffer so the kernel space can be released
 * as early as possible.
 * NOTE(review): accumulation of aio->size and the refcount bump /
 * return are elided in this excerpt.
 */
258 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
260 struct record_aio *aio = to;
263 * map->base data pointed by buf is copied into free map->aio.data[] buffer
264 * to release space in the kernel buffer as fast as possible, calling
265 * perf_mmap__consume() from perf_mmap__push() function.
267 * That lets the kernel to proceed with storing more profiling data into
268 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
270 * Coping can be done in two steps in case the chunk of profiling data
271 * crosses the upper bound of the kernel buffer. In this case we first move
272 * part of data from map->start till the upper bound and then the reminder
273 * from the beginning of the kernel buffer till the end of the data chunk.
276 if (record__comp_enabled(aio->rec)) {
277 size = zstd_compress(aio->rec->session, aio->data + aio->size,
278 perf_mmap__mmap_len(map) - aio->size,
281 memcpy(aio->data + aio->size, buf, size);
286 * Increment map->refcount to guard map->aio.data[] buffer
287 * from premature deallocation because map object can be
288 * released earlier than aio write request started on
289 * map->aio.data[] buffer is complete.
291 * perf_mmap__put() is done at record__aio_complete()
292 * after started aio request completion or at record__aio_push()
293 * if the request failed to start.
/*
 * record__aio_push - stage one ring-buffer's pending data into a free
 * aio buffer and queue an asynchronous write at offset *off.  On
 * success accounts the bytes and may hit the switch-output trigger.
 * NOTE(review): *off advancement and the error/refcount-rollback
 * paths are partially elided in this excerpt.
 */
303 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
306 int trace_fd = rec->session->data->file.fd;
307 struct record_aio aio = { .rec = rec, .size = 0 };
310 * Call record__aio_sync() to wait till map->aio.data[] buffer
311 * becomes available after previous aio write operation.
314 idx = record__aio_sync(map, false);
315 aio.data = map->aio.data[idx];
316 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
317 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
321 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
324 rec->bytes_written += aio.size;
325 if (switch_output_size(rec))
326 trigger_hit(&switch_output_trigger);
329 * Decrement map->refcount incremented in record__aio_pushfn()
330 * back if record__aio_write() operation failed to start, otherwise
331 * map->refcount is decremented in record__aio_complete() after
332 * aio write operation finishes successfully.
/*
 * Return the current write position of the trace file, i.e. the
 * offset at which the next queued aio write will land.
 */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
/*
 * Reposition the trace file to @pos, e.g. to rewind after a failed
 * aio push so the reserved file range is reused.  The lseek() result
 * is deliberately ignored, matching the original behaviour.
 */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
/*
 * record__aio_mmap_read_sync - drain every outstanding aio write on
 * all regular mmaps of the evlist; a no-op when aio is disabled.
 */
350 static void record__aio_mmap_read_sync(struct record *rec)
353 struct evlist *evlist = rec->evlist;
354 struct mmap *maps = evlist->mmap;
356 if (!record__aio_enabled(rec))
359 for (i = 0; i < evlist->nr_mmaps; i++) {
360 struct mmap *map = &maps[i];
363 record__aio_sync(map, true);
/* Default / maximum number of aio control blocks per mmap. */
367 static int nr_cblocks_default = 1;
368 static int nr_cblocks_max = 4;
/*
 * record__aio_parse - option callback for --aio[=n]: 0 disables aio,
 * an unparsable/zero value falls back to nr_cblocks_default.
 * NOTE(review): the unset branch and clamping against nr_cblocks_max
 * are elided in this excerpt.
 */
370 static int record__aio_parse(const struct option *opt,
374 struct record_opts *opts = (struct record_opts *)opt->value;
377 opts->nr_cblocks = 0;
380 opts->nr_cblocks = strtol(str, NULL, 0);
381 if (!opts->nr_cblocks)
382 opts->nr_cblocks = nr_cblocks_default;
/* Stubs used when perf is built without POSIX aio support. */
387 #else /* HAVE_AIO_SUPPORT */
388 static int nr_cblocks_max = 0;
390 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
391 off_t *off __maybe_unused)
396 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
401 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
405 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
/* aio is in use iff at least one control block was configured. */
410 static int record__aio_enabled(struct record *rec)
412 return rec->opts.nr_cblocks > 0;
415 #define MMAP_FLUSH_DEFAULT 1
/*
 * record__mmap_flush_parse - option callback for --mmap-flush.
 * Accepts a B/K/M/G suffixed size or a raw number; the result is
 * clamped to the mmap'ed buffer size and defaults to 1 byte.
 * NOTE(review): the unset branch and the return are elided here.
 */
416 static int record__mmap_flush_parse(const struct option *opt,
421 struct record_opts *opts = (struct record_opts *)opt->value;
422 static struct parse_tag tags[] = {
423 { .tag = 'B', .mult = 1 },
424 { .tag = 'K', .mult = 1 << 10 },
425 { .tag = 'M', .mult = 1 << 20 },
426 { .tag = 'G', .mult = 1 << 30 },
/* Try suffixed form first, then a plain strtol fallback. */
434 opts->mmap_flush = parse_tag_value(str, tags);
435 if (opts->mmap_flush == (int)-1)
436 opts->mmap_flush = strtol(str, NULL, 0);
439 if (!opts->mmap_flush)
440 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
/* Flushing more than one buffer's worth makes no sense: clamp. */
442 flush_max = perf_evlist__mmap_size(opts->mmap_pages);
444 if (opts->mmap_flush > flush_max)
445 opts->mmap_flush = flush_max;
450 #ifdef HAVE_ZSTD_SUPPORT
451 static unsigned int comp_level_default = 1;
/*
 * record__parse_comp_level - option callback for -z/--compression-level:
 * 0 disables compression, an unparsable/zero value selects the default.
 */
453 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
455 struct record_opts *opts = opt->value;
458 opts->comp_level = 0;
461 opts->comp_level = strtol(str, NULL, 0);
462 if (!opts->comp_level)
463 opts->comp_level = comp_level_default;
/* 22 is the highest level zstd supports. */
469 static unsigned int comp_level_max = 22;
/* Compression is active iff a positive level was configured. */
471 static int record__comp_enabled(struct record *rec)
473 return rec->opts.comp_level > 0;
/*
 * process_synthesized_event - tool callback: forward a synthesized
 * event straight into the output file via record__write().
 */
476 static int process_synthesized_event(struct perf_tool *tool,
477 union perf_event *event,
478 struct perf_sample *sample __maybe_unused,
479 struct machine *machine __maybe_unused)
481 struct record *rec = container_of(tool, struct record, tool);
482 return record__write(rec, NULL, event, event->header.size);
/*
 * record__pushfn - perf_mmap__push() callback for the non-aio path:
 * optionally compress the chunk into map->data, then write it out.
 * NOTE(review): the sample counter update and the redirection of bf
 * to the compressed buffer are elided in this excerpt.
 */
485 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
487 struct record *rec = to;
489 if (record__comp_enabled(rec)) {
490 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
495 return record__write(rec, map, bf, size);
/* Global state set from signal context; hence volatile. */
498 static volatile int done;
499 static volatile int signr = -1;
500 static volatile int child_finished;
/* Generic termination handler; body elided in this excerpt. */
502 static void sig_handler(int sig)
/* On SIGSEGV: let perf hooks recover, then dump a stack trace. */
512 static void sigsegv_handler(int sig)
514 perf_hooks__recover();
515 sighandler_dump_stack(sig);
/* atexit handler: re-raise the recorded signal with default action. */
518 static void record__sig_exit(void)
523 signal(signr, SIG_DFL);
527 #ifdef HAVE_AUXTRACE_SUPPORT
/*
 * record__process_auxtrace - write one AUX area trace event: the
 * header event, up to two data fragments (the AUX buffer may wrap),
 * and zero-padding to an 8-byte boundary.  For seekable output the
 * current file offset is recorded in the auxtrace index first.
 */
529 static int record__process_auxtrace(struct perf_tool *tool,
531 union perf_event *event, void *data1,
532 size_t len1, void *data2, size_t len2)
534 struct record *rec = container_of(tool, struct record, tool);
535 struct perf_data *data = &rec->data;
539 if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
541 int fd = perf_data__fd(data);
544 file_offset = lseek(fd, 0, SEEK_CUR);
545 if (file_offset == -1)
547 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
553 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
554 padding = (len1 + len2) & 7;
556 padding = 8 - padding;
558 record__write(rec, map, event, event->header.size);
559 record__write(rec, map, data1, len1);
561 record__write(rec, map, data2, len2);
562 record__write(rec, map, &pad, padding);
/* Drain one auxtrace mmap in normal (non-snapshot) mode. */
567 static int record__auxtrace_mmap_read(struct record *rec,
572 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
573 record__process_auxtrace);
/* Drain one auxtrace mmap in snapshot mode, bounded by snapshot size. */
583 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
588 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
589 record__process_auxtrace,
590 rec->opts.auxtrace_snapshot_size);
/* Snapshot every mmap that has an AUX area; stop on first failure. */
600 static int record__auxtrace_read_snapshot_all(struct record *rec)
605 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
606 struct mmap *map = &rec->evlist->mmap[i];
608 if (!map->auxtrace_mmap.base)
611 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
/*
 * record__read_auxtrace_snapshot - take one snapshot; on any failure
 * put the trigger into error state, otherwise re-arm it.
 */
620 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
622 pr_debug("Recording AUX area tracing snapshot\n");
623 if (record__auxtrace_read_snapshot_all(rec) < 0) {
624 trigger_error(&auxtrace_snapshot_trigger);
626 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
627 trigger_error(&auxtrace_snapshot_trigger);
629 trigger_ready(&auxtrace_snapshot_trigger);
/* Take a final snapshot at exit unless the trigger already errored. */
633 static int record__auxtrace_snapshot_exit(struct record *rec)
635 if (trigger_is_error(&auxtrace_snapshot_trigger))
638 if (!auxtrace_record__snapshot_started &&
639 auxtrace_record__snapshot_start(rec->itr))
642 record__read_auxtrace_snapshot(rec, true);
643 if (trigger_is_error(&auxtrace_snapshot_trigger))
/*
 * record__auxtrace_init - set up the auxtrace recorder, parse the
 * snapshot options and any auxtrace filters.
 */
649 static int record__auxtrace_init(struct record *rec)
654 rec->itr = auxtrace_record__init(rec->evlist, &err);
659 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
660 rec->opts.auxtrace_snapshot_opts);
664 return auxtrace_parse_filters(rec->evlist);
/* Stubs used when perf is built without auxtrace support. */
670 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
671 struct mmap *map __maybe_unused)
677 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
678 bool on_exit __maybe_unused)
683 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
689 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
694 static int record__auxtrace_init(struct record *rec __maybe_unused)
/*
 * record__mmap_evlist - mmap the evlist's ring buffers with all the
 * record-specific knobs (auxtrace, aio cblocks, affinity, flush
 * threshold, compression level).  EPERM gets a dedicated hint about
 * perf_event_mlock_kb; other errors are reported generically.
 * NOTE(review): the success return path is elided in this excerpt.
 */
701 static int record__mmap_evlist(struct record *rec,
702 struct evlist *evlist)
704 struct record_opts *opts = &rec->opts;
/* Node-aware affinity modes need the cpu -> node map set up first. */
707 if (opts->affinity != PERF_AFFINITY_SYS)
708 cpu__setup_cpunode_map();
710 if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
711 opts->auxtrace_mmap_pages,
712 opts->auxtrace_snapshot_mode,
713 opts->nr_cblocks, opts->affinity,
714 opts->mmap_flush, opts->comp_level) < 0) {
715 if (errno == EPERM) {
716 pr_err("Permission error mapping pages.\n"
717 "Consider increasing "
718 "/proc/sys/kernel/perf_event_mlock_kb,\n"
719 "or try again with a smaller value of -m/--mmap_pages.\n"
720 "(current value: %u,%u)\n",
721 opts->mmap_pages, opts->auxtrace_mmap_pages);
724 pr_err("failed to mmap with %d (%s)\n", errno,
725 str_error_r(errno, msg, sizeof(msg)));
735 static int record__mmap(struct record *rec)
737 return record__mmap_evlist(rec, rec->evlist);
/*
 * record__open - open all events and mmap their buffers.
 * Steps visible here: add a tracking dummy event when --delay is used,
 * configure the evlist, open each evsel (with fallback and weak-group
 * retry), warn about kptr_restrict, apply event filters and finally
 * mmap.  NOTE(review): several error/goto lines are elided.
 */
740 static int record__open(struct record *rec)
744 struct evlist *evlist = rec->evlist;
745 struct perf_session *session = rec->session;
746 struct record_opts *opts = &rec->opts;
750 * For initial_delay we need to add a dummy event so that we can track
751 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
752 * real events, the ones asked by the user.
754 if (opts->initial_delay) {
755 if (perf_evlist__add_dummy(evlist))
/* The dummy tracks from the start; real events enable on exec. */
758 pos = perf_evlist__first(evlist);
760 pos = perf_evlist__last(evlist);
762 pos->core.attr.enable_on_exec = 1;
765 perf_evlist__config(evlist, opts, &callchain_param);
767 evlist__for_each_entry(evlist, pos) {
769 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
/* First try a softer event configuration before giving up. */
770 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
772 ui__warning("%s\n", msg);
/* A failing weak-group member: break the group up and retry. */
775 if ((errno == EINVAL || errno == EBADF) &&
776 pos->leader != pos &&
778 pos = perf_evlist__reset_weak_group(evlist, pos);
782 perf_evsel__open_strerror(pos, &opts->target,
783 errno, msg, sizeof(msg));
784 ui__error("%s\n", msg);
788 pos->supported = true;
791 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
793 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
794 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
795 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
796 "file is not found in the buildid cache or in the vmlinux path.\n\n"
797 "Samples in kernel modules won't be resolved at all.\n\n"
798 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
799 "even with a suitable vmlinux or kallsyms file.\n\n");
802 if (perf_evlist__apply_filters(evlist, &pos)) {
803 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
804 pos->filter, perf_evsel__name(pos), errno,
805 str_error_r(errno, msg, sizeof(msg)));
810 rc = record__mmap(rec);
814 session->evlist = evlist;
815 perf_session__set_id_hdr_size(session);
/*
 * process_sample_event - sample callback used during the build-id
 * pass: track the first/last sample timestamps and, unless
 * --buildid-all is set, mark the DSO hit by this sample.
 */
820 static int process_sample_event(struct perf_tool *tool,
821 union perf_event *event,
822 struct perf_sample *sample,
824 struct machine *machine)
826 struct record *rec = container_of(tool, struct record, tool);
828 if (rec->evlist->first_sample_time == 0)
829 rec->evlist->first_sample_time = sample->time;
831 rec->evlist->last_sample_time = sample->time;
/* With --buildid-all every DSO is taken anyway; skip per-sample work. */
833 if (rec->buildid_all)
837 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
/*
 * process_buildids - post-record pass over the recorded data to
 * collect build-ids.  A zero-size file is skipped.
 */
840 static int process_buildids(struct record *rec)
842 struct perf_session *session = rec->session;
844 if (perf_data__size(&rec->data) == 0)
848 * During this process, it'll load kernel map and replace the
849 * dso->long_name to a real pathname it found. In this case
850 * we prefer the vmlinux path like
851 * /lib/modules/3.16.4/build/vmlinux
853 * rather than build-id path (in debug directory).
854 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
856 symbol_conf.ignore_vmlinux_buildid = true;
859 * If --buildid-all is given, it marks all DSO regardless of hits,
860 * so no need to process samples. But if timestamp_boundary is enabled,
861 * it still needs to walk on all samples to get the timestamps of
862 * first/last samples.
864 if (rec->buildid_all && !rec->timestamp_boundary)
865 rec->tool.sample = NULL;
867 return perf_session__process_events(session);
/*
 * perf_event__synthesize_guest_os - synthesize module and kernel mmap
 * events for one guest machine (callback for machines__process_guests).
 */
870 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
873 struct perf_tool *tool = data;
875 *As for guest kernel when processing subcommand record&report,
876 *we arrange module mmap prior to guest kernel mmap and trigger
877 *a preload dso because default guest module symbols are loaded
878 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
879 *method is used to avoid symbol missing when the first addr is
880 *in module instead of in guest kernel.
882 err = perf_event__synthesize_modules(tool, process_synthesized_event,
885 pr_err("Couldn't record guest kernel [%d]'s reference"
886 " relocation symbol.\n", machine->pid);
889 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
890 * have no _text sometimes.
892 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
895 pr_err("Couldn't record guest kernel [%d]'s reference"
896 " relocation symbol.\n", machine->pid);
/* Header-only event flushing one round of ring-buffer reads. */
899 static struct perf_event_header finished_round_event = {
900 .size = sizeof(struct perf_event_header),
901 .type = PERF_RECORD_FINISHED_ROUND,
/*
 * record__adjust_affinity - for node/cpu affinity modes, migrate the
 * recording thread onto the CPU set of the mmap it is about to drain
 * (only when the mask actually changed).
 */
904 static void record__adjust_affinity(struct record *rec, struct mmap *map)
906 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
907 !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
908 CPU_ZERO(&rec->affinity_mask);
909 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
910 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
/*
 * process_comp_header - callback for the zstd streamer: lay down /
 * grow a PERF_RECORD_COMPRESSED header in front of compressed data.
 * NOTE(review): the branch distinguishing "new record" from "grow
 * existing record" is partially elided in this excerpt.
 */
914 static size_t process_comp_header(void *record, size_t increment)
916 struct perf_record_compressed *event = record;
917 size_t size = sizeof(*event);
920 event->header.size += increment;
924 event->header.type = PERF_RECORD_COMPRESSED;
925 event->header.size = size;
/*
 * zstd_compress - compress @src into @dst as one or more
 * PERF_RECORD_COMPRESSED records and account transferred vs
 * compressed byte totals on the session.
 */
930 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
931 void *src, size_t src_size)
934 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
936 compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
937 max_record_size, process_comp_header);
939 session->bytes_transferred += src_size;
940 session->bytes_compressed += compressed;
/*
 * record__mmap_read_evlist - drain every mmap of @evlist (regular or
 * overwritable per @overwrite), pushing data either synchronously or
 * through aio, reading any auxtrace data along the way.  If anything
 * was written, a FINISHED_ROUND event is appended.
 * NOTE(review): error paths and the final return are elided here.
 */
945 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
946 bool overwrite, bool synch)
948 u64 bytes_written = rec->bytes_written;
952 int trace_fd = rec->data.file.fd;
958 maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
/* Overwritable maps are only drained once data is pending. */
962 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
/* aio writes need an explicit file offset; remember where we are. */
965 if (record__aio_enabled(rec))
966 off = record__aio_get_pos(trace_fd);
968 for (i = 0; i < evlist->nr_mmaps; i++) {
970 struct mmap *map = &maps[i];
973 record__adjust_affinity(rec, map);
978 if (!record__aio_enabled(rec)) {
979 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
986 if (record__aio_push(rec, map, &off) < 0) {
/* Rewind so a failed aio push does not leak file space. */
987 record__aio_set_pos(trace_fd, off);
998 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
999 record__auxtrace_mmap_read(rec, map) != 0) {
1005 if (record__aio_enabled(rec))
1006 record__aio_set_pos(trace_fd, off);
1009 * Mark the round finished in case we wrote
1010 * at least one event.
1012 if (bytes_written != rec->bytes_written)
1013 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1016 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1021 static int record__mmap_read_all(struct record *rec, bool synch)
1025 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1029 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
/*
 * record__init_features - start with every header feature enabled,
 * then clear the ones this particular session cannot provide.
 */
1032 static void record__init_features(struct record *rec)
1034 struct perf_session *session = rec->session;
1037 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1038 perf_header__set_feat(&session->header, feat);
1040 if (rec->no_buildid)
1041 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1043 if (!have_tracepoints(&rec->evlist->core.entries))
1044 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1046 if (!rec->opts.branch_stack)
1047 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1049 if (!rec->opts.full_auxtrace)
1050 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1052 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1053 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1055 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1056 if (!record__comp_enabled(rec))
1057 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1059 perf_header__clear_feat(&session->header, HEADER_STAT);
/*
 * record__finish_output - finalize a (non-pipe) perf.data file:
 * account the data size, run the build-id pass unless disabled, and
 * rewrite the header with final values.
 * NOTE(review): return statements are elided in this excerpt.
 */
1063 record__finish_output(struct record *rec)
1065 struct perf_data *data = &rec->data;
1066 int fd = perf_data__fd(data);
1071 rec->session->header.data_size += rec->bytes_written;
1072 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1074 if (!rec->no_buildid) {
1075 process_buildids(rec);
1077 if (rec->buildid_all)
1078 dsos__hit_all(rec->session);
1080 perf_session__write_header(rec->session, rec->evlist, fd, true);
/*
 * record__synthesize_workload - synthesize thread-map events for the
 * forked workload pid; only runs when @tail matches the configured
 * tail_synthesize mode.
 */
1085 static int record__synthesize_workload(struct record *rec, bool tail)
1088 struct perf_thread_map *thread_map;
1090 if (rec->opts.tail_synthesize != tail)
1093 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1094 if (thread_map == NULL)
1097 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1098 process_synthesized_event,
1099 &rec->session->machines.host,
1100 rec->opts.sample_address);
1101 perf_thread_map__put(thread_map);
1105 static int record__synthesize(struct record *rec, bool tail);
/*
 * record__switch_output - rotate the perf.data output file.
 * Drains pending aio, synthesizes tail events, finalizes the current
 * file, then switches to a timestamped new file.  With
 * --switch-output=num_files the filenames form a ring and the oldest
 * file is removed.  NOTE(review): error paths, the !at_exit restart
 * of tracking-event synthesis and the return are partially elided.
 */
1108 record__switch_output(struct record *rec, bool at_exit)
1110 struct perf_data *data = &rec->data;
1114 /* Same Size: "2015122520103046"*/
1115 char timestamp[] = "InvalidTimestamp";
1117 record__aio_mmap_read_sync(rec);
1119 record__synthesize(rec, true);
1120 if (target__none(&rec->opts.target))
1121 record__synthesize_workload(rec, true);
1124 record__finish_output(rec);
1125 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1127 pr_err("Failed to get current timestamp\n");
1131 fd = perf_data__switch(data, timestamp,
1132 rec->session->header.data_offset,
1133 at_exit, &new_filename);
1134 if (fd >= 0 && !at_exit) {
1135 rec->bytes_written = 0;
1136 rec->session->header.data_size = 0;
1140 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1141 data->path, timestamp);
/* Ring of output files: drop the oldest before reusing its slot. */
1143 if (rec->switch_output.num_files) {
1144 int n = rec->switch_output.cur_file + 1;
1146 if (n >= rec->switch_output.num_files)
1148 rec->switch_output.cur_file = n;
1149 if (rec->switch_output.filenames[n]) {
1150 remove(rec->switch_output.filenames[n]);
1151 zfree(&rec->switch_output.filenames[n]);
1153 rec->switch_output.filenames[n] = new_filename;
1158 /* Output tracking events */
1160 record__synthesize(rec, false);
1163 * In 'perf record --switch-output' without -a,
1164 * record__synthesize() in record__switch_output() won't
1165 * generate tracking events because there's no thread_map
1166 * in evlist. Which causes newly created perf.data doesn't
1167 * contain map and comm information.
1168 * Create a fake thread_map and directly call
1169 * perf_event__synthesize_thread_map() for those events.
1171 if (target__none(&rec->opts.target))
1172 record__synthesize_workload(rec, false);
/* Set from the SIGUSR1 handler below when the workload exec fails. */
1177 static volatile int workload_exec_errno;
1180 * perf_evlist__prepare_workload will send a SIGUSR1
1181 * if the fork fails, since we asked by setting its
1182 * want_signal to true.
1184 static void workload_exec_failed_signal(int signo __maybe_unused,
1186 void *ucontext __maybe_unused)
/* The failing errno travels in the signal's sival_int payload. */
1188 workload_exec_errno = info->si_value.sival_int;
1193 static void snapshot_sig_handler(int sig);
1194 static void alarm_sig_handler(int sig);
/*
 * perf_evlist__pick_pc - return any mapped perf_event_mmap_page,
 * preferring the regular mmaps over the overwritable ones; used for
 * time-conversion synthesis.
 */
1196 static const struct perf_event_mmap_page *
1197 perf_evlist__pick_pc(struct evlist *evlist)
1200 if (evlist->mmap && evlist->mmap[0].base)
1201 return evlist->mmap[0].base;
1202 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1203 return evlist->overwrite_mmap[0].base;
/* Wrapper around the evlist variant (fallback path elided here). */
1208 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1210 const struct perf_event_mmap_page *pc;
1212 pc = perf_evlist__pick_pc(rec->evlist);
/*
 * record__synthesize - emit the synthetic events that describe the
 * system state at record time: attrs/features/tracing data (pipe mode
 * only), time conversion, auxtrace info, kernel and module mmaps,
 * guest machines, extra attrs, thread and cpu maps, bpf events and
 * finally the existing threads of the target.
 * NOTE(review): many error checks/returns are elided in this excerpt.
 */
1218 static int record__synthesize(struct record *rec, bool tail)
1220 struct perf_session *session = rec->session;
1221 struct machine *machine = &session->machines.host;
1222 struct perf_data *data = &rec->data;
1223 struct record_opts *opts = &rec->opts;
1224 struct perf_tool *tool = &rec->tool;
1225 int fd = perf_data__fd(data);
1228 if (rec->opts.tail_synthesize != tail)
1231 if (data->is_pipe) {
1233 * We need to synthesize events first, because some
1234 * features works on top of them (on report side).
1236 err = perf_event__synthesize_attrs(tool, rec->evlist,
1237 process_synthesized_event);
1239 pr_err("Couldn't synthesize attrs.\n");
1243 err = perf_event__synthesize_features(tool, session, rec->evlist,
1244 process_synthesized_event);
1246 pr_err("Couldn't synthesize features.\n");
1250 if (have_tracepoints(&rec->evlist->core.entries)) {
1252 * FIXME err <= 0 here actually means that
1253 * there were no tracepoints so its not really
1254 * an error, just that we don't need to
1255 * synthesize anything. We really have to
1256 * return this more properly and also
1257 * propagate errors that now are calling die()
1259 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1260 process_synthesized_event);
1262 pr_err("Couldn't record tracing data.\n");
/* Tracing data goes straight into the pipe: account its size. */
1265 rec->bytes_written += err;
1269 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1270 process_synthesized_event, machine);
1274 if (rec->opts.full_auxtrace) {
1275 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1276 session, process_synthesized_event);
1281 if (!perf_evlist__exclude_kernel(rec->evlist)) {
1282 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1284 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1285 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1286 "Check /proc/kallsyms permission or run as root.\n");
1288 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1290 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1291 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1292 "Check /proc/modules permission or run as root.\n");
1296 machines__process_guests(&session->machines,
1297 perf_event__synthesize_guest_os, tool);
1300 err = perf_event__synthesize_extra_attr(&rec->tool,
1302 process_synthesized_event,
1307 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1308 process_synthesized_event,
1311 pr_err("Couldn't synthesize thread map.\n");
1315 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1316 process_synthesized_event, NULL);
1318 pr_err("Couldn't synthesize cpu map.\n");
1322 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
/* bpf synthesis failure is non-fatal: warn and continue. */
1325 pr_warning("Couldn't synthesize bpf events.\n");
1327 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1328 process_synthesized_event, opts->sample_address,
1334 static int __cmd_record(struct record *rec, int argc, const char **argv)
1338 unsigned long waking = 0;
1339 const bool forks = argc > 0;
1340 struct perf_tool *tool = &rec->tool;
1341 struct record_opts *opts = &rec->opts;
1342 struct perf_data *data = &rec->data;
1343 struct perf_session *session;
1344 bool disabled = false, draining = false;
1345 struct evlist *sb_evlist = NULL;
1349 atexit(record__sig_exit);
1350 signal(SIGCHLD, sig_handler);
1351 signal(SIGINT, sig_handler);
1352 signal(SIGTERM, sig_handler);
1353 signal(SIGSEGV, sigsegv_handler);
1355 if (rec->opts.record_namespaces)
1356 tool->namespace_events = true;
1358 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1359 signal(SIGUSR2, snapshot_sig_handler);
1360 if (rec->opts.auxtrace_snapshot_mode)
1361 trigger_on(&auxtrace_snapshot_trigger);
1362 if (rec->switch_output.enabled)
1363 trigger_on(&switch_output_trigger);
1365 signal(SIGUSR2, SIG_IGN);
1368 session = perf_session__new(data, false, tool);
1369 if (IS_ERR(session)) {
1370 pr_err("Perf session creation failed.\n");
1371 return PTR_ERR(session);
1374 fd = perf_data__fd(data);
1375 rec->session = session;
1377 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1378 pr_err("Compression initialization failed.\n");
1382 session->header.env.comp_type = PERF_COMP_ZSTD;
1383 session->header.env.comp_level = rec->opts.comp_level;
1385 record__init_features(rec);
1387 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1388 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1391 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1392 argv, data->is_pipe,
1393 workload_exec_failed_signal);
1395 pr_err("Couldn't run the workload!\n");
1397 goto out_delete_session;
1402 * If we have just single event and are sending data
1403 * through pipe, we need to force the ids allocation,
1404 * because we synthesize event name through the pipe
1405 * and need the id for that.
1407 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1408 rec->opts.sample_id = true;
1410 if (record__open(rec) != 0) {
1414 session->header.env.comp_mmap_len = session->evlist->mmap_len;
1416 err = bpf__apply_obj_config();
1418 char errbuf[BUFSIZ];
1420 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1421 pr_err("ERROR: Apply config to BPF failed: %s\n",
1427 * Normally perf_session__new would do this, but it doesn't have the
1430 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1431 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1432 rec->tool.ordered_events = false;
1435 if (!rec->evlist->nr_groups)
1436 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1438 if (data->is_pipe) {
1439 err = perf_header__write_pipe(fd);
1443 err = perf_session__write_header(session, rec->evlist, fd, false);
1448 if (!rec->no_buildid
1449 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1450 pr_err("Couldn't generate buildids. "
1451 "Use --no-buildid to profile anyway.\n");
1456 if (!opts->no_bpf_event)
1457 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1459 if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1460 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1461 opts->no_bpf_event = true;
1464 err = record__synthesize(rec, false);
1468 if (rec->realtime_prio) {
1469 struct sched_param param;
1471 param.sched_priority = rec->realtime_prio;
1472 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
1473 pr_err("Could not set realtime priority.\n");
1480 * When perf is starting the traced process, all the events
1481 * (apart from group members) have enable_on_exec=1 set,
1482 * so don't spoil it by prematurely enabling them.
1484 if (!target__none(&opts->target) && !opts->initial_delay)
1485 evlist__enable(rec->evlist);
1491 struct machine *machine = &session->machines.host;
1492 union perf_event *event;
1495 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1496 if (event == NULL) {
1502 * Some H/W events are generated before COMM event
1503 * which is emitted during exec(), so perf script
1504 * cannot see a correct process name for those events.
1505 * Synthesize COMM event to prevent it.
1507 tgid = perf_event__synthesize_comm(tool, event,
1508 rec->evlist->workload.pid,
1509 process_synthesized_event,
1516 event = malloc(sizeof(event->namespaces) +
1517 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1518 machine->id_hdr_size);
1519 if (event == NULL) {
1525 * Synthesize NAMESPACES event for the command specified.
1527 perf_event__synthesize_namespaces(tool, event,
1528 rec->evlist->workload.pid,
1529 tgid, process_synthesized_event,
1533 perf_evlist__start_workload(rec->evlist);
1536 if (opts->initial_delay) {
1537 usleep(opts->initial_delay * USEC_PER_MSEC);
1538 evlist__enable(rec->evlist);
1541 trigger_ready(&auxtrace_snapshot_trigger);
1542 trigger_ready(&switch_output_trigger);
1543 perf_hooks__invoke_record_start();
1545 unsigned long long hits = rec->samples;
1548 * rec->evlist->bkw_mmap_state is possible to be
1549 * BKW_MMAP_EMPTY here: when done == true and
1550 * hits != rec->samples in previous round.
1552 * perf_evlist__toggle_bkw_mmap ensure we never
1553 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1555 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1556 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1558 if (record__mmap_read_all(rec, false) < 0) {
1559 trigger_error(&auxtrace_snapshot_trigger);
1560 trigger_error(&switch_output_trigger);
1565 if (auxtrace_record__snapshot_started) {
1566 auxtrace_record__snapshot_started = 0;
1567 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1568 record__read_auxtrace_snapshot(rec, false);
1569 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1570 pr_err("AUX area tracing snapshot failed\n");
1576 if (trigger_is_hit(&switch_output_trigger)) {
1578 * If switch_output_trigger is hit, the data in
1579 * overwritable ring buffer should have been collected,
1580 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1582 * If SIGUSR2 raise after or during record__mmap_read_all(),
1583 * record__mmap_read_all() didn't collect data from
1584 * overwritable ring buffer. Read again.
1586 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1588 trigger_ready(&switch_output_trigger);
1591 * Reenable events in overwrite ring buffer after
1592 * record__mmap_read_all(): we should have collected
1595 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1598 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1601 fd = record__switch_output(rec, false);
1603 pr_err("Failed to switch to new file\n");
1604 trigger_error(&switch_output_trigger);
1609 /* re-arm the alarm */
1610 if (rec->switch_output.time)
1611 alarm(rec->switch_output.time);
1614 if (hits == rec->samples) {
1615 if (done || draining)
1617 err = perf_evlist__poll(rec->evlist, -1);
1619 * Propagate error, only if there's any. Ignore positive
1620 * number of returned events and interrupt error.
1622 if (err > 0 || (err < 0 && errno == EINTR))
1626 if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1631 * When perf is starting the traced process, at the end events
1632 * die with the process and we wait for that. Thus no need to
1633 * disable events in this case.
1635 if (done && !disabled && !target__none(&opts->target)) {
1636 trigger_off(&auxtrace_snapshot_trigger);
1637 evlist__disable(rec->evlist);
1642 trigger_off(&auxtrace_snapshot_trigger);
1643 trigger_off(&switch_output_trigger);
1645 if (opts->auxtrace_snapshot_on_exit)
1646 record__auxtrace_snapshot_exit(rec);
1648 if (forks && workload_exec_errno) {
1649 char msg[STRERR_BUFSIZE];
1650 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1651 pr_err("Workload failed: %s\n", emsg);
1657 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1659 if (target__none(&rec->opts.target))
1660 record__synthesize_workload(rec, true);
1663 record__mmap_read_all(rec, true);
1664 record__aio_mmap_read_sync(rec);
1666 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1667 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1668 session->header.env.comp_ratio = ratio + 0.5;
1674 if (!child_finished)
1675 kill(rec->evlist->workload.pid, SIGTERM);
1681 else if (WIFEXITED(exit_status))
1682 status = WEXITSTATUS(exit_status);
1683 else if (WIFSIGNALED(exit_status))
1684 signr = WTERMSIG(exit_status);
1688 record__synthesize(rec, true);
1689 /* this will be recalculated during process_buildids() */
1693 if (!rec->timestamp_filename) {
1694 record__finish_output(rec);
1696 fd = record__switch_output(rec, true);
1699 goto out_delete_session;
1704 perf_hooks__invoke_record_end();
1706 if (!err && !quiet) {
1708 const char *postfix = rec->timestamp_filename ?
1709 ".<timestamp>" : "";
1711 if (rec->samples && !rec->opts.full_auxtrace)
1712 scnprintf(samples, sizeof(samples),
1713 " (%" PRIu64 " samples)", rec->samples);
1717 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1718 perf_data__size(data) / 1024.0 / 1024.0,
1719 data->path, postfix, samples);
1721 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1722 rec->session->bytes_transferred / 1024.0 / 1024.0,
1725 fprintf(stderr, " ]\n");
1729 zstd_fini(&session->zstd_data);
1730 perf_session__delete(session);
1732 if (!opts->no_bpf_event)
1733 perf_evlist__stop_sb_thread(sb_evlist);
1737 static void callchain_debug(struct callchain_param *callchain)
1739 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1741 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1743 if (callchain->record_mode == CALLCHAIN_DWARF)
1744 pr_debug("callchain: stack dump size %d\n",
1745 callchain->dump_size);
1748 int record_opts__parse_callchain(struct record_opts *record,
1749 struct callchain_param *callchain,
1750 const char *arg, bool unset)
1753 callchain->enabled = !unset;
1755 /* --no-call-graph */
1757 callchain->record_mode = CALLCHAIN_NONE;
1758 pr_debug("callchain: disabled\n");
1762 ret = parse_callchain_record_opt(arg, callchain);
1764 /* Enable data address sampling for DWARF unwind. */
1765 if (callchain->record_mode == CALLCHAIN_DWARF)
1766 record->sample_address = true;
1767 callchain_debug(callchain);
1773 int record_parse_callchain_opt(const struct option *opt,
1777 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1780 int record_callchain_opt(const struct option *opt,
1781 const char *arg __maybe_unused,
1782 int unset __maybe_unused)
1784 struct callchain_param *callchain = opt->value;
1786 callchain->enabled = true;
1788 if (callchain->record_mode == CALLCHAIN_NONE)
1789 callchain->record_mode = CALLCHAIN_FP;
1791 callchain_debug(callchain);
1795 static int perf_record_config(const char *var, const char *value, void *cb)
1797 struct record *rec = cb;
1799 if (!strcmp(var, "record.build-id")) {
1800 if (!strcmp(value, "cache"))
1801 rec->no_buildid_cache = false;
1802 else if (!strcmp(value, "no-cache"))
1803 rec->no_buildid_cache = true;
1804 else if (!strcmp(value, "skip"))
1805 rec->no_buildid = true;
1810 if (!strcmp(var, "record.call-graph")) {
1811 var = "call-graph.record-mode";
1812 return perf_default_config(var, value, cb);
1814 #ifdef HAVE_AIO_SUPPORT
1815 if (!strcmp(var, "record.aio")) {
1816 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1817 if (!rec->opts.nr_cblocks)
1818 rec->opts.nr_cblocks = nr_cblocks_default;
/*
 * Mapping of user-visible clockid names (for -k/--clockid) to
 * clockid_t values. The members are required by the CLOCKID_MAP /
 * CLOCKID_END initializer macros and by parse_clockid(), which walks
 * a table of these until .name == NULL.
 */
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif
1849 static const struct clockid_map clockids[] = {
1850 /* available for all events, NMI safe */
1851 CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1852 CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1854 /* available for some events */
1855 CLOCKID_MAP("realtime", CLOCK_REALTIME),
1856 CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1857 CLOCKID_MAP("tai", CLOCK_TAI),
1859 /* available for the lazy */
1860 CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1861 CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1862 CLOCKID_MAP("real", CLOCK_REALTIME),
1863 CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1868 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1870 struct timespec res;
1873 if (!clock_getres(clk_id, &res))
1874 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1876 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1881 static int parse_clockid(const struct option *opt, const char *str, int unset)
1883 struct record_opts *opts = (struct record_opts *)opt->value;
1884 const struct clockid_map *cm;
1885 const char *ostr = str;
1888 opts->use_clockid = 0;
1896 /* no setting it twice */
1897 if (opts->use_clockid)
1900 opts->use_clockid = true;
1902 /* if its a number, we're done */
1903 if (sscanf(str, "%d", &opts->clockid) == 1)
1904 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1906 /* allow a "CLOCK_" prefix to the name */
1907 if (!strncasecmp(str, "CLOCK_", 6))
1910 for (cm = clockids; cm->name; cm++) {
1911 if (!strcasecmp(str, cm->name)) {
1912 opts->clockid = cm->clockid;
1913 return get_clockid_res(opts->clockid,
1914 &opts->clockid_res_ns);
1918 opts->use_clockid = false;
1919 ui__warning("unknown clockid %s, check man page\n", ostr);
1923 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1925 struct record_opts *opts = (struct record_opts *)opt->value;
1930 if (!strcasecmp(str, "node"))
1931 opts->affinity = PERF_AFFINITY_NODE;
1932 else if (!strcasecmp(str, "cpu"))
1933 opts->affinity = PERF_AFFINITY_CPU;
1938 static int record__parse_mmap_pages(const struct option *opt,
1940 int unset __maybe_unused)
1942 struct record_opts *opts = opt->value;
1944 unsigned int mmap_pages;
1959 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1962 opts->mmap_pages = mmap_pages;
1970 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1974 opts->auxtrace_mmap_pages = mmap_pages;
1981 static void switch_output_size_warn(struct record *rec)
1983 u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1984 struct switch_output *s = &rec->switch_output;
1988 if (s->size < wakeup_size) {
1991 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1992 pr_warning("WARNING: switch-output data size lower than "
1993 "wakeup kernel buffer size (%s) "
1994 "expect bigger perf.data sizes\n", buf);
1998 static int switch_output_setup(struct record *rec)
2000 struct switch_output *s = &rec->switch_output;
2001 static struct parse_tag tags_size[] = {
2002 { .tag = 'B', .mult = 1 },
2003 { .tag = 'K', .mult = 1 << 10 },
2004 { .tag = 'M', .mult = 1 << 20 },
2005 { .tag = 'G', .mult = 1 << 30 },
2008 static struct parse_tag tags_time[] = {
2009 { .tag = 's', .mult = 1 },
2010 { .tag = 'm', .mult = 60 },
2011 { .tag = 'h', .mult = 60*60 },
2012 { .tag = 'd', .mult = 60*60*24 },
2020 if (!strcmp(s->str, "signal")) {
2022 pr_debug("switch-output with SIGUSR2 signal\n");
2026 val = parse_tag_value(s->str, tags_size);
2027 if (val != (unsigned long) -1) {
2029 pr_debug("switch-output with %s size threshold\n", s->str);
2033 val = parse_tag_value(s->str, tags_time);
2034 if (val != (unsigned long) -1) {
2036 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2044 rec->timestamp_filename = true;
2047 if (s->size && !rec->opts.no_buffering)
2048 switch_output_size_warn(rec);
/*
 * Usage strings for 'perf record'. The array must stay NULL-terminated:
 * parse_options()/usage_with_options() iterate it until a NULL entry,
 * which is the defect fixed here (the sentinel was missing).
 */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
2061 * XXX Ideally would be local to cmd_record() and passed to a record__new
2062 * because we need to have access to it in record__exit, that is called
2063 * after cmd_record() exits, but since record_options need to be accessible to
2064 * builtin-script, leave it here.
 * At least we don't touch it in all the other functions here directly.
2068 * Just say no to tons of global variables, sigh.
/*
 * Global 'record' session object with the default option values
 * (see the XXX comment above for why it is a global).
 *
 * NOTE(review): the wrapper lines of the nested initializers (.opts,
 * .opts.target, .tool) and the closing braces are not visible in this
 * excerpt; the fields below are listed in their original order.
 */
static struct record record = {
		/* UINT_MAX/ULLONG_MAX act as "not set by the user" sentinels. */
		.sample_time	     = true,
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
			.default_per_cpu = true,
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		/* perf_tool callbacks used when post-processing recorded events: */
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		/* Sort events by timestamp before delivering them. */
		.ordered_events	= true,
/* Help text for --call-graph; CALLCHAIN_RECORD_HELP lists the record modes. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by --dry-run: parse options, then exit without recording. */
static bool dry_run;
2101 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2102 * with it and switch to use the library functions in perf_evlist that came
2103 * from builtin-record.c, i.e. use record_opts,
2104 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command line option table for 'perf record'; exported through the
 * 'record_options' pointer below so builtin-script can reuse it.
 *
 * NOTE(review): some continuation/closing lines of this excerpt are not
 * visible (several OPT_CALLBACK* entries are missing their callback
 * argument line, and the OPT_END() terminator is not shown); the
 * comments below only group what is visible.
 */
static struct option __record_options[] = {
	/* Event selection and filtering: */
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
	/* Target selection (pid/tid/cpu/system-wide): */
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		   "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		   "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		   "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
	/* Sampling frequency/period: */
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		     record__parse_freq),
	/* Buffering / mmap sizing: */
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	/* Callchain collection: */
	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	/* Per-sample payload selection: */
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
	/* Build-id handling: */
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
	/* Branch stack sampling: */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
			   "branch any", "sample any taken branches",
			   parse_branch_stack),
	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	/* Machine register sampling: */
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
		     "clockid", "clockid to use for events, see clock_gettime()",
	/* AUX area tracing and misc: */
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
		     "per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	/* BPF scriptlet compilation: */
	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
		   "clang binary to use for compiling BPF scriptlets"),
	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
		   "options passed to clang when compiling BPF scriptlets"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	/* Output file switching (--switch-output): */
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		    "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),

/* Shared with builtin-script, hence not static. */
struct option *record_options = __record_options;
/*
 * Entry point for 'perf record': parse options, validate the target,
 * set up the evlist/auxtrace/build-id machinery and hand off to
 * __cmd_record().
 *
 * NOTE(review): various closing braces, '#endif's, error returns and
 * the 'int err' declaration are not visible in this excerpt; the
 * comments below only describe the statements shown.
 */
int cmd_record(int argc, const char **argv)
	struct record *rec = &record;
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

	/* Grey out options whose build-time prerequisites are missing: */
#ifndef HAVE_LIBBPF_SUPPORT
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
	set_nobuild('\0', "clang-path", true);
	set_nobuild('\0', "clang-opt", true);

#ifndef HAVE_BPF_PROLOGUE
# if !defined (HAVE_DWARF_SUPPORT)
#  define REASON  "NO_DWARF=1"
# elif !defined (HAVE_LIBBPF_SUPPORT)
#  define REASON  "NO_LIBBPF=1"
#  define REASON  "this architecture doesn't support BPF prologue"
# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
	set_nobuild('\0', "vmlinux", true);

	CPU_ZERO(&rec->affinity_mask);
	rec->opts.affinity = PERF_AFFINITY_SYS;

	rec->evlist = evlist__new();
	if (rec->evlist == NULL)

	err = perf_config(perf_record_config, rec);

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	perf_quiet_option();

	/* Make system wide (-a) the default target. */
	if (!argc && target__none(&rec->opts.target))
		rec->opts.target.system_wide = true;

	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");

	/* Compressed data is not post-processed for build-ids. */
	if (rec->opts.comp_level != 0) {
		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
		rec->no_buildid = true;

	if (rec->opts.record_switch_events &&
	    !perf_can_record_switch_events()) {
		ui__error("kernel does not support recording context switch events\n");
		parse_options_usage(record_usage, record_options, "switch-events", 0);

	if (switch_output_setup(rec)) {
		parse_options_usage(record_usage, record_options, "switch-output", 0);

	/* Time-based output switching is driven by SIGALRM. */
	if (rec->switch_output.time) {
		signal(SIGALRM, alarm_sig_handler);
		alarm(rec->switch_output.time);

	if (rec->switch_output.num_files) {
		/*
		 * NOTE(review): calloc() arguments are (size, nmemb) here,
		 * i.e. swapped vs. the conventional (nmemb, size); the
		 * allocated total is the same — consider swapping back.
		 */
		rec->switch_output.filenames = calloc(sizeof(char *),
						      rec->switch_output.num_files);
		if (!rec->switch_output.filenames)

	/*
	 * Allow aliases to facilitate the lookup of symbols for address
	 * filters. Refer to auxtrace_parse_filters().
	 */
	symbol_conf.allow_aliases = true;

	err = record__auxtrace_init(rec);

	err = bpf__setup_stdout(rec->evlist);
		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n",

	/* Decide whether build-id caching stays enabled for this run: */
	if (rec->no_buildid_cache || rec->no_buildid) {
		disable_buildid_cache();
	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildid if they are required
		 * explicitly using
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * Following code equals to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
		bool disable = true;

		if (rec->no_buildid_set && !rec->no_buildid)
		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
			rec->no_buildid = true;
			rec->no_buildid_cache = true;
			disable_buildid_cache();

	/* Overwrite mode only keeps the tail, so synthesize metadata there. */
	if (record.opts.overwrite)
		record.opts.tail_synthesize = true;

	/* Fall back to the default event when none was specified. */
	if (rec->evlist->core.nr_entries == 0 &&
	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
		pr_err("Not enough memory for event selector list\n");

	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
		rec->opts.no_inherit = true;

	err = target__validate(&rec->opts.target);
		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s\n", errbuf);

	err = target__parse_uid(&rec->opts.target);
		int saved_errno = errno;

		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

	/* Enable ignoring missing threads when -u/-p option is defined. */
	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;

	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);

	/*
	 * We take all buildids when the file contains
	 * AUX area tracing data because we do not decode the
	 * trace because it would take too long.
	 */
	if (rec->opts.full_auxtrace)
		rec->buildid_all = true;

	if (record_opts__config(&rec->opts)) {

	/* Clamp and report the effective tunables: */
	if (rec->opts.nr_cblocks > nr_cblocks_max)
		rec->opts.nr_cblocks = nr_cblocks_max;
	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

	if (rec->opts.comp_level > comp_level_max)
		rec->opts.comp_level = comp_level_max;
	pr_debug("comp level: %d\n", rec->opts.comp_level);

	/* Do the actual recording, then tear everything down. */
	err = __cmd_record(&record, argc, argv);

	evlist__delete(rec->evlist);

	auxtrace_record__free(rec->itr);
2481 static void snapshot_sig_handler(int sig __maybe_unused)
2483 struct record *rec = &record;
2485 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2486 trigger_hit(&auxtrace_snapshot_trigger);
2487 auxtrace_record__snapshot_started = 1;
2488 if (auxtrace_record__snapshot_start(record.itr))
2489 trigger_error(&auxtrace_snapshot_trigger);
2492 if (switch_output_signal(rec))
2493 trigger_hit(&switch_output_trigger);
2496 static void alarm_sig_handler(int sig __maybe_unused)
2498 struct record *rec = &record;
2500 if (switch_output_time(rec))
2501 trigger_hit(&switch_output_trigger);