perf record: Implement COMPRESSED event record and its attributes
tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "perf.h"
12
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/session.h"
27 #include "util/tool.h"
28 #include "util/symbol.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
45 #include "asm/bug.h"
46
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <locale.h>
50 #include <poll.h>
51 #include <unistd.h>
52 #include <sched.h>
53 #include <signal.h>
54 #include <sys/mman.h>
55 #include <sys/wait.h>
56 #include <linux/time64.h>
57
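/*
 * State for --switch-output: rotate the output file on SIGUSR2 or when the
 * size/time threshold given on the command line is reached.
 */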
58 struct switch_output {
59         bool             enabled;
60         bool             signal;
61         unsigned long    size;
62         unsigned long    time;
63         const char      *str;
64         bool             set;
65         char             **filenames;
66         int              num_files;
67         int              cur_file;
68 };
69
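/* Per-session state of 'perf record', embedding the tool callbacks and options. */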
70 struct record {
71         struct perf_tool        tool;
72         struct record_opts      opts;
73         u64                     bytes_written;
74         struct perf_data        data;
75         struct auxtrace_record  *itr;
76         struct perf_evlist      *evlist;
77         struct perf_session     *session;
78         int                     realtime_prio;
79         bool                    no_buildid;
80         bool                    no_buildid_set;
81         bool                    no_buildid_cache;
82         bool                    no_buildid_cache_set;
83         bool                    buildid_all;
84         bool                    timestamp_filename;
85         bool                    timestamp_boundary;
86         struct switch_output    switch_output;
87         unsigned long long      samples;
88         cpu_set_t               affinity_mask;
89 };
90
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96         "SYS", "NODE", "CPU"
97 };
98
99 static bool switch_output_signal(struct record *rec)
100 {
101         return rec->switch_output.signal &&
102                trigger_is_ready(&switch_output_trigger);
103 }
104
105 static bool switch_output_size(struct record *rec)
106 {
107         return rec->switch_output.size &&
108                trigger_is_ready(&switch_output_trigger) &&
109                (rec->bytes_written >= rec->switch_output.size);
110 }
111
112 static bool switch_output_time(struct record *rec)
113 {
114         return rec->switch_output.time &&
115                trigger_is_ready(&switch_output_trigger);
116 }
117
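/*
 * Serial (non-AIO) write path: append a chunk to the perf.data file and
 * account the bytes so a --switch-output size threshold can trigger rotation.
 */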
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119                          void *bf, size_t size)
120 {
121         struct perf_data_file *file = &rec->session->data->file;
122
123         if (perf_data_file__write(file, bf, size) < 0) {
124                 pr_err("failed to write perf data, error: %m\n");
125                 return -1;
126         }
127
128         rec->bytes_written += size;
129
130         if (switch_output_size(rec))
131                 trigger_hit(&switch_output_trigger);
132
133         return 0;
134 }
135
136 #ifdef HAVE_AIO_SUPPORT
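/*
 * Queue one POSIX AIO write for a data chunk, retrying aio_write() while it
 * fails with EAGAIN and marking the control block free on any other error.
 */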
137 static int record__aio_write(struct aiocb *cblock, int trace_fd,
138                 void *buf, size_t size, off_t off)
139 {
140         int rc;
141
142         cblock->aio_fildes = trace_fd;
143         cblock->aio_buf    = buf;
144         cblock->aio_nbytes = size;
145         cblock->aio_offset = off;
146         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
147
148         do {
149                 rc = aio_write(cblock);
150                 if (rc == 0) {
151                         break;
152                 } else if (errno != EAGAIN) {
153                         cblock->aio_fildes = -1;
154                         pr_err("failed to queue perf data, error: %m\n");
155                         break;
156                 }
157         } while (1);
158
159         return rc;
160 }
161
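/*
 * Check whether a queued AIO write has finished: returns 1 when the control
 * block is free again, 0 while it is still in flight; a partial write is
 * re-queued with the remaining bytes.
 */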
162 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
163 {
164         void *rem_buf;
165         off_t rem_off;
166         size_t rem_size;
167         int rc, aio_errno;
168         ssize_t aio_ret, written;
169
170         aio_errno = aio_error(cblock);
171         if (aio_errno == EINPROGRESS)
172                 return 0;
173
174         written = aio_ret = aio_return(cblock);
175         if (aio_ret < 0) {
176                 if (aio_errno != EINTR)
177                         pr_err("failed to write perf data, error: %m\n");
178                 written = 0;
179         }
180
181         rem_size = cblock->aio_nbytes - written;
182
183         if (rem_size == 0) {
184                 cblock->aio_fildes = -1;
185                 /*
186                  * md->refcount is incremented in perf_mmap__push() for
187                  * every enqueued aio write request, so decrement it now
188                  * that the request is complete.
189                  */
190                 perf_mmap__put(md);
191                 rc = 1;
192         } else {
193                 /*
194                  * An aio write request may need to be restarted with the
195                  * remainder if the kernel didn't write the whole
196                  * chunk at once.
197                  */
198                 rem_off = cblock->aio_offset + written;
199                 rem_buf = (void *)(cblock->aio_buf + written);
200                 record__aio_write(cblock, cblock->aio_fildes,
201                                 rem_buf, rem_size, rem_off);
202                 rc = 0;
203         }
204
205         return rc;
206 }
207
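/*
 * Wait for AIO control blocks to become reusable.  With sync_all == false the
 * index of the first free block is returned; with sync_all == true all
 * outstanding writes are waited for and -1 is returned.
 */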
208 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
209 {
210         struct aiocb **aiocb = md->aio.aiocb;
211         struct aiocb *cblocks = md->aio.cblocks;
212         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
213         int i, do_suspend;
214
215         do {
216                 do_suspend = 0;
217                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
218                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
219                                 if (sync_all)
220                                         aiocb[i] = NULL;
221                                 else
222                                         return i;
223                         } else {
224                                 /*
225                                  * The started aio write is not complete yet,
226                                  * so it has to be waited on before the
227                                  * next allocation.
228                                  */
229                                 aiocb[i] = &cblocks[i];
230                                 do_suspend = 1;
231                         }
232                 }
233                 if (!do_suspend)
234                         return -1;
235
236                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
237                         if (!(errno == EAGAIN || errno == EINTR))
238                                 pr_err("failed to sync perf data, error: %m\n");
239                 }
240         } while (1);
241 }
242
243 static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size, off_t off)
244 {
245         struct record *rec = to;
246         int ret, trace_fd = rec->session->data->file.fd;
247
248         rec->samples++;
249
250         ret = record__aio_write(cblock, trace_fd, bf, size, off);
251         if (!ret) {
252                 rec->bytes_written += size;
253                 if (switch_output_size(rec))
254                         trigger_hit(&switch_output_trigger);
255         }
256
257         return ret;
258 }
259
260 static off_t record__aio_get_pos(int trace_fd)
261 {
262         return lseek(trace_fd, 0, SEEK_CUR);
263 }
264
265 static void record__aio_set_pos(int trace_fd, off_t pos)
266 {
267         lseek(trace_fd, pos, SEEK_SET);
268 }
269
270 static void record__aio_mmap_read_sync(struct record *rec)
271 {
272         int i;
273         struct perf_evlist *evlist = rec->evlist;
274         struct perf_mmap *maps = evlist->mmap;
275
276         if (!rec->opts.nr_cblocks)
277                 return;
278
279         for (i = 0; i < evlist->nr_mmaps; i++) {
280                 struct perf_mmap *map = &maps[i];
281
282                 if (map->base)
283                         record__aio_sync(map, true);
284         }
285 }
286
287 static int nr_cblocks_default = 1;
288 static int nr_cblocks_max = 4;
289
290 static int record__aio_parse(const struct option *opt,
291                              const char *str,
292                              int unset)
293 {
294         struct record_opts *opts = (struct record_opts *)opt->value;
295
296         if (unset) {
297                 opts->nr_cblocks = 0;
298         } else {
299                 if (str)
300                         opts->nr_cblocks = strtol(str, NULL, 0);
301                 if (!opts->nr_cblocks)
302                         opts->nr_cblocks = nr_cblocks_default;
303         }
304
305         return 0;
306 }
307 #else /* HAVE_AIO_SUPPORT */
308 static int nr_cblocks_max = 0;
309
310 static int record__aio_sync(struct perf_mmap *md __maybe_unused, bool sync_all __maybe_unused)
311 {
312         return -1;
313 }
314
315 static int record__aio_pushfn(void *to __maybe_unused, struct aiocb *cblock __maybe_unused,
316                 void *bf __maybe_unused, size_t size __maybe_unused, off_t off __maybe_unused)
317 {
318         return -1;
319 }
320
321 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
322 {
323         return -1;
324 }
325
326 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
327 {
328 }
329
330 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
331 {
332 }
333 #endif
334
335 static int record__aio_enabled(struct record *rec)
336 {
337         return rec->opts.nr_cblocks > 0;
338 }
339
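/*
 * Parse the minimal accumulated data size before flushing a ring buffer
 * (--mmap-flush): a plain number or a B/K/M/G suffixed size, capped at a
 * quarter of the mmap buffer size.
 */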
340 #define MMAP_FLUSH_DEFAULT 1
341 static int record__mmap_flush_parse(const struct option *opt,
342                                     const char *str,
343                                     int unset)
344 {
345         int flush_max;
346         struct record_opts *opts = (struct record_opts *)opt->value;
347         static struct parse_tag tags[] = {
348                         { .tag  = 'B', .mult = 1       },
349                         { .tag  = 'K', .mult = 1 << 10 },
350                         { .tag  = 'M', .mult = 1 << 20 },
351                         { .tag  = 'G', .mult = 1 << 30 },
352                         { .tag  = 0 },
353         };
354
355         if (unset)
356                 return 0;
357
358         if (str) {
359                 opts->mmap_flush = parse_tag_value(str, tags);
360                 if (opts->mmap_flush == (int)-1)
361                         opts->mmap_flush = strtol(str, NULL, 0);
362         }
363
364         if (!opts->mmap_flush)
365                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
366
367         flush_max = perf_evlist__mmap_size(opts->mmap_pages);
368         flush_max /= 4;
369         if (opts->mmap_flush > flush_max)
370                 opts->mmap_flush = flush_max;
371
372         return 0;
373 }
374
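/* Trace compression is considered enabled when the requested comp_level is greater than zero. */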
375 static int record__comp_enabled(struct record *rec)
376 {
377         return rec->opts.comp_level > 0;
378 }
379
380 static int process_synthesized_event(struct perf_tool *tool,
381                                      union perf_event *event,
382                                      struct perf_sample *sample __maybe_unused,
383                                      struct machine *machine __maybe_unused)
384 {
385         struct record *rec = container_of(tool, struct record, tool);
386         return record__write(rec, NULL, event, event->header.size);
387 }
388
389 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
390 {
391         struct record *rec = to;
392
393         rec->samples++;
394         return record__write(rec, map, bf, size);
395 }
396
397 static volatile int done;
398 static volatile int signr = -1;
399 static volatile int child_finished;
400
401 static void sig_handler(int sig)
402 {
403         if (sig == SIGCHLD)
404                 child_finished = 1;
405         else
406                 signr = sig;
407
408         done = 1;
409 }
410
411 static void sigsegv_handler(int sig)
412 {
413         perf_hooks__recover();
414         sighandler_dump_stack(sig);
415 }
416
417 static void record__sig_exit(void)
418 {
419         if (signr == -1)
420                 return;
421
422         signal(signr, SIG_DFL);
423         raise(signr);
424 }
425
426 #ifdef HAVE_AUXTRACE_SUPPORT
427
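/*
 * Write out one AUX area trace chunk: the PERF_RECORD_AUXTRACE event header,
 * up to two data fragments from the ring buffer, and zero padding to an
 * 8-byte boundary.  For regular (non-pipe, non-directory) output the file
 * offset is also added to the auxtrace index.
 */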
428 static int record__process_auxtrace(struct perf_tool *tool,
429                                     struct perf_mmap *map,
430                                     union perf_event *event, void *data1,
431                                     size_t len1, void *data2, size_t len2)
432 {
433         struct record *rec = container_of(tool, struct record, tool);
434         struct perf_data *data = &rec->data;
435         size_t padding;
436         u8 pad[8] = {0};
437
438         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
439                 off_t file_offset;
440                 int fd = perf_data__fd(data);
441                 int err;
442
443                 file_offset = lseek(fd, 0, SEEK_CUR);
444                 if (file_offset == -1)
445                         return -1;
446                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
447                                                      event, file_offset);
448                 if (err)
449                         return err;
450         }
451
452         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
453         padding = (len1 + len2) & 7;
454         if (padding)
455                 padding = 8 - padding;
456
457         record__write(rec, map, event, event->header.size);
458         record__write(rec, map, data1, len1);
459         if (len2)
460                 record__write(rec, map, data2, len2);
461         record__write(rec, map, &pad, padding);
462
463         return 0;
464 }
465
466 static int record__auxtrace_mmap_read(struct record *rec,
467                                       struct perf_mmap *map)
468 {
469         int ret;
470
471         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
472                                   record__process_auxtrace);
473         if (ret < 0)
474                 return ret;
475
476         if (ret)
477                 rec->samples++;
478
479         return 0;
480 }
481
482 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
483                                                struct perf_mmap *map)
484 {
485         int ret;
486
487         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
488                                            record__process_auxtrace,
489                                            rec->opts.auxtrace_snapshot_size);
490         if (ret < 0)
491                 return ret;
492
493         if (ret)
494                 rec->samples++;
495
496         return 0;
497 }
498
499 static int record__auxtrace_read_snapshot_all(struct record *rec)
500 {
501         int i;
502         int rc = 0;
503
504         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
505                 struct perf_mmap *map = &rec->evlist->mmap[i];
506
507                 if (!map->auxtrace_mmap.base)
508                         continue;
509
510                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
511                         rc = -1;
512                         goto out;
513                 }
514         }
515 out:
516         return rc;
517 }
518
519 static void record__read_auxtrace_snapshot(struct record *rec)
520 {
521         pr_debug("Recording AUX area tracing snapshot\n");
522         if (record__auxtrace_read_snapshot_all(rec) < 0) {
523                 trigger_error(&auxtrace_snapshot_trigger);
524         } else {
525                 if (auxtrace_record__snapshot_finish(rec->itr))
526                         trigger_error(&auxtrace_snapshot_trigger);
527                 else
528                         trigger_ready(&auxtrace_snapshot_trigger);
529         }
530 }
531
532 static int record__auxtrace_init(struct record *rec)
533 {
534         int err;
535
536         if (!rec->itr) {
537                 rec->itr = auxtrace_record__init(rec->evlist, &err);
538                 if (err)
539                         return err;
540         }
541
542         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
543                                               rec->opts.auxtrace_snapshot_opts);
544         if (err)
545                 return err;
546
547         return auxtrace_parse_filters(rec->evlist);
548 }
549
550 #else
551
552 static inline
553 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
554                                struct perf_mmap *map __maybe_unused)
555 {
556         return 0;
557 }
558
559 static inline
560 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
561 {
562 }
563
564 static inline
565 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
566 {
567         return 0;
568 }
569
570 static int record__auxtrace_init(struct record *rec __maybe_unused)
571 {
572         return 0;
573 }
574
575 #endif
576
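/*
 * mmap the event (and AUX area) ring buffers for the whole evlist, honouring
 * the AIO, affinity and flush settings requested on the command line.
 */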
577 static int record__mmap_evlist(struct record *rec,
578                                struct perf_evlist *evlist)
579 {
580         struct record_opts *opts = &rec->opts;
581         char msg[512];
582
583         if (opts->affinity != PERF_AFFINITY_SYS)
584                 cpu__setup_cpunode_map();
585
586         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
587                                  opts->auxtrace_mmap_pages,
588                                  opts->auxtrace_snapshot_mode,
589                                  opts->nr_cblocks, opts->affinity,
590                                  opts->mmap_flush) < 0) {
591                 if (errno == EPERM) {
592                         pr_err("Permission error mapping pages.\n"
593                                "Consider increasing "
594                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
595                                "or try again with a smaller value of -m/--mmap_pages.\n"
596                                "(current value: %u,%u)\n",
597                                opts->mmap_pages, opts->auxtrace_mmap_pages);
598                         return -errno;
599                 } else {
600                         pr_err("failed to mmap with %d (%s)\n", errno,
601                                 str_error_r(errno, msg, sizeof(msg)));
602                         if (errno)
603                                 return -errno;
604                         else
605                                 return -EINVAL;
606                 }
607         }
608         return 0;
609 }
610
611 static int record__mmap(struct record *rec)
612 {
613         return record__mmap_evlist(rec, rec->evlist);
614 }
615
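/*
 * Open all requested events, falling back to alternatives or breaking up
 * weak groups when the exact configuration is rejected, then apply event
 * filters and mmap the ring buffers.
 */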
616 static int record__open(struct record *rec)
617 {
618         char msg[BUFSIZ];
619         struct perf_evsel *pos;
620         struct perf_evlist *evlist = rec->evlist;
621         struct perf_session *session = rec->session;
622         struct record_opts *opts = &rec->opts;
623         int rc = 0;
624
625         /*
626          * For initial_delay we need to add a dummy event so that we can track
627          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
628          * real events, the ones asked by the user.
629          */
630         if (opts->initial_delay) {
631                 if (perf_evlist__add_dummy(evlist))
632                         return -ENOMEM;
633
634                 pos = perf_evlist__first(evlist);
635                 pos->tracking = 0;
636                 pos = perf_evlist__last(evlist);
637                 pos->tracking = 1;
638                 pos->attr.enable_on_exec = 1;
639         }
640
641         perf_evlist__config(evlist, opts, &callchain_param);
642
643         evlist__for_each_entry(evlist, pos) {
644 try_again:
645                 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
646                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
647                                 if (verbose > 0)
648                                         ui__warning("%s\n", msg);
649                                 goto try_again;
650                         }
651                         if ((errno == EINVAL || errno == EBADF) &&
652                             pos->leader != pos &&
653                             pos->weak_group) {
654                                 pos = perf_evlist__reset_weak_group(evlist, pos);
655                                 goto try_again;
656                         }
657                         rc = -errno;
658                         perf_evsel__open_strerror(pos, &opts->target,
659                                                   errno, msg, sizeof(msg));
660                         ui__error("%s\n", msg);
661                         goto out;
662                 }
663
664                 pos->supported = true;
665         }
666
667         if (perf_evlist__apply_filters(evlist, &pos)) {
668                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
669                         pos->filter, perf_evsel__name(pos), errno,
670                         str_error_r(errno, msg, sizeof(msg)));
671                 rc = -1;
672                 goto out;
673         }
674
675         rc = record__mmap(rec);
676         if (rc)
677                 goto out;
678
679         session->evlist = evlist;
680         perf_session__set_id_hdr_size(session);
681 out:
682         return rc;
683 }
684
685 static int process_sample_event(struct perf_tool *tool,
686                                 union perf_event *event,
687                                 struct perf_sample *sample,
688                                 struct perf_evsel *evsel,
689                                 struct machine *machine)
690 {
691         struct record *rec = container_of(tool, struct record, tool);
692
693         if (rec->evlist->first_sample_time == 0)
694                 rec->evlist->first_sample_time = sample->time;
695
696         rec->evlist->last_sample_time = sample->time;
697
698         if (rec->buildid_all)
699                 return 0;
700
701         rec->samples++;
702         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
703 }
704
705 static int process_buildids(struct record *rec)
706 {
707         struct perf_session *session = rec->session;
708
709         if (perf_data__size(&rec->data) == 0)
710                 return 0;
711
712         /*
713          * During this process, it'll load the kernel map and replace
714          * dso->long_name with the real pathname it found.  In this case
715          * we prefer the vmlinux path like
716          *   /lib/modules/3.16.4/build/vmlinux
717          *
718          * rather than build-id path (in debug directory).
719          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
720          */
721         symbol_conf.ignore_vmlinux_buildid = true;
722
723         /*
724          * If --buildid-all is given, it marks all DSOs regardless of hits,
725          * so no need to process samples. But if timestamp_boundary is enabled,
726          * it still needs to walk on all samples to get the timestamps of
727          * first/last samples.
728          */
729         if (rec->buildid_all && !rec->timestamp_boundary)
730                 rec->tool.sample = NULL;
731
732         return perf_session__process_events(session);
733 }
734
735 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
736 {
737         int err;
738         struct perf_tool *tool = data;
739         /*
740          * As for the guest kernel, when processing the record & report
741          * subcommands we arrange the module mmaps prior to the guest kernel
742          * mmap and trigger a DSO preload, because by default guest module
743          * symbols are loaded from guest kallsyms instead of
744          * /lib/modules/XXX/XXX. This avoids missing symbols when the first
745          * address falls in a module instead of in the guest kernel.
746          */
747         err = perf_event__synthesize_modules(tool, process_synthesized_event,
748                                              machine);
749         if (err < 0)
750                 pr_err("Couldn't record guest kernel [%d]'s reference"
751                        " relocation symbol.\n", machine->pid);
752
753         /*
754          * We use _stext for the guest kernel because the guest kernel's
755          * /proc/kallsyms sometimes has no _text.
756          */
757         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
758                                                  machine);
759         if (err < 0)
760                 pr_err("Couldn't record guest kernel [%d]'s reference"
761                        " relocation symbol.\n", machine->pid);
762 }
763
764 static struct perf_event_header finished_round_event = {
765         .size = sizeof(struct perf_event_header),
766         .type = PERF_RECORD_FINISHED_ROUND,
767 };
768
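/*
 * In the non-default affinity modes, migrate the reading thread to the
 * affinity mask of the mmap'ed buffer it is about to drain, keeping buffer
 * accesses local.
 */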
769 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
770 {
771         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
772             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
773                 CPU_ZERO(&rec->affinity_mask);
774                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
775                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
776         }
777 }
778
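/*
 * Drain every ring buffer of the evlist once, via either the serial or the
 * AIO write path, and emit a PERF_RECORD_FINISHED_ROUND event if anything
 * was written.
 */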
779 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
780                                     bool overwrite, bool synch)
781 {
782         u64 bytes_written = rec->bytes_written;
783         int i;
784         int rc = 0;
785         struct perf_mmap *maps;
786         int trace_fd = rec->data.file.fd;
787         off_t off;
788
789         if (!evlist)
790                 return 0;
791
792         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
793         if (!maps)
794                 return 0;
795
796         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
797                 return 0;
798
799         if (record__aio_enabled(rec))
800                 off = record__aio_get_pos(trace_fd);
801
802         for (i = 0; i < evlist->nr_mmaps; i++) {
803                 u64 flush = 0;
804                 struct perf_mmap *map = &maps[i];
805
806                 if (map->base) {
807                         record__adjust_affinity(rec, map);
808                         if (synch) {
809                                 flush = map->flush;
810                                 map->flush = 1;
811                         }
812                         if (!record__aio_enabled(rec)) {
813                                 if (perf_mmap__push(map, rec, record__pushfn) != 0) {
814                                         if (synch)
815                                                 map->flush = flush;
816                                         rc = -1;
817                                         goto out;
818                                 }
819                         } else {
820                                 int idx;
821                                 /*
822                                  * Call record__aio_sync() to wait until a map->data buffer
823                                  * becomes available after the previous aio write request.
824                                  */
825                                 idx = record__aio_sync(map, false);
826                                 if (perf_mmap__aio_push(map, rec, idx, record__aio_pushfn, &off) != 0) {
827                                         record__aio_set_pos(trace_fd, off);
828                                         if (synch)
829                                                 map->flush = flush;
830                                         rc = -1;
831                                         goto out;
832                                 }
833                         }
834                         if (synch)
835                                 map->flush = flush;
836                 }
837
838                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
839                     record__auxtrace_mmap_read(rec, map) != 0) {
840                         rc = -1;
841                         goto out;
842                 }
843         }
844
845         if (record__aio_enabled(rec))
846                 record__aio_set_pos(trace_fd, off);
847
848         /*
849          * Mark the round finished in case we wrote
850          * at least one event.
851          */
852         if (bytes_written != rec->bytes_written)
853                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
854
855         if (overwrite)
856                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
857 out:
858         return rc;
859 }
860
861 static int record__mmap_read_all(struct record *rec, bool synch)
862 {
863         int err;
864
865         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
866         if (err)
867                 return err;
868
869         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
870 }
871
872 static void record__init_features(struct record *rec)
873 {
874         struct perf_session *session = rec->session;
875         int feat;
876
877         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
878                 perf_header__set_feat(&session->header, feat);
879
880         if (rec->no_buildid)
881                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
882
883         if (!have_tracepoints(&rec->evlist->entries))
884                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
885
886         if (!rec->opts.branch_stack)
887                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
888
889         if (!rec->opts.full_auxtrace)
890                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
891
892         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
893                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
894
895         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
896         if (!record__comp_enabled(rec))
897                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
898
899         perf_header__clear_feat(&session->header, HEADER_STAT);
900 }
901
902 static void
903 record__finish_output(struct record *rec)
904 {
905         struct perf_data *data = &rec->data;
906         int fd = perf_data__fd(data);
907
908         if (data->is_pipe)
909                 return;
910
911         rec->session->header.data_size += rec->bytes_written;
912         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
913
914         if (!rec->no_buildid) {
915                 process_buildids(rec);
916
917                 if (rec->buildid_all)
918                         dsos__hit_all(rec->session);
919         }
920         perf_session__write_header(rec->session, rec->evlist, fd, true);
921
922         return;
923 }
924
925 static int record__synthesize_workload(struct record *rec, bool tail)
926 {
927         int err;
928         struct thread_map *thread_map;
929
930         if (rec->opts.tail_synthesize != tail)
931                 return 0;
932
933         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
934         if (thread_map == NULL)
935                 return -1;
936
937         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
938                                                  process_synthesized_event,
939                                                  &rec->session->machines.host,
940                                                  rec->opts.sample_address);
941         thread_map__put(thread_map);
942         return err;
943 }
944
945 static int record__synthesize(struct record *rec, bool tail);
946
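/*
 * Rotate the output: finish the current perf.data file, open a new
 * timestamped one and, unless we are exiting, re-synthesize the tracking
 * events so the new file is self-contained.
 */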
947 static int
948 record__switch_output(struct record *rec, bool at_exit)
949 {
950         struct perf_data *data = &rec->data;
951         int fd, err;
952         char *new_filename;
953
954         /* Same size as "2015122520103046" */
955         char timestamp[] = "InvalidTimestamp";
956
957         record__aio_mmap_read_sync(rec);
958
959         record__synthesize(rec, true);
960         if (target__none(&rec->opts.target))
961                 record__synthesize_workload(rec, true);
962
963         rec->samples = 0;
964         record__finish_output(rec);
965         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
966         if (err) {
967                 pr_err("Failed to get current timestamp\n");
968                 return -EINVAL;
969         }
970
971         fd = perf_data__switch(data, timestamp,
972                                     rec->session->header.data_offset,
973                                     at_exit, &new_filename);
974         if (fd >= 0 && !at_exit) {
975                 rec->bytes_written = 0;
976                 rec->session->header.data_size = 0;
977         }
978
979         if (!quiet)
980                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
981                         data->path, timestamp);
982
983         if (rec->switch_output.num_files) {
984                 int n = rec->switch_output.cur_file + 1;
985
986                 if (n >= rec->switch_output.num_files)
987                         n = 0;
988                 rec->switch_output.cur_file = n;
989                 if (rec->switch_output.filenames[n]) {
990                         remove(rec->switch_output.filenames[n]);
991                         free(rec->switch_output.filenames[n]);
992                 }
993                 rec->switch_output.filenames[n] = new_filename;
994         } else {
995                 free(new_filename);
996         }
997
998         /* Output tracking events */
999         if (!at_exit) {
1000                 record__synthesize(rec, false);
1001
1002                 /*
1003                  * In 'perf record --switch-output' without -a,
1004                  * record__synthesize() in record__switch_output() won't
1005                  * generate tracking events because there's no thread_map
1006                  * in evlist, so the newly created perf.data file doesn't
1007                  * contain map and comm information.
1008                  * Create a fake thread_map and directly call
1009                  * perf_event__synthesize_thread_map() for those events.
1010                  */
1011                 if (target__none(&rec->opts.target))
1012                         record__synthesize_workload(rec, false);
1013         }
1014         return fd;
1015 }
1016
1017 static volatile int workload_exec_errno;
1018
1019 /*
1020  * perf_evlist__prepare_workload will send a SIGUSR1
1021  * if the fork fails, since we asked for it by setting its
1022  * want_signal to true.
1023  */
1024 static void workload_exec_failed_signal(int signo __maybe_unused,
1025                                         siginfo_t *info,
1026                                         void *ucontext __maybe_unused)
1027 {
1028         workload_exec_errno = info->si_value.sival_int;
1029         done = 1;
1030         child_finished = 1;
1031 }
1032
1033 static void snapshot_sig_handler(int sig);
1034 static void alarm_sig_handler(int sig);
1035
1036 int __weak
1037 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1038                             struct perf_tool *tool __maybe_unused,
1039                             perf_event__handler_t process __maybe_unused,
1040                             struct machine *machine __maybe_unused)
1041 {
1042         return 0;
1043 }
1044
1045 static const struct perf_event_mmap_page *
1046 perf_evlist__pick_pc(struct perf_evlist *evlist)
1047 {
1048         if (evlist) {
1049                 if (evlist->mmap && evlist->mmap[0].base)
1050                         return evlist->mmap[0].base;
1051                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1052                         return evlist->overwrite_mmap[0].base;
1053         }
1054         return NULL;
1055 }
1056
1057 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1058 {
1059         const struct perf_event_mmap_page *pc;
1060
1061         pc = perf_evlist__pick_pc(rec->evlist);
1062         if (pc)
1063                 return pc;
1064         return NULL;
1065 }
1066
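/*
 * Synthesize the metadata events (attrs, features, tracing data, time
 * conversion, kernel/module mmaps, thread and cpu maps, BPF events, existing
 * threads) that consumers need in order to make sense of the samples.
 */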
1067 static int record__synthesize(struct record *rec, bool tail)
1068 {
1069         struct perf_session *session = rec->session;
1070         struct machine *machine = &session->machines.host;
1071         struct perf_data *data = &rec->data;
1072         struct record_opts *opts = &rec->opts;
1073         struct perf_tool *tool = &rec->tool;
1074         int fd = perf_data__fd(data);
1075         int err = 0;
1076
1077         if (rec->opts.tail_synthesize != tail)
1078                 return 0;
1079
1080         if (data->is_pipe) {
1081                 /*
1082                  * We need to synthesize events first, because some
1083                  * features work on top of them (on the report side).
1084                  */
1085                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1086                                                    process_synthesized_event);
1087                 if (err < 0) {
1088                         pr_err("Couldn't synthesize attrs.\n");
1089                         goto out;
1090                 }
1091
1092                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1093                                                       process_synthesized_event);
1094                 if (err < 0) {
1095                         pr_err("Couldn't synthesize features.\n");
1096                         return err;
1097                 }
1098
1099                 if (have_tracepoints(&rec->evlist->entries)) {
1100                         /*
1101                          * FIXME err <= 0 here actually means that
1102                          * there were no tracepoints, so it's not really
1103                          * an error, just that we don't need to
1104                          * synthesize anything.  We really have to
1105                          * return this more properly and also
1106                          * propagate errors that now are calling die()
1107                          */
1108                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1109                                                                   process_synthesized_event);
1110                         if (err <= 0) {
1111                                 pr_err("Couldn't record tracing data.\n");
1112                                 goto out;
1113                         }
1114                         rec->bytes_written += err;
1115                 }
1116         }
1117
1118         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1119                                           process_synthesized_event, machine);
1120         if (err)
1121                 goto out;
1122
1123         if (rec->opts.full_auxtrace) {
1124                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1125                                         session, process_synthesized_event);
1126                 if (err)
1127                         goto out;
1128         }
1129
1130         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1131                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1132                                                          machine);
1133                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1134                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1135                                    "Check /proc/kallsyms permission or run as root.\n");
1136
1137                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1138                                                      machine);
1139                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1140                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1141                                    "Check /proc/modules permission or run as root.\n");
1142         }
1143
1144         if (perf_guest) {
1145                 machines__process_guests(&session->machines,
1146                                          perf_event__synthesize_guest_os, tool);
1147         }
1148
1149         err = perf_event__synthesize_extra_attr(&rec->tool,
1150                                                 rec->evlist,
1151                                                 process_synthesized_event,
1152                                                 data->is_pipe);
1153         if (err)
1154                 goto out;
1155
1156         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1157                                                  process_synthesized_event,
1158                                                 NULL);
1159         if (err < 0) {
1160                 pr_err("Couldn't synthesize thread map.\n");
1161                 return err;
1162         }
1163
1164         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1165                                              process_synthesized_event, NULL);
1166         if (err < 0) {
1167                 pr_err("Couldn't synthesize cpu map.\n");
1168                 return err;
1169         }
1170
1171         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1172                                                 machine, opts);
1173         if (err < 0)
1174                 pr_warning("Couldn't synthesize bpf events.\n");
1175
1176         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1177                                             process_synthesized_event, opts->sample_address,
1178                                             1);
1179 out:
1180         return err;
1181 }
1182
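/*
 * Main record loop: set up the session and signal handlers, start the
 * workload if one was given, then repeatedly drain the ring buffers until
 * done, handling AUX snapshots and --switch-output rotations along the way.
 */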
1183 static int __cmd_record(struct record *rec, int argc, const char **argv)
1184 {
1185         int err;
1186         int status = 0;
1187         unsigned long waking = 0;
1188         const bool forks = argc > 0;
1189         struct perf_tool *tool = &rec->tool;
1190         struct record_opts *opts = &rec->opts;
1191         struct perf_data *data = &rec->data;
1192         struct perf_session *session;
1193         bool disabled = false, draining = false;
1194         struct perf_evlist *sb_evlist = NULL;
1195         int fd;
1196         float ratio = 0;
1197
1198         atexit(record__sig_exit);
1199         signal(SIGCHLD, sig_handler);
1200         signal(SIGINT, sig_handler);
1201         signal(SIGTERM, sig_handler);
1202         signal(SIGSEGV, sigsegv_handler);
1203
1204         if (rec->opts.record_namespaces)
1205                 tool->namespace_events = true;
1206
1207         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1208                 signal(SIGUSR2, snapshot_sig_handler);
1209                 if (rec->opts.auxtrace_snapshot_mode)
1210                         trigger_on(&auxtrace_snapshot_trigger);
1211                 if (rec->switch_output.enabled)
1212                         trigger_on(&switch_output_trigger);
1213         } else {
1214                 signal(SIGUSR2, SIG_IGN);
1215         }
1216
1217         session = perf_session__new(data, false, tool);
1218         if (session == NULL) {
1219                 pr_err("Perf session creation failed.\n");
1220                 return -1;
1221         }
1222
1223         fd = perf_data__fd(data);
1224         rec->session = session;
1225
1226         record__init_features(rec);
1227
1228         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1229                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1230
1231         if (forks) {
1232                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1233                                                     argv, data->is_pipe,
1234                                                     workload_exec_failed_signal);
1235                 if (err < 0) {
1236                         pr_err("Couldn't run the workload!\n");
1237                         status = err;
1238                         goto out_delete_session;
1239                 }
1240         }
1241
1242         /*
1243          * If we have just a single event and are sending data
1244          * through a pipe, we need to force the id allocation,
1245          * because we synthesize the event name through the pipe
1246          * and need the id for that.
1247          */
1248         if (data->is_pipe && rec->evlist->nr_entries == 1)
1249                 rec->opts.sample_id = true;
1250
1251         if (record__open(rec) != 0) {
1252                 err = -1;
1253                 goto out_child;
1254         }
1255         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1256
1257         err = bpf__apply_obj_config();
1258         if (err) {
1259                 char errbuf[BUFSIZ];
1260
1261                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1262                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1263                          errbuf);
1264                 goto out_child;
1265         }
1266
1267         /*
1268          * Normally perf_session__new would do this, but it doesn't have the
1269          * evlist.
1270          */
1271         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1272                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1273                 rec->tool.ordered_events = false;
1274         }
1275
1276         if (!rec->evlist->nr_groups)
1277                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1278
1279         if (data->is_pipe) {
1280                 err = perf_header__write_pipe(fd);
1281                 if (err < 0)
1282                         goto out_child;
1283         } else {
1284                 err = perf_session__write_header(session, rec->evlist, fd, false);
1285                 if (err < 0)
1286                         goto out_child;
1287         }
1288
1289         if (!rec->no_buildid
1290             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1291                 pr_err("Couldn't generate buildids. "
1292                        "Use --no-buildid to profile anyway.\n");
1293                 err = -1;
1294                 goto out_child;
1295         }
1296
1297         if (!opts->no_bpf_event)
1298                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1299
1300         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1301                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1302                 opts->no_bpf_event = true;
1303         }
1304
1305         err = record__synthesize(rec, false);
1306         if (err < 0)
1307                 goto out_child;
1308
1309         if (rec->realtime_prio) {
1310                 struct sched_param param;
1311
1312                 param.sched_priority = rec->realtime_prio;
1313                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1314                         pr_err("Could not set realtime priority.\n");
1315                         err = -1;
1316                         goto out_child;
1317                 }
1318         }
1319
1320         /*
1321          * When perf is starting the traced process, all the events
1322          * (apart from group members) have enable_on_exec=1 set,
1323          * so don't spoil it by prematurely enabling them.
1324          */
1325         if (!target__none(&opts->target) && !opts->initial_delay)
1326                 perf_evlist__enable(rec->evlist);
1327
1328         /*
1329          * Let the child rip
1330          */
1331         if (forks) {
1332                 struct machine *machine = &session->machines.host;
1333                 union perf_event *event;
1334                 pid_t tgid;
1335
1336                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1337                 if (event == NULL) {
1338                         err = -ENOMEM;
1339                         goto out_child;
1340                 }
1341
1342                 /*
1343                  * Some H/W events are generated before the COMM event,
1344                  * which is emitted during exec(), so perf script
1345                  * cannot see a correct process name for those events.
1346                  * Synthesize a COMM event to prevent it.
1347                  */
1348                 tgid = perf_event__synthesize_comm(tool, event,
1349                                                    rec->evlist->workload.pid,
1350                                                    process_synthesized_event,
1351                                                    machine);
1352                 free(event);
1353
1354                 if (tgid == -1)
1355                         goto out_child;
1356
1357                 event = malloc(sizeof(event->namespaces) +
1358                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1359                                machine->id_hdr_size);
1360                 if (event == NULL) {
1361                         err = -ENOMEM;
1362                         goto out_child;
1363                 }
1364
1365                 /*
1366                  * Synthesize NAMESPACES event for the command specified.
1367                  */
1368                 perf_event__synthesize_namespaces(tool, event,
1369                                                   rec->evlist->workload.pid,
1370                                                   tgid, process_synthesized_event,
1371                                                   machine);
1372                 free(event);
1373
1374                 perf_evlist__start_workload(rec->evlist);
1375         }
1376
1377         if (opts->initial_delay) {
1378                 usleep(opts->initial_delay * USEC_PER_MSEC);
1379                 perf_evlist__enable(rec->evlist);
1380         }
1381
1382         trigger_ready(&auxtrace_snapshot_trigger);
1383         trigger_ready(&switch_output_trigger);
1384         perf_hooks__invoke_record_start();
1385         for (;;) {
1386                 unsigned long long hits = rec->samples;
1387
1388                 /*
1389                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1390                  * here: when done == true and hits != rec->samples
1391                  * in the previous round.
1392                  *
1393                  * perf_evlist__toggle_bkw_mmap ensures we never
1394                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1395                  */
1396                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1397                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1398
1399                 if (record__mmap_read_all(rec, false) < 0) {
1400                         trigger_error(&auxtrace_snapshot_trigger);
1401                         trigger_error(&switch_output_trigger);
1402                         err = -1;
1403                         goto out_child;
1404                 }
1405
1406                 if (auxtrace_record__snapshot_started) {
1407                         auxtrace_record__snapshot_started = 0;
1408                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1409                                 record__read_auxtrace_snapshot(rec);
1410                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1411                                 pr_err("AUX area tracing snapshot failed\n");
1412                                 err = -1;
1413                                 goto out_child;
1414                         }
1415                 }
1416
1417                 if (trigger_is_hit(&switch_output_trigger)) {
1418                         /*
1419                          * If switch_output_trigger is hit, the data in the
1420                          * overwritable ring buffer should have been collected,
1421                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1422                          *
1423                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1424                          * record__mmap_read_all() didn't collect data from the
1425                          * overwritable ring buffer. Read again.
1426                          */
1427                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1428                                 continue;
1429                         trigger_ready(&switch_output_trigger);
1430
1431                         /*
1432                          * Re-enable events in the overwrite ring buffer after
1433                          * record__mmap_read_all(): we should have collected
1434                          * data from it.
1435                          */
1436                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1437
1438                         if (!quiet)
1439                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1440                                         waking);
1441                         waking = 0;
1442                         fd = record__switch_output(rec, false);
1443                         if (fd < 0) {
1444                                 pr_err("Failed to switch to new file\n");
1445                                 trigger_error(&switch_output_trigger);
1446                                 err = fd;
1447                                 goto out_child;
1448                         }
1449
1450                         /* re-arm the alarm */
1451                         if (rec->switch_output.time)
1452                                 alarm(rec->switch_output.time);
1453                 }
1454
1455                 if (hits == rec->samples) {
1456                         if (done || draining)
1457                                 break;
1458                         err = perf_evlist__poll(rec->evlist, -1);
1459                         /*
1460                          * Propagate error, only if there's any. Ignore positive
1461                          * number of returned events and interrupt error.
1462                          */
1463                         if (err > 0 || (err < 0 && errno == EINTR))
1464                                 err = 0;
1465                         waking++;
1466
1467                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1468                                 draining = true;
1469                 }
1470
1471                 /*
1472                  * When perf itself started the traced process, the events die
1473                  * with the process at the end and we wait for that. Thus there is
1474                  * no need to disable the events in this case.
1475                  */
1476                 if (done && !disabled && !target__none(&opts->target)) {
1477                         trigger_off(&auxtrace_snapshot_trigger);
1478                         perf_evlist__disable(rec->evlist);
1479                         disabled = true;
1480                 }
1481         }
1482         trigger_off(&auxtrace_snapshot_trigger);
1483         trigger_off(&switch_output_trigger);
1484
1485         if (forks && workload_exec_errno) {
1486                 char msg[STRERR_BUFSIZE];
1487                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1488                 pr_err("Workload failed: %s\n", emsg);
1489                 err = -1;
1490                 goto out_child;
1491         }
1492
1493         if (!quiet)
1494                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1495
1496         if (target__none(&rec->opts.target))
1497                 record__synthesize_workload(rec, true);
1498
1499 out_child:
1500         record__mmap_read_all(rec, true);
1501         record__aio_mmap_read_sync(rec);
1502
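        /*
         * If trace compression was used, store the achieved ratio (bytes
         * transferred over bytes compressed) in the header; the +0.5 below
         * rounds it to the nearest whole number for the comp_ratio field.
         */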
1503         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1504                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1505                 session->header.env.comp_ratio = ratio + 0.5;
1506         }
1507
1508         if (forks) {
1509                 int exit_status;
1510
1511                 if (!child_finished)
1512                         kill(rec->evlist->workload.pid, SIGTERM);
1513
1514                 wait(&exit_status);
1515
1516                 if (err < 0)
1517                         status = err;
1518                 else if (WIFEXITED(exit_status))
1519                         status = WEXITSTATUS(exit_status);
1520                 else if (WIFSIGNALED(exit_status))
1521                         signr = WTERMSIG(exit_status);
1522         } else
1523                 status = err;
1524
1525         record__synthesize(rec, true);
1526         /* this will be recalculated during process_buildids() */
1527         rec->samples = 0;
1528
1529         if (!err) {
1530                 if (!rec->timestamp_filename) {
1531                         record__finish_output(rec);
1532                 } else {
1533                         fd = record__switch_output(rec, true);
1534                         if (fd < 0) {
1535                                 status = fd;
1536                                 goto out_delete_session;
1537                         }
1538                 }
1539         }
1540
1541         perf_hooks__invoke_record_end();
1542
1543         if (!err && !quiet) {
1544                 char samples[128];
1545                 const char *postfix = rec->timestamp_filename ?
1546                                         ".<timestamp>" : "";
1547
1548                 if (rec->samples && !rec->opts.full_auxtrace)
1549                         scnprintf(samples, sizeof(samples),
1550                                   " (%" PRIu64 " samples)", rec->samples);
1551                 else
1552                         samples[0] = '\0';
1553
1554                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1555                         perf_data__size(data) / 1024.0 / 1024.0,
1556                         data->path, postfix, samples);
1557                 if (ratio) {
1558                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1559                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1560                                         ratio);
1561                 }
1562                 fprintf(stderr, " ]\n");
1563         }
1564
1565 out_delete_session:
1566         perf_session__delete(session);
1567
1568         if (!opts->no_bpf_event)
1569                 perf_evlist__stop_sb_thread(sb_evlist);
1570         return status;
1571 }
1572
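/*
 * Dump the selected call-graph record mode, and the DWARF stack dump size
 * when applicable, at debug verbosity.
 */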
1573 static void callchain_debug(struct callchain_param *callchain)
1574 {
1575         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1576
1577         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1578
1579         if (callchain->record_mode == CALLCHAIN_DWARF)
1580                 pr_debug("callchain: stack dump size %d\n",
1581                          callchain->dump_size);
1582 }
1583
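/*
 * Parse the --call-graph argument (e.g. "fp", "dwarf[,size]" or "lbr").
 * Selecting DWARF unwinding also turns on data address sampling.
 */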
1584 int record_opts__parse_callchain(struct record_opts *record,
1585                                  struct callchain_param *callchain,
1586                                  const char *arg, bool unset)
1587 {
1588         int ret;
1589         callchain->enabled = !unset;
1590
1591         /* --no-call-graph */
1592         if (unset) {
1593                 callchain->record_mode = CALLCHAIN_NONE;
1594                 pr_debug("callchain: disabled\n");
1595                 return 0;
1596         }
1597
1598         ret = parse_callchain_record_opt(arg, callchain);
1599         if (!ret) {
1600                 /* Enable data address sampling for DWARF unwind. */
1601                 if (callchain->record_mode == CALLCHAIN_DWARF)
1602                         record->sample_address = true;
1603                 callchain_debug(callchain);
1604         }
1605
1606         return ret;
1607 }
1608
1609 int record_parse_callchain_opt(const struct option *opt,
1610                                const char *arg,
1611                                int unset)
1612 {
1613         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1614 }
1615
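/*
 * Handler for a bare -g: enable call-graph recording and fall back to
 * frame pointer mode when no record mode was selected explicitly.
 */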
1616 int record_callchain_opt(const struct option *opt,
1617                          const char *arg __maybe_unused,
1618                          int unset __maybe_unused)
1619 {
1620         struct callchain_param *callchain = opt->value;
1621
1622         callchain->enabled = true;
1623
1624         if (callchain->record_mode == CALLCHAIN_NONE)
1625                 callchain->record_mode = CALLCHAIN_FP;
1626
1627         callchain_debug(callchain);
1628         return 0;
1629 }
1630
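/*
 * perf_config() callback: apply the record.build-id, record.call-graph and
 * (when AIO support is built in) record.aio settings from the config files.
 */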
1631 static int perf_record_config(const char *var, const char *value, void *cb)
1632 {
1633         struct record *rec = cb;
1634
1635         if (!strcmp(var, "record.build-id")) {
1636                 if (!strcmp(value, "cache"))
1637                         rec->no_buildid_cache = false;
1638                 else if (!strcmp(value, "no-cache"))
1639                         rec->no_buildid_cache = true;
1640                 else if (!strcmp(value, "skip"))
1641                         rec->no_buildid = true;
1642                 else
1643                         return -1;
1644                 return 0;
1645         }
1646         if (!strcmp(var, "record.call-graph")) {
1647                 var = "call-graph.record-mode";
1648                 return perf_default_config(var, value, cb);
1649         }
1650 #ifdef HAVE_AIO_SUPPORT
1651         if (!strcmp(var, "record.aio")) {
1652                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1653                 if (!rec->opts.nr_cblocks)
1654                         rec->opts.nr_cblocks = nr_cblocks_default;
1655         }
1656 #endif
1657
1658         return 0;
1659 }
1660
1661 struct clockid_map {
1662         const char *name;
1663         int clockid;
1664 };
1665
1666 #define CLOCKID_MAP(n, c)       \
1667         { .name = n, .clockid = (c), }
1668
1669 #define CLOCKID_END     { .name = NULL, }
1670
1671
1672 /*
1673  * Define any clock ids missing from older system headers, we need to build on many distros...
1674  */
1675 #ifndef CLOCK_MONOTONIC_RAW
1676 #define CLOCK_MONOTONIC_RAW 4
1677 #endif
1678 #ifndef CLOCK_BOOTTIME
1679 #define CLOCK_BOOTTIME 7
1680 #endif
1681 #ifndef CLOCK_TAI
1682 #define CLOCK_TAI 11
1683 #endif
1684
1685 static const struct clockid_map clockids[] = {
1686         /* available for all events, NMI safe */
1687         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1688         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1689
1690         /* available for some events */
1691         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1692         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1693         CLOCKID_MAP("tai", CLOCK_TAI),
1694
1695         /* available for the lazy */
1696         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1697         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1698         CLOCKID_MAP("real", CLOCK_REALTIME),
1699         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1700
1701         CLOCKID_END,
1702 };
1703
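/*
 * Query the resolution of the given clock with clock_getres(); on failure
 * just warn and leave *res_ns at zero.
 */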
1704 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1705 {
1706         struct timespec res;
1707
1708         *res_ns = 0;
1709         if (!clock_getres(clk_id, &res))
1710                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1711         else
1712                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1713
1714         return 0;
1715 }
1716
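/*
 * Parse -k/--clockid: accept either a raw clockid number or one of the names
 * from the clockids[] table above, with an optional "CLOCK_" prefix.
 */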
1717 static int parse_clockid(const struct option *opt, const char *str, int unset)
1718 {
1719         struct record_opts *opts = (struct record_opts *)opt->value;
1720         const struct clockid_map *cm;
1721         const char *ostr = str;
1722
1723         if (unset) {
1724                 opts->use_clockid = 0;
1725                 return 0;
1726         }
1727
1728         /* no arg passed */
1729         if (!str)
1730                 return 0;
1731
1732         /* no setting it twice */
1733         if (opts->use_clockid)
1734                 return -1;
1735
1736         opts->use_clockid = true;
1737
1738         /* if it's a number, we're done */
1739         if (sscanf(str, "%d", &opts->clockid) == 1)
1740                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1741
1742         /* allow a "CLOCK_" prefix to the name */
1743         if (!strncasecmp(str, "CLOCK_", 6))
1744                 str += 6;
1745
1746         for (cm = clockids; cm->name; cm++) {
1747                 if (!strcasecmp(str, cm->name)) {
1748                         opts->clockid = cm->clockid;
1749                         return get_clockid_res(opts->clockid,
1750                                                &opts->clockid_res_ns);
1751                 }
1752         }
1753
1754         opts->use_clockid = false;
1755         ui__warning("unknown clockid %s, check man page\n", ostr);
1756         return -1;
1757 }
1758
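/*
 * Parse --affinity=node|cpu; anything else keeps the default PERF_AFFINITY_SYS
 * affinity for the trace reading thread.
 */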
1759 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1760 {
1761         struct record_opts *opts = (struct record_opts *)opt->value;
1762
1763         if (unset || !str)
1764                 return 0;
1765
1766         if (!strcasecmp(str, "node"))
1767                 opts->affinity = PERF_AFFINITY_NODE;
1768         else if (!strcasecmp(str, "cpu"))
1769                 opts->affinity = PERF_AFFINITY_CPU;
1770
1771         return 0;
1772 }
1773
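/*
 * Parse -m/--mmap-pages pages[,pages]: the first value sizes the data mmaps,
 * the optional second value sizes the AUX area tracing mmaps.
 */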
1774 static int record__parse_mmap_pages(const struct option *opt,
1775                                     const char *str,
1776                                     int unset __maybe_unused)
1777 {
1778         struct record_opts *opts = opt->value;
1779         char *s, *p;
1780         unsigned int mmap_pages;
1781         int ret;
1782
1783         if (!str)
1784                 return -EINVAL;
1785
1786         s = strdup(str);
1787         if (!s)
1788                 return -ENOMEM;
1789
1790         p = strchr(s, ',');
1791         if (p)
1792                 *p = '\0';
1793
1794         if (*s) {
1795                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1796                 if (ret)
1797                         goto out_free;
1798                 opts->mmap_pages = mmap_pages;
1799         }
1800
1801         if (!p) {
1802                 ret = 0;
1803                 goto out_free;
1804         }
1805
1806         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1807         if (ret)
1808                 goto out_free;
1809
1810         opts->auxtrace_mmap_pages = mmap_pages;
1811
1812 out_free:
1813         free(s);
1814         return ret;
1815 }
1816
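/*
 * Warn when the --switch-output size threshold is smaller than half of the
 * mmap wakeup size, since the generated perf.data files may then end up
 * bigger than the requested size.
 */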
1817 static void switch_output_size_warn(struct record *rec)
1818 {
1819         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1820         struct switch_output *s = &rec->switch_output;
1821
1822         wakeup_size /= 2;
1823
1824         if (s->size < wakeup_size) {
1825                 char buf[100];
1826
1827                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1828                 pr_warning("WARNING: switch-output data size is lower than the "
1829                            "wakeup kernel buffer size (%s), "
1830                            "expect bigger perf.data sizes\n", buf);
1831         }
1832 }
1833
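/*
 * Interpret the --switch-output argument: "signal" (switch on SIGUSR2), a
 * size such as "100M", or a time such as "30s".  Enabling it also enables
 * timestamped output filenames, e.g.:
 *
 *   perf record --switch-output=1G -a
 */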
1834 static int switch_output_setup(struct record *rec)
1835 {
1836         struct switch_output *s = &rec->switch_output;
1837         static struct parse_tag tags_size[] = {
1838                 { .tag  = 'B', .mult = 1       },
1839                 { .tag  = 'K', .mult = 1 << 10 },
1840                 { .tag  = 'M', .mult = 1 << 20 },
1841                 { .tag  = 'G', .mult = 1 << 30 },
1842                 { .tag  = 0 },
1843         };
1844         static struct parse_tag tags_time[] = {
1845                 { .tag  = 's', .mult = 1        },
1846                 { .tag  = 'm', .mult = 60       },
1847                 { .tag  = 'h', .mult = 60*60    },
1848                 { .tag  = 'd', .mult = 60*60*24 },
1849                 { .tag  = 0 },
1850         };
1851         unsigned long val;
1852
1853         if (!s->set)
1854                 return 0;
1855
1856         if (!strcmp(s->str, "signal")) {
1857                 s->signal = true;
1858                 pr_debug("switch-output with SIGUSR2 signal\n");
1859                 goto enabled;
1860         }
1861
1862         val = parse_tag_value(s->str, tags_size);
1863         if (val != (unsigned long) -1) {
1864                 s->size = val;
1865                 pr_debug("switch-output with %s size threshold\n", s->str);
1866                 goto enabled;
1867         }
1868
1869         val = parse_tag_value(s->str, tags_time);
1870         if (val != (unsigned long) -1) {
1871                 s->time = val;
1872                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1873                          s->str, s->time);
1874                 goto enabled;
1875         }
1876
1877         return -1;
1878
1879 enabled:
1880         rec->timestamp_filename = true;
1881         s->enabled              = true;
1882
1883         if (s->size && !rec->opts.no_buffering)
1884                 switch_output_size_warn(rec);
1885
1886         return 0;
1887 }
1888
1889 static const char * const __record_usage[] = {
1890         "perf record [<options>] [<command>]",
1891         "perf record [<options>] -- <command> [<options>]",
1892         NULL
1893 };
1894 const char * const *record_usage = __record_usage;
1895
1896 /*
1897  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
1898  * because we need access to it in record__exit(), which is called
1899  * after cmd_record() exits, but since record_options needs to be accessible to
1900  * builtin-script, leave it here.
1901  *
1902  * At least we don't touch it directly in all the other functions here.
1903  *
1904  * Just say no to tons of global variables, sigh.
1905  */
1906 static struct record record = {
1907         .opts = {
1908                 .sample_time         = true,
1909                 .mmap_pages          = UINT_MAX,
1910                 .user_freq           = UINT_MAX,
1911                 .user_interval       = ULLONG_MAX,
1912                 .freq                = 4000,
1913                 .target              = {
1914                         .uses_mmap   = true,
1915                         .default_per_cpu = true,
1916                 },
1917                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
1918         },
1919         .tool = {
1920                 .sample         = process_sample_event,
1921                 .fork           = perf_event__process_fork,
1922                 .exit           = perf_event__process_exit,
1923                 .comm           = perf_event__process_comm,
1924                 .namespaces     = perf_event__process_namespaces,
1925                 .mmap           = perf_event__process_mmap,
1926                 .mmap2          = perf_event__process_mmap2,
1927                 .ordered_events = true,
1928         },
1929 };
1930
1931 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1932         "\n\t\t\t\tDefault: fp";
1933
1934 static bool dry_run;
1935
1936 /*
1937  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1938  * with it and switch to using the library functions in perf_evlist that came
1939  * from builtin-record.c, i.e. use record_opts,
1940  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
1941  * using pipes, etc.
1942  */
1943 static struct option __record_options[] = {
1944         OPT_CALLBACK('e', "event", &record.evlist, "event",
1945                      "event selector. use 'perf list' to list available events",
1946                      parse_events_option),
1947         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1948                      "event filter", parse_filter),
1949         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1950                            NULL, "don't record events from perf itself",
1951                            exclude_perf),
1952         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1953                     "record events on existing process id"),
1954         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1955                     "record events on existing thread id"),
1956         OPT_INTEGER('r', "realtime", &record.realtime_prio,
1957                     "collect data with this RT SCHED_FIFO priority"),
1958         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1959                     "collect data without buffering"),
1960         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1961                     "collect raw sample records from all opened counters"),
1962         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1963                             "system-wide collection from all CPUs"),
1964         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1965                     "list of cpus to monitor"),
1966         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1967         OPT_STRING('o', "output", &record.data.path, "file",
1968                     "output file name"),
1969         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1970                         &record.opts.no_inherit_set,
1971                         "child tasks do not inherit counters"),
1972         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1973                     "synthesize non-sample events at the end of output"),
1974         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1975         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
1976         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
1977                     "Fail if the specified frequency can't be used"),
1978         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
1979                      "profile at this frequency",
1980                       record__parse_freq),
1981         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1982                      "number of mmap data pages and AUX area tracing mmap pages",
1983                      record__parse_mmap_pages),
1984         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
1985                      "Minimum number of bytes that are extracted from mmap data pages (default: 1)",
1986                      record__mmap_flush_parse),
1987         OPT_BOOLEAN(0, "group", &record.opts.group,
1988                     "put the counters into a counter group"),
1989         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1990                            NULL, "enables call-graph recording" ,
1991                            &record_callchain_opt),
1992         OPT_CALLBACK(0, "call-graph", &record.opts,
1993                      "record_mode[,record_size]", record_callchain_help,
1994                      &record_parse_callchain_opt),
1995         OPT_INCR('v', "verbose", &verbose,
1996                     "be more verbose (show counter open errors, etc)"),
1997         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1998         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1999                     "per thread counts"),
2000         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2001         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2002                     "Record the sample physical addresses"),
2003         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2004         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2005                         &record.opts.sample_time_set,
2006                         "Record the sample timestamps"),
2007         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2008                         "Record the sample period"),
2009         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2010                     "don't sample"),
2011         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2012                         &record.no_buildid_cache_set,
2013                         "do not update the buildid cache"),
2014         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2015                         &record.no_buildid_set,
2016                         "do not collect buildids in perf.data"),
2017         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2018                      "monitor event in cgroup name only",
2019                      parse_cgroups),
2020         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2021                   "ms to wait before starting measurement after program start"),
2022         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2023                    "user to profile"),
2024
2025         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2026                      "branch any", "sample any taken branches",
2027                      parse_branch_stack),
2028
2029         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2030                      "branch filter mask", "branch stack filter modes",
2031                      parse_branch_stack),
2032         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2033                     "sample by weight (on special events only)"),
2034         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2035                     "sample transaction flags (special events only)"),
2036         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2037                     "use per-thread mmaps"),
2038         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2039                     "sample selected machine registers on interrupt,"
2040                     " use '-I?' to list register names", parse_regs),
2041         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2042                     "sample selected machine registers in user space,"
2043                     " use '--user-regs=?' to list register names", parse_regs),
2044         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2045                     "Record running/enabled time of read (:S) events"),
2046         OPT_CALLBACK('k', "clockid", &record.opts,
2047         "clockid", "clockid to use for events, see clock_gettime()",
2048         parse_clockid),
2049         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2050                           "opts", "AUX area tracing Snapshot Mode", ""),
2051         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2052                         "per thread proc mmap processing timeout in ms"),
2053         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2054                     "Record namespaces events"),
2055         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2056                     "Record context switch events"),
2057         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2058                          "Configure all used events to run in kernel space.",
2059                          PARSE_OPT_EXCLUSIVE),
2060         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2061                          "Configure all used events to run in user space.",
2062                          PARSE_OPT_EXCLUSIVE),
2063         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2064                    "clang binary to use for compiling BPF scriptlets"),
2065         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2066                    "options passed to clang when compiling BPF scriptlets"),
2067         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2068                    "file", "vmlinux pathname"),
2069         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2070                     "Record build-id of all DSOs regardless of hits"),
2071         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2072                     "append timestamp to output filename"),
2073         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2074                     "Record timestamp boundary (time of first/last samples)"),
2075         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2076                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2077                           "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2078                           "signal"),
2079         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2080                    "Limit number of switch output generated files"),
2081         OPT_BOOLEAN(0, "dry-run", &dry_run,
2082                     "Parse options then exit"),
2083 #ifdef HAVE_AIO_SUPPORT
2084         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2085                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2086                      record__aio_parse),
2087 #endif
2088         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2089                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2090                      record__parse_affinity),
2091         OPT_END()
2092 };
2093
2094 struct option *record_options = __record_options;
2095
2096 int cmd_record(int argc, const char **argv)
2097 {
2098         int err;
2099         struct record *rec = &record;
2100         char errbuf[BUFSIZ];
2101
2102         setlocale(LC_ALL, "");
2103
2104 #ifndef HAVE_LIBBPF_SUPPORT
2105 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2106         set_nobuild('\0', "clang-path", true);
2107         set_nobuild('\0', "clang-opt", true);
2108 # undef set_nobuild
2109 #endif
2110
2111 #ifndef HAVE_BPF_PROLOGUE
2112 # if !defined (HAVE_DWARF_SUPPORT)
2113 #  define REASON  "NO_DWARF=1"
2114 # elif !defined (HAVE_LIBBPF_SUPPORT)
2115 #  define REASON  "NO_LIBBPF=1"
2116 # else
2117 #  define REASON  "this architecture doesn't support BPF prologue"
2118 # endif
2119 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2120         set_nobuild('\0', "vmlinux", true);
2121 # undef set_nobuild
2122 # undef REASON
2123 #endif
2124
2125         CPU_ZERO(&rec->affinity_mask);
2126         rec->opts.affinity = PERF_AFFINITY_SYS;
2127
2128         rec->evlist = perf_evlist__new();
2129         if (rec->evlist == NULL)
2130                 return -ENOMEM;
2131
2132         err = perf_config(perf_record_config, rec);
2133         if (err)
2134                 return err;
2135
2136         argc = parse_options(argc, argv, record_options, record_usage,
2137                             PARSE_OPT_STOP_AT_NON_OPTION);
2138         if (quiet)
2139                 perf_quiet_option();
2140
2141         /* Make system wide (-a) the default target. */
2142         if (!argc && target__none(&rec->opts.target))
2143                 rec->opts.target.system_wide = true;
2144
2145         if (nr_cgroups && !rec->opts.target.system_wide) {
2146                 usage_with_options_msg(record_usage, record_options,
2147                         "cgroup monitoring only available in system-wide mode");
2148
2149         }
2150         if (rec->opts.record_switch_events &&
2151             !perf_can_record_switch_events()) {
2152                 ui__error("kernel does not support recording context switch events\n");
2153                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2154                 return -EINVAL;
2155         }
2156
2157         if (switch_output_setup(rec)) {
2158                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2159                 return -EINVAL;
2160         }
2161
2162         if (rec->switch_output.time) {
2163                 signal(SIGALRM, alarm_sig_handler);
2164                 alarm(rec->switch_output.time);
2165         }
2166
2167         if (rec->switch_output.num_files) {
2168                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2169                                                       sizeof(char *));
2170                 if (!rec->switch_output.filenames)
2171                         return -EINVAL;
2172         }
2173
2174         /*
2175          * Allow aliases to facilitate the lookup of symbols for address
2176          * filters. Refer to auxtrace_parse_filters().
2177          */
2178         symbol_conf.allow_aliases = true;
2179
2180         symbol__init(NULL);
2181
2182         err = record__auxtrace_init(rec);
2183         if (err)
2184                 goto out;
2185
2186         if (dry_run)
2187                 goto out;
2188
2189         err = bpf__setup_stdout(rec->evlist);
2190         if (err) {
2191                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2192                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2193                          errbuf);
2194                 goto out;
2195         }
2196
2197         err = -ENOMEM;
2198
2199         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2200                 pr_warning(
2201 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2202 "check /proc/sys/kernel/kptr_restrict.\n\n"
2203 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2204 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2205 "Samples in kernel modules won't be resolved at all.\n\n"
2206 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2207 "even with a suitable vmlinux or kallsyms file.\n\n");
2208
2209         if (rec->no_buildid_cache || rec->no_buildid) {
2210                 disable_buildid_cache();
2211         } else if (rec->switch_output.enabled) {
2212                 /*
2213                  * In 'perf record --switch-output', disable buildid
2214                  * generation by default to reduce data file switching
2215                  * overhead. Still generate buildids if they are explicitly
2216                  * required using
2217                  *
2218                  *  perf record --switch-output --no-no-buildid \
2219                  *              --no-no-buildid-cache
2220                  *
2221                  * The following code is equivalent to:
2222                  *
2223                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2224                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2225                  *         disable_buildid_cache();
2226                  */
2227                 bool disable = true;
2228
2229                 if (rec->no_buildid_set && !rec->no_buildid)
2230                         disable = false;
2231                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2232                         disable = false;
2233                 if (disable) {
2234                         rec->no_buildid = true;
2235                         rec->no_buildid_cache = true;
2236                         disable_buildid_cache();
2237                 }
2238         }
2239
2240         if (record.opts.overwrite)
2241                 record.opts.tail_synthesize = true;
2242
2243         if (rec->evlist->nr_entries == 0 &&
2244             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2245                 pr_err("Not enough memory for event selector list\n");
2246                 goto out;
2247         }
2248
2249         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2250                 rec->opts.no_inherit = true;
2251
2252         err = target__validate(&rec->opts.target);
2253         if (err) {
2254                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2255                 ui__warning("%s\n", errbuf);
2256         }
2257
2258         err = target__parse_uid(&rec->opts.target);
2259         if (err) {
2260                 int saved_errno = errno;
2261
2262                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2263                 ui__error("%s", errbuf);
2264
2265                 err = -saved_errno;
2266                 goto out;
2267         }
2268
2269         /* Enable ignoring missing threads when -u/-p option is defined. */
2270         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2271
2272         err = -ENOMEM;
2273         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2274                 usage_with_options(record_usage, record_options);
2275
2276         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2277         if (err)
2278                 goto out;
2279
2280         /*
2281          * We take all buildids when the file contains
2282          * AUX area tracing data, because we do not decode the
2283          * trace: decoding it would take too long.
2284          */
2285         if (rec->opts.full_auxtrace)
2286                 rec->buildid_all = true;
2287
2288         if (record_opts__config(&rec->opts)) {
2289                 err = -EINVAL;
2290                 goto out;
2291         }
2292
2293         if (rec->opts.nr_cblocks > nr_cblocks_max)
2294                 rec->opts.nr_cblocks = nr_cblocks_max;
2295         if (verbose > 0)
2296                 pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2297
2298         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2299         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2300
2301         err = __cmd_record(&record, argc, argv);
2302 out:
2303         perf_evlist__delete(rec->evlist);
2304         symbol__exit();
2305         auxtrace_record__free(rec->itr);
2306         return err;
2307 }
2308
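/*
 * SIGUSR2 handler: kick off an AUX area tracing snapshot and/or an output
 * file switch, depending on which triggers are currently armed.
 */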
2309 static void snapshot_sig_handler(int sig __maybe_unused)
2310 {
2311         struct record *rec = &record;
2312
2313         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2314                 trigger_hit(&auxtrace_snapshot_trigger);
2315                 auxtrace_record__snapshot_started = 1;
2316                 if (auxtrace_record__snapshot_start(record.itr))
2317                         trigger_error(&auxtrace_snapshot_trigger);
2318         }
2319
2320         if (switch_output_signal(rec))
2321                 trigger_hit(&switch_output_trigger);
2322 }
2323
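/*
 * SIGALRM handler for the time based --switch-output mode; the alarm is
 * re-armed after each successful output switch.
 */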
2324 static void alarm_sig_handler(int sig __maybe_unused)
2325 {
2326         struct record *rec = &record;
2327
2328         if (switch_output_time(rec))
2329                 trigger_hit(&switch_output_trigger);
2330 }