perf tools: Rename perf_evlist__mmap() to evlist__mmap()
tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/synthetic-events.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
45 #include "asm/bug.h"
46 #include "perf.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <locale.h>
51 #include <poll.h>
52 #include <unistd.h>
53 #include <sched.h>
54 #include <signal.h>
55 #include <sys/mman.h>
56 #include <sys/wait.h>
57 #include <linux/err.h>
58 #include <linux/string.h>
59 #include <linux/time64.h>
60 #include <linux/zalloc.h>
61
62 struct switch_output {
63         bool             enabled;
64         bool             signal;
65         unsigned long    size;
66         unsigned long    time;
67         const char      *str;
68         bool             set;
69         char             **filenames;
70         int              num_files;
71         int              cur_file;
72 };
73
74 struct record {
75         struct perf_tool        tool;
76         struct record_opts      opts;
77         u64                     bytes_written;
78         struct perf_data        data;
79         struct auxtrace_record  *itr;
80         struct evlist   *evlist;
81         struct perf_session     *session;
82         int                     realtime_prio;
83         bool                    no_buildid;
84         bool                    no_buildid_set;
85         bool                    no_buildid_cache;
86         bool                    no_buildid_cache_set;
87         bool                    buildid_all;
88         bool                    timestamp_filename;
89         bool                    timestamp_boundary;
90         struct switch_output    switch_output;
91         unsigned long long      samples;
92         cpu_set_t               affinity_mask;
93 };
94
95 static volatile int auxtrace_record__snapshot_started;
96 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
97 static DEFINE_TRIGGER(switch_output_trigger);
98
99 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
100         "SYS", "NODE", "CPU"
101 };
102
103 static bool switch_output_signal(struct record *rec)
104 {
105         return rec->switch_output.signal &&
106                trigger_is_ready(&switch_output_trigger);
107 }
108
109 static bool switch_output_size(struct record *rec)
110 {
111         return rec->switch_output.size &&
112                trigger_is_ready(&switch_output_trigger) &&
113                (rec->bytes_written >= rec->switch_output.size);
114 }
115
116 static bool switch_output_time(struct record *rec)
117 {
118         return rec->switch_output.time &&
119                trigger_is_ready(&switch_output_trigger);
120 }
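
/*
 * Illustrative sketch, not part of builtin-record.c: how the three
 * --switch-output modes plausibly map onto struct switch_output, matching
 * the predicates above ('signal' toggles a flag, a size threshold is kept
 * in bytes, a time period in seconds).  The helper and the numeric values
 * are made up for illustration.
 */
#if 0
static void example_switch_output_modes(struct switch_output *so)
{
        /* perf record --switch-output        (rotate on SIGUSR2) */
        so->signal = true;
        /* perf record --switch-output=100M   (rotate by size)    */
        so->size = 100 * 1024 * 1024;
        /* perf record --switch-output=30s    (rotate by time)    */
        so->time = 30;
}
#endif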
121
122 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
123                          void *bf, size_t size)
124 {
125         struct perf_data_file *file = &rec->session->data->file;
126
127         if (perf_data_file__write(file, bf, size) < 0) {
128                 pr_err("failed to write perf data, error: %m\n");
129                 return -1;
130         }
131
132         rec->bytes_written += size;
133
134         if (switch_output_size(rec))
135                 trigger_hit(&switch_output_trigger);
136
137         return 0;
138 }
139
140 static int record__aio_enabled(struct record *rec);
141 static int record__comp_enabled(struct record *rec);
142 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
143                             void *src, size_t src_size);
144
145 #ifdef HAVE_AIO_SUPPORT
146 static int record__aio_write(struct aiocb *cblock, int trace_fd,
147                 void *buf, size_t size, off_t off)
148 {
149         int rc;
150
151         cblock->aio_fildes = trace_fd;
152         cblock->aio_buf    = buf;
153         cblock->aio_nbytes = size;
154         cblock->aio_offset = off;
155         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
156
157         do {
158                 rc = aio_write(cblock);
159                 if (rc == 0) {
160                         break;
161                 } else if (errno != EAGAIN) {
162                         cblock->aio_fildes = -1;
163                         pr_err("failed to queue perf data, error: %m\n");
164                         break;
165                 }
166         } while (1);
167
168         return rc;
169 }
170
171 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
172 {
173         void *rem_buf;
174         off_t rem_off;
175         size_t rem_size;
176         int rc, aio_errno;
177         ssize_t aio_ret, written;
178
179         aio_errno = aio_error(cblock);
180         if (aio_errno == EINPROGRESS)
181                 return 0;
182
183         written = aio_ret = aio_return(cblock);
184         if (aio_ret < 0) {
185                 if (aio_errno != EINTR)
186                         pr_err("failed to write perf data, error: %m\n");
187                 written = 0;
188         }
189
190         rem_size = cblock->aio_nbytes - written;
191
192         if (rem_size == 0) {
193                 cblock->aio_fildes = -1;
194                 /*
195                  * md->refcount is incremented in record__aio_pushfn() for
196                  * every aio write request started in record__aio_push(), so
197                  * decrement it because the request is now complete.
198                  */
199                 perf_mmap__put(md);
200                 rc = 1;
201         } else {
202                 /*
203                  * The aio write request may need to be restarted with the
204                  * remainder if the kernel didn't write the whole
205                  * chunk at once.
206                  */
207                 rem_off = cblock->aio_offset + written;
208                 rem_buf = (void *)(cblock->aio_buf + written);
209                 record__aio_write(cblock, cblock->aio_fildes,
210                                 rem_buf, rem_size, rem_off);
211                 rc = 0;
212         }
213
214         return rc;
215 }
216
217 static int record__aio_sync(struct mmap *md, bool sync_all)
218 {
219         struct aiocb **aiocb = md->aio.aiocb;
220         struct aiocb *cblocks = md->aio.cblocks;
221         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
222         int i, do_suspend;
223
224         do {
225                 do_suspend = 0;
226                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
227                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
228                                 if (sync_all)
229                                         aiocb[i] = NULL;
230                                 else
231                                         return i;
232                         } else {
233                                 /*
234                  * The started aio write is not complete yet,
235                  * so it has to be waited on before the
236                  * next allocation.
237                                  */
238                                 aiocb[i] = &cblocks[i];
239                                 do_suspend = 1;
240                         }
241                 }
242                 if (!do_suspend)
243                         return -1;
244
245                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
246                         if (!(errno == EAGAIN || errno == EINTR))
247                                 pr_err("failed to sync perf data, error: %m\n");
248                 }
249         } while (1);
250 }
251
252 struct record_aio {
253         struct record   *rec;
254         void            *data;
255         size_t          size;
256 };
257
258 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
259 {
260         struct record_aio *aio = to;
261
262         /*
263          * The map->base data pointed to by buf is copied into a free map->aio.data[]
264          * buffer to release space in the kernel buffer as fast as possible, by calling
265          * perf_mmap__consume() from the perf_mmap__push() function.
266          *
267          * That lets the kernel proceed with storing more profiling data into
268          * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
269          *
270          * Copying can be done in two steps in case the chunk of profiling data
271          * crosses the upper bound of the kernel buffer. In that case we first move
272          * the part of the data from map->start up to the upper bound, and then the
273          * remainder from the beginning of the kernel buffer up to the end of the data chunk.
274          */
275
276         if (record__comp_enabled(aio->rec)) {
277                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
278                                      perf_mmap__mmap_len(map) - aio->size,
279                                      buf, size);
280         } else {
281                 memcpy(aio->data + aio->size, buf, size);
282         }
283
284         if (!aio->size) {
285                 /*
286                  * Increment map->refcount to guard the map->aio.data[] buffer
287                  * from premature deallocation, because the map object can be
288                  * released earlier than the aio write request started on the
289                  * map->aio.data[] buffer completes.
290                  *
291                  * perf_mmap__put() is done in record__aio_complete() after
292                  * the started aio request completes, or in record__aio_push()
293                  * if the request failed to start.
294                  */
295                 perf_mmap__get(map);
296         }
297
298         aio->size += size;
299
300         return size;
301 }
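
/*
 * Illustrative sketch, not part of builtin-record.c: the two-step copy
 * described in the comment above, for a data chunk that wraps around the
 * end of a ring buffer of mmap_len bytes starting at 'base'.  The name and
 * signature are made up for illustration.
 */
#if 0
static void example_wraparound_copy(void *dst, const void *base,
                                    size_t mmap_len, size_t start, size_t size)
{
        if (start + size > mmap_len) {
                size_t first = mmap_len - start;

                memcpy(dst, (const char *)base + start, first);
                memcpy((char *)dst + first, base, size - first);
        } else {
                memcpy(dst, (const char *)base + start, size);
        }
}
#endif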
302
303 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
304 {
305         int ret, idx;
306         int trace_fd = rec->session->data->file.fd;
307         struct record_aio aio = { .rec = rec, .size = 0 };
308
309         /*
310          * Call record__aio_sync() to wait until a map->aio.data[] buffer
311          * becomes available after the previous aio write operation.
312          */
313
314         idx = record__aio_sync(map, false);
315         aio.data = map->aio.data[idx];
316         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
317         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
318                 return ret;
319
320         rec->samples++;
321         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
322         if (!ret) {
323                 *off += aio.size;
324                 rec->bytes_written += aio.size;
325                 if (switch_output_size(rec))
326                         trigger_hit(&switch_output_trigger);
327         } else {
328                 /*
329                  * Decrement the map->refcount incremented in record__aio_pushfn()
330                  * if the record__aio_write() operation failed to start; otherwise
331                  * map->refcount is decremented in record__aio_complete() after the
332                  * aio write operation finishes successfully.
333                  */
334                 perf_mmap__put(map);
335         }
336
337         return ret;
338 }
339
340 static off_t record__aio_get_pos(int trace_fd)
341 {
342         return lseek(trace_fd, 0, SEEK_CUR);
343 }
344
345 static void record__aio_set_pos(int trace_fd, off_t pos)
346 {
347         lseek(trace_fd, pos, SEEK_SET);
348 }
349
350 static void record__aio_mmap_read_sync(struct record *rec)
351 {
352         int i;
353         struct evlist *evlist = rec->evlist;
354         struct mmap *maps = evlist->mmap;
355
356         if (!record__aio_enabled(rec))
357                 return;
358
359         for (i = 0; i < evlist->nr_mmaps; i++) {
360                 struct mmap *map = &maps[i];
361
362                 if (map->base)
363                         record__aio_sync(map, true);
364         }
365 }
366
367 static int nr_cblocks_default = 1;
368 static int nr_cblocks_max = 4;
369
370 static int record__aio_parse(const struct option *opt,
371                              const char *str,
372                              int unset)
373 {
374         struct record_opts *opts = (struct record_opts *)opt->value;
375
376         if (unset) {
377                 opts->nr_cblocks = 0;
378         } else {
379                 if (str)
380                         opts->nr_cblocks = strtol(str, NULL, 0);
381                 if (!opts->nr_cblocks)
382                         opts->nr_cblocks = nr_cblocks_default;
383         }
384
385         return 0;
386 }
387 #else /* HAVE_AIO_SUPPORT */
388 static int nr_cblocks_max = 0;
389
390 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
391                             off_t *off __maybe_unused)
392 {
393         return -1;
394 }
395
396 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
397 {
398         return -1;
399 }
400
401 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
402 {
403 }
404
405 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
406 {
407 }
408 #endif
409
410 static int record__aio_enabled(struct record *rec)
411 {
412         return rec->opts.nr_cblocks > 0;
413 }
414
415 #define MMAP_FLUSH_DEFAULT 1
416 static int record__mmap_flush_parse(const struct option *opt,
417                                     const char *str,
418                                     int unset)
419 {
420         int flush_max;
421         struct record_opts *opts = (struct record_opts *)opt->value;
422         static struct parse_tag tags[] = {
423                         { .tag  = 'B', .mult = 1       },
424                         { .tag  = 'K', .mult = 1 << 10 },
425                         { .tag  = 'M', .mult = 1 << 20 },
426                         { .tag  = 'G', .mult = 1 << 30 },
427                         { .tag  = 0 },
428         };
429
430         if (unset)
431                 return 0;
432
433         if (str) {
434                 opts->mmap_flush = parse_tag_value(str, tags);
435                 if (opts->mmap_flush == (int)-1)
436                         opts->mmap_flush = strtol(str, NULL, 0);
437         }
438
439         if (!opts->mmap_flush)
440                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
441
442         flush_max = evlist__mmap_size(opts->mmap_pages);
443         flush_max /= 4;
444         if (opts->mmap_flush > flush_max)
445                 opts->mmap_flush = flush_max;
446
447         return 0;
448 }
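
/*
 * Illustrative sketch, not part of builtin-record.c: the clamping rule applied
 * above.  A requested flush threshold, e.g. "16M" parsed via the B/K/M/G tag
 * table into 16 << 20 bytes, is capped at a quarter of the mmap buffer size.
 * The helper name and signature are made up for illustration.
 */
#if 0
static int example_clamp_mmap_flush(int requested_bytes, size_t mmap_bytes)
{
        int flush_max = (int)(mmap_bytes / 4);

        if (!requested_bytes)
                requested_bytes = MMAP_FLUSH_DEFAULT;
        if (requested_bytes > flush_max)
                requested_bytes = flush_max;
        return requested_bytes;
}
#endif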
449
450 #ifdef HAVE_ZSTD_SUPPORT
451 static unsigned int comp_level_default = 1;
452
453 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
454 {
455         struct record_opts *opts = opt->value;
456
457         if (unset) {
458                 opts->comp_level = 0;
459         } else {
460                 if (str)
461                         opts->comp_level = strtol(str, NULL, 0);
462                 if (!opts->comp_level)
463                         opts->comp_level = comp_level_default;
464         }
465
466         return 0;
467 }
468 #endif
469 static unsigned int comp_level_max = 22;
470
471 static int record__comp_enabled(struct record *rec)
472 {
473         return rec->opts.comp_level > 0;
474 }
475
476 static int process_synthesized_event(struct perf_tool *tool,
477                                      union perf_event *event,
478                                      struct perf_sample *sample __maybe_unused,
479                                      struct machine *machine __maybe_unused)
480 {
481         struct record *rec = container_of(tool, struct record, tool);
482         return record__write(rec, NULL, event, event->header.size);
483 }
484
485 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
486 {
487         struct record *rec = to;
488
489         if (record__comp_enabled(rec)) {
490                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
491                 bf   = map->data;
492         }
493
494         rec->samples++;
495         return record__write(rec, map, bf, size);
496 }
497
498 static volatile int done;
499 static volatile int signr = -1;
500 static volatile int child_finished;
501
502 static void sig_handler(int sig)
503 {
504         if (sig == SIGCHLD)
505                 child_finished = 1;
506         else
507                 signr = sig;
508
509         done = 1;
510 }
511
512 static void sigsegv_handler(int sig)
513 {
514         perf_hooks__recover();
515         sighandler_dump_stack(sig);
516 }
517
518 static void record__sig_exit(void)
519 {
520         if (signr == -1)
521                 return;
522
523         signal(signr, SIG_DFL);
524         raise(signr);
525 }
526
527 #ifdef HAVE_AUXTRACE_SUPPORT
528
529 static int record__process_auxtrace(struct perf_tool *tool,
530                                     struct mmap *map,
531                                     union perf_event *event, void *data1,
532                                     size_t len1, void *data2, size_t len2)
533 {
534         struct record *rec = container_of(tool, struct record, tool);
535         struct perf_data *data = &rec->data;
536         size_t padding;
537         u8 pad[8] = {0};
538
539         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
540                 off_t file_offset;
541                 int fd = perf_data__fd(data);
542                 int err;
543
544                 file_offset = lseek(fd, 0, SEEK_CUR);
545                 if (file_offset == -1)
546                         return -1;
547                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
548                                                      event, file_offset);
549                 if (err)
550                         return err;
551         }
552
553         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
554         padding = (len1 + len2) & 7;
555         if (padding)
556                 padding = 8 - padding;
557
558         record__write(rec, map, event, event->header.size);
559         record__write(rec, map, data1, len1);
560         if (len2)
561                 record__write(rec, map, data2, len2);
562         record__write(rec, map, &pad, padding);
563
564         return 0;
565 }
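
/*
 * Illustrative sketch, not part of builtin-record.c: the padding rule used
 * above.  AUX data of len1 + len2 bytes is padded with zero bytes up to the
 * next multiple of 8; nothing is appended when it is already aligned.
 */
#if 0
static size_t example_aux_padding(size_t len1, size_t len2)
{
        size_t padding = (len1 + len2) & 7;

        return padding ? 8 - padding : 0;
}
#endif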
566
567 static int record__auxtrace_mmap_read(struct record *rec,
568                                       struct mmap *map)
569 {
570         int ret;
571
572         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
573                                   record__process_auxtrace);
574         if (ret < 0)
575                 return ret;
576
577         if (ret)
578                 rec->samples++;
579
580         return 0;
581 }
582
583 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
584                                                struct mmap *map)
585 {
586         int ret;
587
588         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
589                                            record__process_auxtrace,
590                                            rec->opts.auxtrace_snapshot_size);
591         if (ret < 0)
592                 return ret;
593
594         if (ret)
595                 rec->samples++;
596
597         return 0;
598 }
599
600 static int record__auxtrace_read_snapshot_all(struct record *rec)
601 {
602         int i;
603         int rc = 0;
604
605         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
606                 struct mmap *map = &rec->evlist->mmap[i];
607
608                 if (!map->auxtrace_mmap.base)
609                         continue;
610
611                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
612                         rc = -1;
613                         goto out;
614                 }
615         }
616 out:
617         return rc;
618 }
619
620 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
621 {
622         pr_debug("Recording AUX area tracing snapshot\n");
623         if (record__auxtrace_read_snapshot_all(rec) < 0) {
624                 trigger_error(&auxtrace_snapshot_trigger);
625         } else {
626                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
627                         trigger_error(&auxtrace_snapshot_trigger);
628                 else
629                         trigger_ready(&auxtrace_snapshot_trigger);
630         }
631 }
632
633 static int record__auxtrace_snapshot_exit(struct record *rec)
634 {
635         if (trigger_is_error(&auxtrace_snapshot_trigger))
636                 return 0;
637
638         if (!auxtrace_record__snapshot_started &&
639             auxtrace_record__snapshot_start(rec->itr))
640                 return -1;
641
642         record__read_auxtrace_snapshot(rec, true);
643         if (trigger_is_error(&auxtrace_snapshot_trigger))
644                 return -1;
645
646         return 0;
647 }
648
649 static int record__auxtrace_init(struct record *rec)
650 {
651         int err;
652
653         if (!rec->itr) {
654                 rec->itr = auxtrace_record__init(rec->evlist, &err);
655                 if (err)
656                         return err;
657         }
658
659         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
660                                               rec->opts.auxtrace_snapshot_opts);
661         if (err)
662                 return err;
663
664         return auxtrace_parse_filters(rec->evlist);
665 }
666
667 #else
668
669 static inline
670 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
671                                struct mmap *map __maybe_unused)
672 {
673         return 0;
674 }
675
676 static inline
677 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
678                                     bool on_exit __maybe_unused)
679 {
680 }
681
682 static inline
683 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
684 {
685         return 0;
686 }
687
688 static inline
689 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
690 {
691         return 0;
692 }
693
694 static int record__auxtrace_init(struct record *rec __maybe_unused)
695 {
696         return 0;
697 }
698
699 #endif
700
701 static int record__mmap_evlist(struct record *rec,
702                                struct evlist *evlist)
703 {
704         struct record_opts *opts = &rec->opts;
705         char msg[512];
706
707         if (opts->affinity != PERF_AFFINITY_SYS)
708                 cpu__setup_cpunode_map();
709
710         if (evlist__mmap_ex(evlist, opts->mmap_pages,
711                                  opts->auxtrace_mmap_pages,
712                                  opts->auxtrace_snapshot_mode,
713                                  opts->nr_cblocks, opts->affinity,
714                                  opts->mmap_flush, opts->comp_level) < 0) {
715                 if (errno == EPERM) {
716                         pr_err("Permission error mapping pages.\n"
717                                "Consider increasing "
718                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
719                                "or try again with a smaller value of -m/--mmap_pages.\n"
720                                "(current value: %u,%u)\n",
721                                opts->mmap_pages, opts->auxtrace_mmap_pages);
722                         return -errno;
723                 } else {
724                         pr_err("failed to mmap with %d (%s)\n", errno,
725                                 str_error_r(errno, msg, sizeof(msg)));
726                         if (errno)
727                                 return -errno;
728                         else
729                                 return -EINVAL;
730                 }
731         }
732         return 0;
733 }
734
735 static int record__mmap(struct record *rec)
736 {
737         return record__mmap_evlist(rec, rec->evlist);
738 }
739
740 static int record__open(struct record *rec)
741 {
742         char msg[BUFSIZ];
743         struct evsel *pos;
744         struct evlist *evlist = rec->evlist;
745         struct perf_session *session = rec->session;
746         struct record_opts *opts = &rec->opts;
747         int rc = 0;
748
749         /*
750          * For initial_delay we need to add a dummy event so that we can track
751          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
752          * real events, the ones asked for by the user.
753          */
754         if (opts->initial_delay) {
755                 if (perf_evlist__add_dummy(evlist))
756                         return -ENOMEM;
757
758                 pos = perf_evlist__first(evlist);
759                 pos->tracking = 0;
760                 pos = perf_evlist__last(evlist);
761                 pos->tracking = 1;
762                 pos->core.attr.enable_on_exec = 1;
763         }
764
765         perf_evlist__config(evlist, opts, &callchain_param);
766
767         evlist__for_each_entry(evlist, pos) {
768 try_again:
769                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
770                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
771                                 if (verbose > 0)
772                                         ui__warning("%s\n", msg);
773                                 goto try_again;
774                         }
775                         if ((errno == EINVAL || errno == EBADF) &&
776                             pos->leader != pos &&
777                             pos->weak_group) {
778                                 pos = perf_evlist__reset_weak_group(evlist, pos);
779                                 goto try_again;
780                         }
781                         rc = -errno;
782                         perf_evsel__open_strerror(pos, &opts->target,
783                                                   errno, msg, sizeof(msg));
784                         ui__error("%s\n", msg);
785                         goto out;
786                 }
787
788                 pos->supported = true;
789         }
790
791         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
792                 pr_warning(
793 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
794 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
795 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
796 "file is not found in the buildid cache or in the vmlinux path.\n\n"
797 "Samples in kernel modules won't be resolved at all.\n\n"
798 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
799 "even with a suitable vmlinux or kallsyms file.\n\n");
800         }
801
802         if (perf_evlist__apply_filters(evlist, &pos)) {
803                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
804                         pos->filter, perf_evsel__name(pos), errno,
805                         str_error_r(errno, msg, sizeof(msg)));
806                 rc = -1;
807                 goto out;
808         }
809
810         rc = record__mmap(rec);
811         if (rc)
812                 goto out;
813
814         session->evlist = evlist;
815         perf_session__set_id_hdr_size(session);
816 out:
817         return rc;
818 }
819
820 static int process_sample_event(struct perf_tool *tool,
821                                 union perf_event *event,
822                                 struct perf_sample *sample,
823                                 struct evsel *evsel,
824                                 struct machine *machine)
825 {
826         struct record *rec = container_of(tool, struct record, tool);
827
828         if (rec->evlist->first_sample_time == 0)
829                 rec->evlist->first_sample_time = sample->time;
830
831         rec->evlist->last_sample_time = sample->time;
832
833         if (rec->buildid_all)
834                 return 0;
835
836         rec->samples++;
837         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
838 }
839
840 static int process_buildids(struct record *rec)
841 {
842         struct perf_session *session = rec->session;
843
844         if (perf_data__size(&rec->data) == 0)
845                 return 0;
846
847         /*
848          * During this process, it'll load the kernel map and replace the
849          * dso->long_name with the real pathname it found.  In this case
850          * we prefer the vmlinux path like
851          *   /lib/modules/3.16.4/build/vmlinux
852          *
853          * rather than the build-id path (in the debug directory).
854          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
855          */
856         symbol_conf.ignore_vmlinux_buildid = true;
857
858         /*
859          * If --buildid-all is given, it marks all DSOs regardless of hits,
860          * so there is no need to process samples. But if timestamp_boundary is
861          * enabled, it still needs to walk all samples to get the timestamps of
862          * the first/last samples.
863          */
864         if (rec->buildid_all && !rec->timestamp_boundary)
865                 rec->tool.sample = NULL;
866
867         return perf_session__process_events(session);
868 }
869
870 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
871 {
872         int err;
873         struct perf_tool *tool = data;
874         /*
875          * As for the guest kernel when processing the record & report subcommands,
876          * we arrange the module mmaps prior to the guest kernel mmap and trigger
877          * a dso preload, because by default guest module symbols are loaded
878          * from guest kallsyms instead of /lib/modules/XXX/XXX. This
879          * method is used to avoid missing symbols when the first address is
880          * in a module instead of in the guest kernel.
881          */
882         err = perf_event__synthesize_modules(tool, process_synthesized_event,
883                                              machine);
884         if (err < 0)
885                 pr_err("Couldn't record guest kernel [%d]'s reference"
886                        " relocation symbol.\n", machine->pid);
887
888         /*
889          * We use _stext for the guest kernel because the guest kernel's
890          * /proc/kallsyms sometimes has no _text.
891          */
892         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
893                                                  machine);
894         if (err < 0)
895                 pr_err("Couldn't record guest kernel [%d]'s reference"
896                        " relocation symbol.\n", machine->pid);
897 }
898
899 static struct perf_event_header finished_round_event = {
900         .size = sizeof(struct perf_event_header),
901         .type = PERF_RECORD_FINISHED_ROUND,
902 };
903
904 static void record__adjust_affinity(struct record *rec, struct mmap *map)
905 {
906         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
907             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
908                 CPU_ZERO(&rec->affinity_mask);
909                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
910                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
911         }
912 }
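
/*
 * Illustrative sketch, not part of builtin-record.c: the plain
 * CPU_SET()/sched_setaffinity() pattern that record__adjust_affinity() above
 * builds on, here pinning the calling thread to a single CPU.  The helper is
 * made up for illustration.
 */
#if 0
static int example_pin_to_cpu(int cpu)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(cpu, &mask);
        return sched_setaffinity(0, sizeof(mask), &mask); /* 0 == current thread */
}
#endif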
913
914 static size_t process_comp_header(void *record, size_t increment)
915 {
916         struct perf_record_compressed *event = record;
917         size_t size = sizeof(*event);
918
919         if (increment) {
920                 event->header.size += increment;
921                 return increment;
922         }
923
924         event->header.type = PERF_RECORD_COMPRESSED;
925         event->header.size = size;
926
927         return size;
928 }
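
/*
 * Illustrative sketch, not part of builtin-record.c, and an assumption about
 * how the callback above may be driven: a first call with increment == 0
 * reserves and initializes the PERF_RECORD_COMPRESSED header, and each later
 * call grows header.size by the number of compressed bytes just appended.
 * The driver function below is made up for illustration.
 */
#if 0
static size_t example_drive_comp_header(void *dst, const void *chunk, size_t chunk_len)
{
        size_t used = process_comp_header(dst, 0);       /* reserve the header */

        memcpy((char *)dst + used, chunk, chunk_len);     /* append payload     */
        used += process_comp_header(dst, chunk_len);      /* account for it     */

        return used; /* == sizeof(struct perf_record_compressed) + chunk_len */
}
#endif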
929
930 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
931                             void *src, size_t src_size)
932 {
933         size_t compressed;
934         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
935
936         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
937                                                      max_record_size, process_comp_header);
938
939         session->bytes_transferred += src_size;
940         session->bytes_compressed  += compressed;
941
942         return compressed;
943 }
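
/*
 * Illustrative sketch, not part of builtin-record.c: how the two counters
 * updated above can be turned into the compression ratio that perf record
 * prints at the end of a session.  The helper name is made up for
 * illustration.
 */
#if 0
static float example_comp_ratio(const struct perf_session *session)
{
        if (!session->bytes_compressed)
                return 0;

        return (float)session->bytes_transferred / (float)session->bytes_compressed;
}
#endif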
944
945 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
946                                     bool overwrite, bool synch)
947 {
948         u64 bytes_written = rec->bytes_written;
949         int i;
950         int rc = 0;
951         struct mmap *maps;
952         int trace_fd = rec->data.file.fd;
953         off_t off = 0;
954
955         if (!evlist)
956                 return 0;
957
958         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
959         if (!maps)
960                 return 0;
961
962         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
963                 return 0;
964
965         if (record__aio_enabled(rec))
966                 off = record__aio_get_pos(trace_fd);
967
968         for (i = 0; i < evlist->nr_mmaps; i++) {
969                 u64 flush = 0;
970                 struct mmap *map = &maps[i];
971
972                 if (map->base) {
973                         record__adjust_affinity(rec, map);
974                         if (synch) {
975                                 flush = map->flush;
976                                 map->flush = 1;
977                         }
978                         if (!record__aio_enabled(rec)) {
979                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
980                                         if (synch)
981                                                 map->flush = flush;
982                                         rc = -1;
983                                         goto out;
984                                 }
985                         } else {
986                                 if (record__aio_push(rec, map, &off) < 0) {
987                                         record__aio_set_pos(trace_fd, off);
988                                         if (synch)
989                                                 map->flush = flush;
990                                         rc = -1;
991                                         goto out;
992                                 }
993                         }
994                         if (synch)
995                                 map->flush = flush;
996                 }
997
998                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
999                     record__auxtrace_mmap_read(rec, map) != 0) {
1000                         rc = -1;
1001                         goto out;
1002                 }
1003         }
1004
1005         if (record__aio_enabled(rec))
1006                 record__aio_set_pos(trace_fd, off);
1007
1008         /*
1009          * Mark the round finished if we wrote
1010          * at least one event.
1011          */
1012         if (bytes_written != rec->bytes_written)
1013                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1014
1015         if (overwrite)
1016                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1017 out:
1018         return rc;
1019 }
1020
1021 static int record__mmap_read_all(struct record *rec, bool synch)
1022 {
1023         int err;
1024
1025         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1026         if (err)
1027                 return err;
1028
1029         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1030 }
1031
1032 static void record__init_features(struct record *rec)
1033 {
1034         struct perf_session *session = rec->session;
1035         int feat;
1036
1037         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1038                 perf_header__set_feat(&session->header, feat);
1039
1040         if (rec->no_buildid)
1041                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1042
1043         if (!have_tracepoints(&rec->evlist->core.entries))
1044                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1045
1046         if (!rec->opts.branch_stack)
1047                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1048
1049         if (!rec->opts.full_auxtrace)
1050                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1051
1052         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1053                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1054
1055         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1056         if (!record__comp_enabled(rec))
1057                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1058
1059         perf_header__clear_feat(&session->header, HEADER_STAT);
1060 }
1061
1062 static void
1063 record__finish_output(struct record *rec)
1064 {
1065         struct perf_data *data = &rec->data;
1066         int fd = perf_data__fd(data);
1067
1068         if (data->is_pipe)
1069                 return;
1070
1071         rec->session->header.data_size += rec->bytes_written;
1072         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1073
1074         if (!rec->no_buildid) {
1075                 process_buildids(rec);
1076
1077                 if (rec->buildid_all)
1078                         dsos__hit_all(rec->session);
1079         }
1080         perf_session__write_header(rec->session, rec->evlist, fd, true);
1081
1082         return;
1083 }
1084
1085 static int record__synthesize_workload(struct record *rec, bool tail)
1086 {
1087         int err;
1088         struct perf_thread_map *thread_map;
1089
1090         if (rec->opts.tail_synthesize != tail)
1091                 return 0;
1092
1093         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1094         if (thread_map == NULL)
1095                 return -1;
1096
1097         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1098                                                  process_synthesized_event,
1099                                                  &rec->session->machines.host,
1100                                                  rec->opts.sample_address);
1101         perf_thread_map__put(thread_map);
1102         return err;
1103 }
1104
1105 static int record__synthesize(struct record *rec, bool tail);
1106
1107 static int
1108 record__switch_output(struct record *rec, bool at_exit)
1109 {
1110         struct perf_data *data = &rec->data;
1111         int fd, err;
1112         char *new_filename;
1113
1114         /* Same Size:      "2015122520103046"*/
1115         char timestamp[] = "InvalidTimestamp";
1116
1117         record__aio_mmap_read_sync(rec);
1118
1119         record__synthesize(rec, true);
1120         if (target__none(&rec->opts.target))
1121                 record__synthesize_workload(rec, true);
1122
1123         rec->samples = 0;
1124         record__finish_output(rec);
1125         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1126         if (err) {
1127                 pr_err("Failed to get current timestamp\n");
1128                 return -EINVAL;
1129         }
1130
1131         fd = perf_data__switch(data, timestamp,
1132                                     rec->session->header.data_offset,
1133                                     at_exit, &new_filename);
1134         if (fd >= 0 && !at_exit) {
1135                 rec->bytes_written = 0;
1136                 rec->session->header.data_size = 0;
1137         }
1138
1139         if (!quiet)
1140                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1141                         data->path, timestamp);
1142
1143         if (rec->switch_output.num_files) {
1144                 int n = rec->switch_output.cur_file + 1;
1145
1146                 if (n >= rec->switch_output.num_files)
1147                         n = 0;
1148                 rec->switch_output.cur_file = n;
1149                 if (rec->switch_output.filenames[n]) {
1150                         remove(rec->switch_output.filenames[n]);
1151                         zfree(&rec->switch_output.filenames[n]);
1152                 }
1153                 rec->switch_output.filenames[n] = new_filename;
1154         } else {
1155                 free(new_filename);
1156         }
1157
1158         /* Output tracking events */
1159         if (!at_exit) {
1160                 record__synthesize(rec, false);
1161
1162                 /*
1163                  * In 'perf record --switch-output' without -a,
1164                  * record__synthesize() in record__switch_output() won't
1165                  * generate tracking events because there's no thread_map
1166                  * in the evlist. As a result, the newly created perf.data
1167                  * doesn't contain map and comm information.
1168                  * Create a fake thread_map and directly call
1169                  * perf_event__synthesize_thread_map() for those events.
1170                  */
1171                 if (target__none(&rec->opts.target))
1172                         record__synthesize_workload(rec, false);
1173         }
1174         return fd;
1175 }
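
/*
 * Illustrative sketch, not part of builtin-record.c: the round-robin slot
 * selection used above when only a bounded number of switch-output files is
 * kept - the index to overwrite simply wraps around num_files.  The helper
 * name is made up for illustration.
 */
#if 0
static int example_next_switch_output_slot(const struct switch_output *so)
{
        int n = so->cur_file + 1;

        return n >= so->num_files ? 0 : n;
}
#endif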
1176
1177 static volatile int workload_exec_errno;
1178
1179 /*
1180  * perf_evlist__prepare_workload will send a SIGUSR1
1181  * if the fork fails, since we asked for it by setting its
1182  * want_signal to true.
1183  */
1184 static void workload_exec_failed_signal(int signo __maybe_unused,
1185                                         siginfo_t *info,
1186                                         void *ucontext __maybe_unused)
1187 {
1188         workload_exec_errno = info->si_value.sival_int;
1189         done = 1;
1190         child_finished = 1;
1191 }
1192
1193 static void snapshot_sig_handler(int sig);
1194 static void alarm_sig_handler(int sig);
1195
1196 static const struct perf_event_mmap_page *
1197 perf_evlist__pick_pc(struct evlist *evlist)
1198 {
1199         if (evlist) {
1200                 if (evlist->mmap && evlist->mmap[0].base)
1201                         return evlist->mmap[0].base;
1202                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1203                         return evlist->overwrite_mmap[0].base;
1204         }
1205         return NULL;
1206 }
1207
1208 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1209 {
1210         const struct perf_event_mmap_page *pc;
1211
1212         pc = perf_evlist__pick_pc(rec->evlist);
1213         if (pc)
1214                 return pc;
1215         return NULL;
1216 }
1217
1218 static int record__synthesize(struct record *rec, bool tail)
1219 {
1220         struct perf_session *session = rec->session;
1221         struct machine *machine = &session->machines.host;
1222         struct perf_data *data = &rec->data;
1223         struct record_opts *opts = &rec->opts;
1224         struct perf_tool *tool = &rec->tool;
1225         int fd = perf_data__fd(data);
1226         int err = 0;
1227
1228         if (rec->opts.tail_synthesize != tail)
1229                 return 0;
1230
1231         if (data->is_pipe) {
1232                 /*
1233                  * We need to synthesize events first, because some
1234                  * features work on top of them (on the report side).
1235                  */
1236                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1237                                                    process_synthesized_event);
1238                 if (err < 0) {
1239                         pr_err("Couldn't synthesize attrs.\n");
1240                         goto out;
1241                 }
1242
1243                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1244                                                       process_synthesized_event);
1245                 if (err < 0) {
1246                         pr_err("Couldn't synthesize features.\n");
1247                         return err;
1248                 }
1249
1250                 if (have_tracepoints(&rec->evlist->core.entries)) {
1251                         /*
1252                          * FIXME err <= 0 here actually means that
1253                          * there were no tracepoints, so it's not really
1254                          * an error, just that we don't need to
1255                          * synthesize anything.  We really have to
1256                          * return this more properly and also
1257                          * propagate errors that now are calling die().
1258                          */
1259                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1260                                                                   process_synthesized_event);
1261                         if (err <= 0) {
1262                                 pr_err("Couldn't record tracing data.\n");
1263                                 goto out;
1264                         }
1265                         rec->bytes_written += err;
1266                 }
1267         }
1268
1269         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1270                                           process_synthesized_event, machine);
1271         if (err)
1272                 goto out;
1273
1274         if (rec->opts.full_auxtrace) {
1275                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1276                                         session, process_synthesized_event);
1277                 if (err)
1278                         goto out;
1279         }
1280
1281         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1282                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1283                                                          machine);
1284                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1285                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1286                                    "Check /proc/kallsyms permission or run as root.\n");
1287
1288                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1289                                                      machine);
1290                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1291                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1292                                    "Check /proc/modules permission or run as root.\n");
1293         }
1294
1295         if (perf_guest) {
1296                 machines__process_guests(&session->machines,
1297                                          perf_event__synthesize_guest_os, tool);
1298         }
1299
1300         err = perf_event__synthesize_extra_attr(&rec->tool,
1301                                                 rec->evlist,
1302                                                 process_synthesized_event,
1303                                                 data->is_pipe);
1304         if (err)
1305                 goto out;
1306
1307         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1308                                                  process_synthesized_event,
1309                                                 NULL);
1310         if (err < 0) {
1311                 pr_err("Couldn't synthesize thread map.\n");
1312                 return err;
1313         }
1314
1315         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1316                                              process_synthesized_event, NULL);
1317         if (err < 0) {
1318                 pr_err("Couldn't synthesize cpu map.\n");
1319                 return err;
1320         }
1321
1322         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1323                                                 machine, opts);
1324         if (err < 0)
1325                 pr_warning("Couldn't synthesize bpf events.\n");
1326
1327         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1328                                             process_synthesized_event, opts->sample_address,
1329                                             1);
1330 out:
1331         return err;
1332 }
1333
1334 static int __cmd_record(struct record *rec, int argc, const char **argv)
1335 {
1336         int err;
1337         int status = 0;
1338         unsigned long waking = 0;
1339         const bool forks = argc > 0;
1340         struct perf_tool *tool = &rec->tool;
1341         struct record_opts *opts = &rec->opts;
1342         struct perf_data *data = &rec->data;
1343         struct perf_session *session;
1344         bool disabled = false, draining = false;
1345         struct evlist *sb_evlist = NULL;
1346         int fd;
1347         float ratio = 0;
1348
1349         atexit(record__sig_exit);
1350         signal(SIGCHLD, sig_handler);
1351         signal(SIGINT, sig_handler);
1352         signal(SIGTERM, sig_handler);
1353         signal(SIGSEGV, sigsegv_handler);
1354
1355         if (rec->opts.record_namespaces)
1356                 tool->namespace_events = true;
1357
1358         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1359                 signal(SIGUSR2, snapshot_sig_handler);
1360                 if (rec->opts.auxtrace_snapshot_mode)
1361                         trigger_on(&auxtrace_snapshot_trigger);
1362                 if (rec->switch_output.enabled)
1363                         trigger_on(&switch_output_trigger);
1364         } else {
1365                 signal(SIGUSR2, SIG_IGN);
1366         }
1367
1368         session = perf_session__new(data, false, tool);
1369         if (IS_ERR(session)) {
1370                 pr_err("Perf session creation failed.\n");
1371                 return PTR_ERR(session);
1372         }
1373
1374         fd = perf_data__fd(data);
1375         rec->session = session;
1376
1377         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1378                 pr_err("Compression initialization failed.\n");
1379                 return -1;
1380         }
1381
1382         session->header.env.comp_type  = PERF_COMP_ZSTD;
1383         session->header.env.comp_level = rec->opts.comp_level;
1384
1385         record__init_features(rec);
1386
1387         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1388                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1389
1390         if (forks) {
1391                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1392                                                     argv, data->is_pipe,
1393                                                     workload_exec_failed_signal);
1394                 if (err < 0) {
1395                         pr_err("Couldn't run the workload!\n");
1396                         status = err;
1397                         goto out_delete_session;
1398                 }
1399         }
1400
1401         /*
1402          * If we have just a single event and are sending data
1403          * through a pipe, we need to force the id allocation,
1404          * because we synthesize the event name through the pipe
1405          * and need the id for that.
1406          */
1407         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1408                 rec->opts.sample_id = true;
1409
1410         if (record__open(rec) != 0) {
1411                 err = -1;
1412                 goto out_child;
1413         }
1414         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1415
1416         err = bpf__apply_obj_config();
1417         if (err) {
1418                 char errbuf[BUFSIZ];
1419
1420                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1421                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1422                          errbuf);
1423                 goto out_child;
1424         }
1425
1426         /*
1427          * Normally perf_session__new would do this, but it doesn't have the
1428          * evlist.
1429          */
1430         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1431                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1432                 rec->tool.ordered_events = false;
1433         }
1434
1435         if (!rec->evlist->nr_groups)
1436                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1437
1438         if (data->is_pipe) {
1439                 err = perf_header__write_pipe(fd);
1440                 if (err < 0)
1441                         goto out_child;
1442         } else {
1443                 err = perf_session__write_header(session, rec->evlist, fd, false);
1444                 if (err < 0)
1445                         goto out_child;
1446         }
1447
1448         if (!rec->no_buildid
1449             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1450                 pr_err("Couldn't generate buildids. "
1451                        "Use --no-buildid to profile anyway.\n");
1452                 err = -1;
1453                 goto out_child;
1454         }
1455
1456         if (!opts->no_bpf_event)
1457                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1458
1459         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1460                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1461                 opts->no_bpf_event = true;
1462         }
1463
1464         err = record__synthesize(rec, false);
1465         if (err < 0)
1466                 goto out_child;
1467
1468         if (rec->realtime_prio) {
1469                 struct sched_param param;
1470
1471                 param.sched_priority = rec->realtime_prio;
1472                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1473                         pr_err("Could not set realtime priority.\n");
1474                         err = -1;
1475                         goto out_child;
1476                 }
1477         }
1478
1479         /*
1480          * When perf is starting the traced process, all the events
1481          * (apart from group members) have enable_on_exec=1 set,
1482          * so don't spoil it by prematurely enabling them.
1483          */
1484         if (!target__none(&opts->target) && !opts->initial_delay)
1485                 evlist__enable(rec->evlist);
1486
1487         /*
1488          * Let the child rip
1489          */
1490         if (forks) {
1491                 struct machine *machine = &session->machines.host;
1492                 union perf_event *event;
1493                 pid_t tgid;
1494
1495                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1496                 if (event == NULL) {
1497                         err = -ENOMEM;
1498                         goto out_child;
1499                 }
1500
1501                 /*
1502                  * Some H/W events are generated before the COMM event,
1503                  * which is emitted during exec(), so perf script
1504                  * cannot see a correct process name for those events.
1505                  * Synthesize a COMM event up front to avoid that.
1506                  */
1507                 tgid = perf_event__synthesize_comm(tool, event,
1508                                                    rec->evlist->workload.pid,
1509                                                    process_synthesized_event,
1510                                                    machine);
1511                 free(event);
1512
1513                 if (tgid == -1)
1514                         goto out_child;
1515
1516                 event = malloc(sizeof(event->namespaces) +
1517                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1518                                machine->id_hdr_size);
1519                 if (event == NULL) {
1520                         err = -ENOMEM;
1521                         goto out_child;
1522                 }
1523
1524                 /*
1525                  * Synthesize NAMESPACES event for the command specified.
1526                  */
1527                 perf_event__synthesize_namespaces(tool, event,
1528                                                   rec->evlist->workload.pid,
1529                                                   tgid, process_synthesized_event,
1530                                                   machine);
1531                 free(event);
1532
1533                 perf_evlist__start_workload(rec->evlist);
1534         }
1535
1536         if (opts->initial_delay) {
1537                 usleep(opts->initial_delay * USEC_PER_MSEC);
1538                 evlist__enable(rec->evlist);
1539         }
1540
1541         trigger_ready(&auxtrace_snapshot_trigger);
1542         trigger_ready(&switch_output_trigger);
1543         perf_hooks__invoke_record_start();
1544         for (;;) {
1545                 unsigned long long hits = rec->samples;
1546
1547                 /*
1548                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1549                  * here: when done == true and hits != rec->samples
1550                  * in the previous round.
1551                  *
1552                  * perf_evlist__toggle_bkw_mmap() ensures we never
1553                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1554                  */
1555                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1556                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1557
1558                 if (record__mmap_read_all(rec, false) < 0) {
1559                         trigger_error(&auxtrace_snapshot_trigger);
1560                         trigger_error(&switch_output_trigger);
1561                         err = -1;
1562                         goto out_child;
1563                 }
1564
1565                 if (auxtrace_record__snapshot_started) {
1566                         auxtrace_record__snapshot_started = 0;
1567                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1568                                 record__read_auxtrace_snapshot(rec, false);
1569                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1570                                 pr_err("AUX area tracing snapshot failed\n");
1571                                 err = -1;
1572                                 goto out_child;
1573                         }
1574                 }
1575
1576                 if (trigger_is_hit(&switch_output_trigger)) {
1577                         /*
1578                          * If switch_output_trigger is hit, the data in the
1579                          * overwritable ring buffer should have been collected,
1580                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1581                          *
1582                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1583                          * record__mmap_read_all() did not collect data from the
1584                          * overwritable ring buffer. Read again.
1585                          */
1586                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1587                                 continue;
1588                         trigger_ready(&switch_output_trigger);
1589
1590                         /*
1591                          * Re-enable events in the overwrite ring buffer after
1592                          * record__mmap_read_all(): we should have collected
1593                          * data from it.
1594                          */
1595                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1596
1597                         if (!quiet)
1598                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1599                                         waking);
1600                         waking = 0;
1601                         fd = record__switch_output(rec, false);
1602                         if (fd < 0) {
1603                                 pr_err("Failed to switch to new file\n");
1604                                 trigger_error(&switch_output_trigger);
1605                                 err = fd;
1606                                 goto out_child;
1607                         }
1608
1609                         /* re-arm the alarm */
1610                         if (rec->switch_output.time)
1611                                 alarm(rec->switch_output.time);
1612                 }
1613
1614                 if (hits == rec->samples) {
1615                         if (done || draining)
1616                                 break;
1617                         err = perf_evlist__poll(rec->evlist, -1);
1618                         /*
1619                          * Propagate the error only if there is one. Ignore a
1620                          * positive number of returned events and EINTR.
1621                          */
1622                         if (err > 0 || (err < 0 && errno == EINTR))
1623                                 err = 0;
1624                         waking++;
1625
1626                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1627                                 draining = true;
1628                 }
1629
1630                 /*
1631                  * When perf started the traced process, the events die with it
1632                  * at the end and we wait for that, so there is no need to
1633                  * disable the events in this case.
1634                  */
1635                 if (done && !disabled && !target__none(&opts->target)) {
1636                         trigger_off(&auxtrace_snapshot_trigger);
1637                         evlist__disable(rec->evlist);
1638                         disabled = true;
1639                 }
1640         }
1641
1642         trigger_off(&auxtrace_snapshot_trigger);
1643         trigger_off(&switch_output_trigger);
1644
1645         if (opts->auxtrace_snapshot_on_exit)
1646                 record__auxtrace_snapshot_exit(rec);
1647
1648         if (forks && workload_exec_errno) {
1649                 char msg[STRERR_BUFSIZE];
1650                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1651                 pr_err("Workload failed: %s\n", emsg);
1652                 err = -1;
1653                 goto out_child;
1654         }
1655
1656         if (!quiet)
1657                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1658
1659         if (target__none(&rec->opts.target))
1660                 record__synthesize_workload(rec, true);
1661
1662 out_child:
1663         record__mmap_read_all(rec, true);
1664         record__aio_mmap_read_sync(rec);
1665
1666         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1667                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1668                 session->header.env.comp_ratio = ratio + 0.5;
1669         }
1670
1671         if (forks) {
1672                 int exit_status;
1673
1674                 if (!child_finished)
1675                         kill(rec->evlist->workload.pid, SIGTERM);
1676
1677                 wait(&exit_status);
1678
1679                 if (err < 0)
1680                         status = err;
1681                 else if (WIFEXITED(exit_status))
1682                         status = WEXITSTATUS(exit_status);
1683                 else if (WIFSIGNALED(exit_status))
1684                         signr = WTERMSIG(exit_status);
1685         } else
1686                 status = err;
1687
1688         record__synthesize(rec, true);
1689         /* this will be recalculated during process_buildids() */
1690         rec->samples = 0;
1691
1692         if (!err) {
1693                 if (!rec->timestamp_filename) {
1694                         record__finish_output(rec);
1695                 } else {
1696                         fd = record__switch_output(rec, true);
1697                         if (fd < 0) {
1698                                 status = fd;
1699                                 goto out_delete_session;
1700                         }
1701                 }
1702         }
1703
1704         perf_hooks__invoke_record_end();
1705
1706         if (!err && !quiet) {
1707                 char samples[128];
1708                 const char *postfix = rec->timestamp_filename ?
1709                                         ".<timestamp>" : "";
1710
1711                 if (rec->samples && !rec->opts.full_auxtrace)
1712                         scnprintf(samples, sizeof(samples),
1713                                   " (%" PRIu64 " samples)", rec->samples);
1714                 else
1715                         samples[0] = '\0';
1716
1717                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1718                         perf_data__size(data) / 1024.0 / 1024.0,
1719                         data->path, postfix, samples);
1720                 if (ratio) {
1721                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1722                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1723                                         ratio);
1724                 }
1725                 fprintf(stderr, " ]\n");
1726         }
1727
1728 out_delete_session:
1729         zstd_fini(&session->zstd_data);
1730         perf_session__delete(session);
1731
1732         if (!opts->no_bpf_event)
1733                 perf_evlist__stop_sb_thread(sb_evlist);
1734         return status;
1735 }
1736
1737 static void callchain_debug(struct callchain_param *callchain)
1738 {
1739         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1740
1741         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1742
1743         if (callchain->record_mode == CALLCHAIN_DWARF)
1744                 pr_debug("callchain: stack dump size %d\n",
1745                          callchain->dump_size);
1746 }
1747
1748 int record_opts__parse_callchain(struct record_opts *record,
1749                                  struct callchain_param *callchain,
1750                                  const char *arg, bool unset)
1751 {
1752         int ret;
1753         callchain->enabled = !unset;
1754
1755         /* --no-call-graph */
1756         if (unset) {
1757                 callchain->record_mode = CALLCHAIN_NONE;
1758                 pr_debug("callchain: disabled\n");
1759                 return 0;
1760         }
1761
1762         ret = parse_callchain_record_opt(arg, callchain);
1763         if (!ret) {
1764                 /* Enable data address sampling for DWARF unwind. */
1765                 if (callchain->record_mode == CALLCHAIN_DWARF)
1766                         record->sample_address = true;
1767                 callchain_debug(callchain);
1768         }
1769
1770         return ret;
1771 }
1772
1773 int record_parse_callchain_opt(const struct option *opt,
1774                                const char *arg,
1775                                int unset)
1776 {
1777         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1778 }
1779
1780 int record_callchain_opt(const struct option *opt,
1781                          const char *arg __maybe_unused,
1782                          int unset __maybe_unused)
1783 {
1784         struct callchain_param *callchain = opt->value;
1785
1786         callchain->enabled = true;
1787
1788         if (callchain->record_mode == CALLCHAIN_NONE)
1789                 callchain->record_mode = CALLCHAIN_FP;
1790
1791         callchain_debug(callchain);
1792         return 0;
1793 }
1794
1795 static int perf_record_config(const char *var, const char *value, void *cb)
1796 {
1797         struct record *rec = cb;
1798
1799         if (!strcmp(var, "record.build-id")) {
1800                 if (!strcmp(value, "cache"))
1801                         rec->no_buildid_cache = false;
1802                 else if (!strcmp(value, "no-cache"))
1803                         rec->no_buildid_cache = true;
1804                 else if (!strcmp(value, "skip"))
1805                         rec->no_buildid = true;
1806                 else
1807                         return -1;
1808                 return 0;
1809         }
1810         if (!strcmp(var, "record.call-graph")) {
1811                 var = "call-graph.record-mode";
1812                 return perf_default_config(var, value, cb);
1813         }
1814 #ifdef HAVE_AIO_SUPPORT
1815         if (!strcmp(var, "record.aio")) {
1816                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1817                 if (!rec->opts.nr_cblocks)
1818                         rec->opts.nr_cblocks = nr_cblocks_default;
1819         }
1820 #endif
1821
1822         return 0;
1823 }
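/*
 * As a rough illustration (the values here are made up), the record.*
 * settings handled above would typically come from a ~/.perfconfig along
 * the lines of:
 *
 *   [record]
 *       build-id   = no-cache
 *       call-graph = dwarf
 *       aio        = 4
 */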
1824
1825 struct clockid_map {
1826         const char *name;
1827         int clockid;
1828 };
1829
1830 #define CLOCKID_MAP(n, c)       \
1831         { .name = n, .clockid = (c), }
1832
1833 #define CLOCKID_END     { .name = NULL, }
1834
1835
1836 /*
1837  * Add the missing ones; we need to build on many distros...
1838  */
1839 #ifndef CLOCK_MONOTONIC_RAW
1840 #define CLOCK_MONOTONIC_RAW 4
1841 #endif
1842 #ifndef CLOCK_BOOTTIME
1843 #define CLOCK_BOOTTIME 7
1844 #endif
1845 #ifndef CLOCK_TAI
1846 #define CLOCK_TAI 11
1847 #endif
1848
1849 static const struct clockid_map clockids[] = {
1850         /* available for all events, NMI safe */
1851         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1852         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1853
1854         /* available for some events */
1855         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1856         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1857         CLOCKID_MAP("tai", CLOCK_TAI),
1858
1859         /* available for the lazy */
1860         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1861         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1862         CLOCKID_MAP("real", CLOCK_REALTIME),
1863         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1864
1865         CLOCKID_END,
1866 };
1867
1868 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1869 {
1870         struct timespec res;
1871
1872         *res_ns = 0;
1873         if (!clock_getres(clk_id, &res))
1874                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1875         else
1876                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1877
1878         return 0;
1879 }
1880
1881 static int parse_clockid(const struct option *opt, const char *str, int unset)
1882 {
1883         struct record_opts *opts = (struct record_opts *)opt->value;
1884         const struct clockid_map *cm;
1885         const char *ostr = str;
1886
1887         if (unset) {
1888                 opts->use_clockid = 0;
1889                 return 0;
1890         }
1891
1892         /* no arg passed */
1893         if (!str)
1894                 return 0;
1895
1896         /* no setting it twice */
1897         if (opts->use_clockid)
1898                 return -1;
1899
1900         opts->use_clockid = true;
1901
1902         /* if it's a number, we're done */
1903         if (sscanf(str, "%d", &opts->clockid) == 1)
1904                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1905
1906         /* allow a "CLOCK_" prefix to the name */
1907         if (!strncasecmp(str, "CLOCK_", 6))
1908                 str += 6;
1909
1910         for (cm = clockids; cm->name; cm++) {
1911                 if (!strcasecmp(str, cm->name)) {
1912                         opts->clockid = cm->clockid;
1913                         return get_clockid_res(opts->clockid,
1914                                                &opts->clockid_res_ns);
1915                 }
1916         }
1917
1918         opts->use_clockid = false;
1919         ui__warning("unknown clockid %s, check man page\n", ostr);
1920         return -1;
1921 }
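/*
 * For illustration, each of the following forms is accepted by the parser
 * above and ends up in get_clockid_res():
 *
 *   perf record -k monotonic_raw ...   (name from the clockids[] table)
 *   perf record -k CLOCK_BOOTTIME ...  (the "CLOCK_" prefix is stripped)
 *   perf record -k 4 ...               (a raw clockid number)
 */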
1922
1923 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1924 {
1925         struct record_opts *opts = (struct record_opts *)opt->value;
1926
1927         if (unset || !str)
1928                 return 0;
1929
1930         if (!strcasecmp(str, "node"))
1931                 opts->affinity = PERF_AFFINITY_NODE;
1932         else if (!strcasecmp(str, "cpu"))
1933                 opts->affinity = PERF_AFFINITY_CPU;
1934
1935         return 0;
1936 }
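/*
 * Illustrative usage: '--affinity=node' makes the trace reading thread follow
 * the NUMA node CPU mask of the mmap buffer being processed, '--affinity=cpu'
 * makes it follow that buffer's CPU, and omitting the option keeps the
 * default PERF_AFFINITY_SYS (no re-pinning).
 */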
1937
1938 static int record__parse_mmap_pages(const struct option *opt,
1939                                     const char *str,
1940                                     int unset __maybe_unused)
1941 {
1942         struct record_opts *opts = opt->value;
1943         char *s, *p;
1944         unsigned int mmap_pages;
1945         int ret;
1946
1947         if (!str)
1948                 return -EINVAL;
1949
1950         s = strdup(str);
1951         if (!s)
1952                 return -ENOMEM;
1953
1954         p = strchr(s, ',');
1955         if (p)
1956                 *p = '\0';
1957
1958         if (*s) {
1959                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1960                 if (ret)
1961                         goto out_free;
1962                 opts->mmap_pages = mmap_pages;
1963         }
1964
1965         if (!p) {
1966                 ret = 0;
1967                 goto out_free;
1968         }
1969
1970         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1971         if (ret)
1972                 goto out_free;
1973
1974         opts->auxtrace_mmap_pages = mmap_pages;
1975
1976 out_free:
1977         free(s);
1978         return ret;
1979 }
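/*
 * Sketch of the accepted syntax: '-m 512' sets only the data mmap size,
 * while '-m 512,128' also sets the AUX area tracing mmap size from the part
 * after the comma; both halves are parsed by __perf_evlist__parse_mmap_pages().
 */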
1980
1981 static void switch_output_size_warn(struct record *rec)
1982 {
1983         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
1984         struct switch_output *s = &rec->switch_output;
1985
1986         wakeup_size /= 2;
1987
1988         if (s->size < wakeup_size) {
1989                 char buf[100];
1990
1991                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1992                 pr_warning("WARNING: switch-output data size is lower than "
1993                            "the wakeup kernel buffer size (%s); "
1994                            "expect bigger perf.data sizes\n", buf);
1995         }
1996 }
1997
1998 static int switch_output_setup(struct record *rec)
1999 {
2000         struct switch_output *s = &rec->switch_output;
2001         static struct parse_tag tags_size[] = {
2002                 { .tag  = 'B', .mult = 1       },
2003                 { .tag  = 'K', .mult = 1 << 10 },
2004                 { .tag  = 'M', .mult = 1 << 20 },
2005                 { .tag  = 'G', .mult = 1 << 30 },
2006                 { .tag  = 0 },
2007         };
2008         static struct parse_tag tags_time[] = {
2009                 { .tag  = 's', .mult = 1        },
2010                 { .tag  = 'm', .mult = 60       },
2011                 { .tag  = 'h', .mult = 60*60    },
2012                 { .tag  = 'd', .mult = 60*60*24 },
2013                 { .tag  = 0 },
2014         };
2015         unsigned long val;
2016
2017         if (!s->set)
2018                 return 0;
2019
2020         if (!strcmp(s->str, "signal")) {
2021                 s->signal = true;
2022                 pr_debug("switch-output with SIGUSR2 signal\n");
2023                 goto enabled;
2024         }
2025
2026         val = parse_tag_value(s->str, tags_size);
2027         if (val != (unsigned long) -1) {
2028                 s->size = val;
2029                 pr_debug("switch-output with %s size threshold\n", s->str);
2030                 goto enabled;
2031         }
2032
2033         val = parse_tag_value(s->str, tags_time);
2034         if (val != (unsigned long) -1) {
2035                 s->time = val;
2036                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2037                          s->str, s->time);
2038                 goto enabled;
2039         }
2040
2041         return -1;
2042
2043 enabled:
2044         rec->timestamp_filename = true;
2045         s->enabled              = true;
2046
2047         if (s->size && !rec->opts.no_buffering)
2048                 switch_output_size_warn(rec);
2049
2050         return 0;
2051 }
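/*
 * Examples of the three accepted forms, matching the parsing above:
 *
 *   --switch-output=signal   rotate the output file on SIGUSR2
 *   --switch-output=100M     rotate after roughly 100 MB of data (tags_size)
 *   --switch-output=30s      rotate every 30 seconds (tags_time)
 */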
2052
2053 static const char * const __record_usage[] = {
2054         "perf record [<options>] [<command>]",
2055         "perf record [<options>] -- <command> [<options>]",
2056         NULL
2057 };
2058 const char * const *record_usage = __record_usage;
2059
2060 /*
2061  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2062  * because we need access to it in record__exit, which is called after
2063  * cmd_record() exits; but since record_options needs to be accessible to
2064  * builtin-script, leave it here.
2065  *
2066  * At least we don't touch it in all the other functions here directly.
2067  *
2068  * Just say no to tons of global variables, sigh.
2069  */
2070 static struct record record = {
2071         .opts = {
2072                 .sample_time         = true,
2073                 .mmap_pages          = UINT_MAX,
2074                 .user_freq           = UINT_MAX,
2075                 .user_interval       = ULLONG_MAX,
2076                 .freq                = 4000,
2077                 .target              = {
2078                         .uses_mmap   = true,
2079                         .default_per_cpu = true,
2080                 },
2081                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2082         },
2083         .tool = {
2084                 .sample         = process_sample_event,
2085                 .fork           = perf_event__process_fork,
2086                 .exit           = perf_event__process_exit,
2087                 .comm           = perf_event__process_comm,
2088                 .namespaces     = perf_event__process_namespaces,
2089                 .mmap           = perf_event__process_mmap,
2090                 .mmap2          = perf_event__process_mmap2,
2091                 .ordered_events = true,
2092         },
2093 };
2094
2095 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2096         "\n\t\t\t\tDefault: fp";
2097
2098 static bool dry_run;
2099
2100 /*
2101  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2102  * with it and switch to using the library functions in perf_evlist that came
2103  * from builtin-record.c, i.e. use record_opts,
2104  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2105  * using pipes, etc.
2106  */
2107 static struct option __record_options[] = {
2108         OPT_CALLBACK('e', "event", &record.evlist, "event",
2109                      "event selector. use 'perf list' to list available events",
2110                      parse_events_option),
2111         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2112                      "event filter", parse_filter),
2113         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2114                            NULL, "don't record events from perf itself",
2115                            exclude_perf),
2116         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2117                     "record events on existing process id"),
2118         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2119                     "record events on existing thread id"),
2120         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2121                     "collect data with this RT SCHED_FIFO priority"),
2122         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2123                     "collect data without buffering"),
2124         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2125                     "collect raw sample records from all opened counters"),
2126         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2127                             "system-wide collection from all CPUs"),
2128         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2129                     "list of cpus to monitor"),
2130         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2131         OPT_STRING('o', "output", &record.data.path, "file",
2132                     "output file name"),
2133         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2134                         &record.opts.no_inherit_set,
2135                         "child tasks do not inherit counters"),
2136         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2137                     "synthesize non-sample events at the end of output"),
2138         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2139         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2140         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2141                     "Fail if the specified frequency can't be used"),
2142         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2143                      "profile at this frequency",
2144                       record__parse_freq),
2145         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2146                      "number of mmap data pages and AUX area tracing mmap pages",
2147                      record__parse_mmap_pages),
2148         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2149                      "Minimum number of bytes extracted from mmap data pages (default: 1)",
2150                      record__mmap_flush_parse),
2151         OPT_BOOLEAN(0, "group", &record.opts.group,
2152                     "put the counters into a counter group"),
2153         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2154                            NULL, "enables call-graph recording" ,
2155                            &record_callchain_opt),
2156         OPT_CALLBACK(0, "call-graph", &record.opts,
2157                      "record_mode[,record_size]", record_callchain_help,
2158                      &record_parse_callchain_opt),
2159         OPT_INCR('v', "verbose", &verbose,
2160                     "be more verbose (show counter open errors, etc)"),
2161         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2162         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2163                     "per thread counts"),
2164         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2165         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2166                     "Record the sample physical addresses"),
2167         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2168         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2169                         &record.opts.sample_time_set,
2170                         "Record the sample timestamps"),
2171         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2172                         "Record the sample period"),
2173         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2174                     "don't sample"),
2175         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2176                         &record.no_buildid_cache_set,
2177                         "do not update the buildid cache"),
2178         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2179                         &record.no_buildid_set,
2180                         "do not collect buildids in perf.data"),
2181         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2182                      "monitor event in cgroup name only",
2183                      parse_cgroups),
2184         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2185                   "ms to wait before starting measurement after program start"),
2186         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2187                    "user to profile"),
2188
2189         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2190                      "branch any", "sample any taken branches",
2191                      parse_branch_stack),
2192
2193         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2194                      "branch filter mask", "branch stack filter modes",
2195                      parse_branch_stack),
2196         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2197                     "sample by weight (on special events only)"),
2198         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2199                     "sample transaction flags (special events only)"),
2200         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2201                     "use per-thread mmaps"),
2202         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2203                     "sample selected machine registers on interrupt,"
2204                     " use '-I?' to list register names", parse_intr_regs),
2205         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2206                     "sample selected machine registers on interrupt,"
2207                     " use '--user-regs=?' to list register names", parse_user_regs),
2208         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2209                     "Record running/enabled time of read (:S) events"),
2210         OPT_CALLBACK('k', "clockid", &record.opts,
2211         "clockid", "clockid to use for events, see clock_gettime()",
2212         parse_clockid),
2213         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2214                           "opts", "AUX area tracing Snapshot Mode", ""),
2215         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2216                         "per thread proc mmap processing timeout in ms"),
2217         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2218                     "Record namespaces events"),
2219         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2220                     "Record context switch events"),
2221         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2222                          "Configure all used events to run in kernel space.",
2223                          PARSE_OPT_EXCLUSIVE),
2224         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2225                          "Configure all used events to run in user space.",
2226                          PARSE_OPT_EXCLUSIVE),
2227         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2228                     "collect kernel callchains"),
2229         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2230                     "collect user callchains"),
2231         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2232                    "clang binary to use for compiling BPF scriptlets"),
2233         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2234                    "options passed to clang when compiling BPF scriptlets"),
2235         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2236                    "file", "vmlinux pathname"),
2237         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2238                     "Record build-id of all DSOs regardless of hits"),
2239         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2240                     "append timestamp to output filename"),
2241         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2242                     "Record timestamp boundary (time of first/last samples)"),
2243         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2244                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2245                           "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2246                           "signal"),
2247         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2248                    "Limit number of switch output generated files"),
2249         OPT_BOOLEAN(0, "dry-run", &dry_run,
2250                     "Parse options then exit"),
2251 #ifdef HAVE_AIO_SUPPORT
2252         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2253                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2254                      record__aio_parse),
2255 #endif
2256         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2257                      "Set the affinity mask of the trace reading thread to the NUMA node CPU mask or the CPU of the processed mmap buffer",
2258                      record__parse_affinity),
2259 #ifdef HAVE_ZSTD_SUPPORT
2260         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2261                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2262                             record__parse_comp_level),
2263 #endif
2264         OPT_END()
2265 };
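/*
 * A hypothetical invocation combining several of the options above:
 *
 *   perf record -F 999 -g --switch-output=1G -o my.data -- ./workload
 *
 * i.e. sample at 999 Hz with call graphs, write to my.data and rotate to a
 * new timestamped file each time roughly 1 GB has been written.
 */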
2266
2267 struct option *record_options = __record_options;
2268
2269 int cmd_record(int argc, const char **argv)
2270 {
2271         int err;
2272         struct record *rec = &record;
2273         char errbuf[BUFSIZ];
2274
2275         setlocale(LC_ALL, "");
2276
2277 #ifndef HAVE_LIBBPF_SUPPORT
2278 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2279         set_nobuild('\0', "clang-path", true);
2280         set_nobuild('\0', "clang-opt", true);
2281 # undef set_nobuild
2282 #endif
2283
2284 #ifndef HAVE_BPF_PROLOGUE
2285 # if !defined (HAVE_DWARF_SUPPORT)
2286 #  define REASON  "NO_DWARF=1"
2287 # elif !defined (HAVE_LIBBPF_SUPPORT)
2288 #  define REASON  "NO_LIBBPF=1"
2289 # else
2290 #  define REASON  "this architecture doesn't support BPF prologue"
2291 # endif
2292 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2293         set_nobuild('\0', "vmlinux", true);
2294 # undef set_nobuild
2295 # undef REASON
2296 #endif
2297
2298         CPU_ZERO(&rec->affinity_mask);
2299         rec->opts.affinity = PERF_AFFINITY_SYS;
2300
2301         rec->evlist = evlist__new();
2302         if (rec->evlist == NULL)
2303                 return -ENOMEM;
2304
2305         err = perf_config(perf_record_config, rec);
2306         if (err)
2307                 return err;
2308
2309         argc = parse_options(argc, argv, record_options, record_usage,
2310                             PARSE_OPT_STOP_AT_NON_OPTION);
2311         if (quiet)
2312                 perf_quiet_option();
2313
2314         /* Make system wide (-a) the default target. */
2315         if (!argc && target__none(&rec->opts.target))
2316                 rec->opts.target.system_wide = true;
2317
2318         if (nr_cgroups && !rec->opts.target.system_wide) {
2319                 usage_with_options_msg(record_usage, record_options,
2320                         "cgroup monitoring only available in system-wide mode");
2321
2322         }
2323
2324         if (rec->opts.comp_level != 0) {
2325                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2326                 rec->no_buildid = true;
2327         }
2328
2329         if (rec->opts.record_switch_events &&
2330             !perf_can_record_switch_events()) {
2331                 ui__error("kernel does not support recording context switch events\n");
2332                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2333                 return -EINVAL;
2334         }
2335
2336         if (switch_output_setup(rec)) {
2337                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2338                 return -EINVAL;
2339         }
2340
2341         if (rec->switch_output.time) {
2342                 signal(SIGALRM, alarm_sig_handler);
2343                 alarm(rec->switch_output.time);
2344         }
2345
2346         if (rec->switch_output.num_files) {
2347                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2348                                                       sizeof(char *));
2349                 if (!rec->switch_output.filenames)
2350                         return -EINVAL;
2351         }
2352
2353         /*
2354          * Allow aliases to facilitate the lookup of symbols for address
2355          * filters. Refer to auxtrace_parse_filters().
2356          */
2357         symbol_conf.allow_aliases = true;
2358
2359         symbol__init(NULL);
2360
2361         err = record__auxtrace_init(rec);
2362         if (err)
2363                 goto out;
2364
2365         if (dry_run)
2366                 goto out;
2367
2368         err = bpf__setup_stdout(rec->evlist);
2369         if (err) {
2370                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2371                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2372                          errbuf);
2373                 goto out;
2374         }
2375
2376         err = -ENOMEM;
2377
2378         if (rec->no_buildid_cache || rec->no_buildid) {
2379                 disable_buildid_cache();
2380         } else if (rec->switch_output.enabled) {
2381                 /*
2382                  * In 'perf record --switch-output', disable buildid
2383                  * generation by default to reduce data file switching
2384                  * overhead. Still generate buildids if they are explicitly
2385                  * requested using
2386                  *
2387                  *  perf record --switch-output --no-no-buildid \
2388                  *              --no-no-buildid-cache
2389                  *
2390                  * The following code is equivalent to:
2391                  *
2392                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2393                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2394                  *         disable_buildid_cache();
2395                  */
2396                 bool disable = true;
2397
2398                 if (rec->no_buildid_set && !rec->no_buildid)
2399                         disable = false;
2400                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2401                         disable = false;
2402                 if (disable) {
2403                         rec->no_buildid = true;
2404                         rec->no_buildid_cache = true;
2405                         disable_buildid_cache();
2406                 }
2407         }
2408
2409         if (record.opts.overwrite)
2410                 record.opts.tail_synthesize = true;
2411
2412         if (rec->evlist->core.nr_entries == 0 &&
2413             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2414                 pr_err("Not enough memory for event selector list\n");
2415                 goto out;
2416         }
2417
2418         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2419                 rec->opts.no_inherit = true;
2420
2421         err = target__validate(&rec->opts.target);
2422         if (err) {
2423                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2424                 ui__warning("%s\n", errbuf);
2425         }
2426
2427         err = target__parse_uid(&rec->opts.target);
2428         if (err) {
2429                 int saved_errno = errno;
2430
2431                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2432                 ui__error("%s", errbuf);
2433
2434                 err = -saved_errno;
2435                 goto out;
2436         }
2437
2438         /* Enable ignoring missing threads when -u/-p option is defined. */
2439         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2440
2441         err = -ENOMEM;
2442         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2443                 usage_with_options(record_usage, record_options);
2444
2445         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2446         if (err)
2447                 goto out;
2448
2449         /*
2450          * We take all buildids when the file contains
2451          * AUX area tracing data because we do not decode the
2452          * trace, as that would take too long.
2453          */
2454         if (rec->opts.full_auxtrace)
2455                 rec->buildid_all = true;
2456
2457         if (record_opts__config(&rec->opts)) {
2458                 err = -EINVAL;
2459                 goto out;
2460         }
2461
2462         if (rec->opts.nr_cblocks > nr_cblocks_max)
2463                 rec->opts.nr_cblocks = nr_cblocks_max;
2464         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2465
2466         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2467         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2468
2469         if (rec->opts.comp_level > comp_level_max)
2470                 rec->opts.comp_level = comp_level_max;
2471         pr_debug("comp level: %d\n", rec->opts.comp_level);
2472
2473         err = __cmd_record(&record, argc, argv);
2474 out:
2475         evlist__delete(rec->evlist);
2476         symbol__exit();
2477         auxtrace_record__free(rec->itr);
2478         return err;
2479 }
2480
2481 static void snapshot_sig_handler(int sig __maybe_unused)
2482 {
2483         struct record *rec = &record;
2484
2485         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2486                 trigger_hit(&auxtrace_snapshot_trigger);
2487                 auxtrace_record__snapshot_started = 1;
2488                 if (auxtrace_record__snapshot_start(record.itr))
2489                         trigger_error(&auxtrace_snapshot_trigger);
2490         }
2491
2492         if (switch_output_signal(rec))
2493                 trigger_hit(&switch_output_trigger);
2494 }
2495
2496 static void alarm_sig_handler(int sig __maybe_unused)
2497 {
2498         struct record *rec = &record;
2499
2500         if (switch_output_time(rec))
2501                 trigger_hit(&switch_output_trigger);
2502 }