tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/synthetic-events.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "util/bpf-event.h"
45 #include "asm/bug.h"
46 #include "perf.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <locale.h>
51 #include <poll.h>
52 #include <unistd.h>
53 #include <sched.h>
54 #include <signal.h>
55 #include <sys/mman.h>
56 #include <sys/wait.h>
57 #include <linux/string.h>
58 #include <linux/time64.h>
59 #include <linux/zalloc.h>
60
61 struct switch_output {
62         bool             enabled;
63         bool             signal;
64         unsigned long    size;
65         unsigned long    time;
66         const char      *str;
67         bool             set;
68         char             **filenames;
69         int              num_files;
70         int              cur_file;
71 };
72
73 struct record {
74         struct perf_tool        tool;
75         struct record_opts      opts;
76         u64                     bytes_written;
77         struct perf_data        data;
78         struct auxtrace_record  *itr;
79         struct evlist   *evlist;
80         struct perf_session     *session;
81         int                     realtime_prio;
82         bool                    no_buildid;
83         bool                    no_buildid_set;
84         bool                    no_buildid_cache;
85         bool                    no_buildid_cache_set;
86         bool                    buildid_all;
87         bool                    timestamp_filename;
88         bool                    timestamp_boundary;
89         struct switch_output    switch_output;
90         unsigned long long      samples;
91         cpu_set_t               affinity_mask;
92 };
93
94 static volatile int auxtrace_record__snapshot_started;
95 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
96 static DEFINE_TRIGGER(switch_output_trigger);
97
98 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
99         "SYS", "NODE", "CPU"
100 };
101
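/*
 * Helpers deciding whether the output file should be switched: on an
 * external signal, once rec->switch_output.size bytes have been written,
 * or on the time-based trigger. Each also requires switch_output_trigger
 * to be in the ready state.
 */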
102 static bool switch_output_signal(struct record *rec)
103 {
104         return rec->switch_output.signal &&
105                trigger_is_ready(&switch_output_trigger);
106 }
107
108 static bool switch_output_size(struct record *rec)
109 {
110         return rec->switch_output.size &&
111                trigger_is_ready(&switch_output_trigger) &&
112                (rec->bytes_written >= rec->switch_output.size);
113 }
114
115 static bool switch_output_time(struct record *rec)
116 {
117         return rec->switch_output.time &&
118                trigger_is_ready(&switch_output_trigger);
119 }
120
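/*
 * Write a block of data to the perf.data output file, account for it in
 * rec->bytes_written and hit the switch-output trigger once the configured
 * size threshold is reached.
 */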
121 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
122                          void *bf, size_t size)
123 {
124         struct perf_data_file *file = &rec->session->data->file;
125
126         if (perf_data_file__write(file, bf, size) < 0) {
127                 pr_err("failed to write perf data, error: %m\n");
128                 return -1;
129         }
130
131         rec->bytes_written += size;
132
133         if (switch_output_size(rec))
134                 trigger_hit(&switch_output_trigger);
135
136         return 0;
137 }
138
139 static int record__aio_enabled(struct record *rec);
140 static int record__comp_enabled(struct record *rec);
141 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
142                             void *src, size_t src_size);
143
144 #ifdef HAVE_AIO_SUPPORT
145 static int record__aio_write(struct aiocb *cblock, int trace_fd,
146                 void *buf, size_t size, off_t off)
147 {
148         int rc;
149
150         cblock->aio_fildes = trace_fd;
151         cblock->aio_buf    = buf;
152         cblock->aio_nbytes = size;
153         cblock->aio_offset = off;
154         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
155
156         do {
157                 rc = aio_write(cblock);
158                 if (rc == 0) {
159                         break;
160                 } else if (errno != EAGAIN) {
161                         cblock->aio_fildes = -1;
162                         pr_err("failed to queue perf data, error: %m\n");
163                         break;
164                 }
165         } while (1);
166
167         return rc;
168 }
169
170 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
171 {
172         void *rem_buf;
173         off_t rem_off;
174         size_t rem_size;
175         int rc, aio_errno;
176         ssize_t aio_ret, written;
177
178         aio_errno = aio_error(cblock);
179         if (aio_errno == EINPROGRESS)
180                 return 0;
181
182         written = aio_ret = aio_return(cblock);
183         if (aio_ret < 0) {
184                 if (aio_errno != EINTR)
185                         pr_err("failed to write perf data, error: %m\n");
186                 written = 0;
187         }
188
189         rem_size = cblock->aio_nbytes - written;
190
191         if (rem_size == 0) {
192                 cblock->aio_fildes = -1;
193                 /*
194                  * md->refcount is incremented in record__aio_pushfn() for
195                  * every aio write request started in record__aio_push() so
196                  * decrement it because the request is now complete.
197                  */
198                 perf_mmap__put(md);
199                 rc = 1;
200         } else {
201                 /*
202                  * An aio write request may need to be restarted with the
203                  * remainder if the kernel didn't write the whole
204                  * chunk at once.
205                  */
206                 rem_off = cblock->aio_offset + written;
207                 rem_buf = (void *)(cblock->aio_buf + written);
208                 record__aio_write(cblock, cblock->aio_fildes,
209                                 rem_buf, rem_size, rem_off);
210                 rc = 0;
211         }
212
213         return rc;
214 }
215
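/*
 * Reap completed aio writes for this mmap. With sync_all == false the index
 * of the first control block free for reuse is returned; with
 * sync_all == true the function suspends until all outstanding writes have
 * completed and then returns -1.
 */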
216 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
217 {
218         struct aiocb **aiocb = md->aio.aiocb;
219         struct aiocb *cblocks = md->aio.cblocks;
220         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
221         int i, do_suspend;
222
223         do {
224                 do_suspend = 0;
225                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
226                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
227                                 if (sync_all)
228                                         aiocb[i] = NULL;
229                                 else
230                                         return i;
231                         } else {
232                                 /*
233                  * The started aio write is not complete yet,
234                  * so it has to be waited on before the
235                  * next allocation.
236                                  */
237                                 aiocb[i] = &cblocks[i];
238                                 do_suspend = 1;
239                         }
240                 }
241                 if (!do_suspend)
242                         return -1;
243
244                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
245                         if (!(errno == EAGAIN || errno == EINTR))
246                                 pr_err("failed to sync perf data, error: %m\n");
247                 }
248         } while (1);
249 }
250
251 struct record_aio {
252         struct record   *rec;
253         void            *data;
254         size_t          size;
255 };
256
257 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
258 {
259         struct record_aio *aio = to;
260
261         /*
262          * The map->base data pointed to by buf is copied into a free map->aio.data[]
263          * buffer to release space in the kernel buffer as fast as possible, via
264          * perf_mmap__consume() called from the perf_mmap__push() function.
265          *
266          * That lets the kernel proceed with storing more profiling data into
267          * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
268          *
269          * Copying can be done in two steps in case the chunk of profiling data
270          * crosses the upper bound of the kernel buffer. In this case we first move
271          * the part of the data from map->start to the upper bound and then the
272          * remainder from the beginning of the kernel buffer to the end of the data chunk.
273          */
274
275         if (record__comp_enabled(aio->rec)) {
276                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
277                                      perf_mmap__mmap_len(map) - aio->size,
278                                      buf, size);
279         } else {
280                 memcpy(aio->data + aio->size, buf, size);
281         }
282
283         if (!aio->size) {
284                 /*
285                  * Increment map->refcount to guard the map->aio.data[] buffer
286                  * from premature deallocation, because the map object can be
287                  * released before the aio write request started on the
288                  * map->aio.data[] buffer completes.
289                  *
290                  * perf_mmap__put() is done in record__aio_complete()
291                  * after the started aio request completes, or in record__aio_push()
292                  * if the request failed to start.
293                  */
294                 perf_mmap__get(map);
295         }
296
297         aio->size += size;
298
299         return size;
300 }
301
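/*
 * Copy (or compress) the ring buffer contents into a free aio buffer and
 * queue an asynchronous write to the trace file at offset *off. On success
 * *off and rec->bytes_written are advanced; on failure the mmap reference
 * taken in record__aio_pushfn() is dropped again.
 */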
302 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
303 {
304         int ret, idx;
305         int trace_fd = rec->session->data->file.fd;
306         struct record_aio aio = { .rec = rec, .size = 0 };
307
308         /*
309          * Call record__aio_sync() to wait until a map->aio.data[] buffer
310          * becomes available after the previous aio write operation.
311          */
312
313         idx = record__aio_sync(map, false);
314         aio.data = map->aio.data[idx];
315         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
316         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
317                 return ret;
318
319         rec->samples++;
320         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
321         if (!ret) {
322                 *off += aio.size;
323                 rec->bytes_written += aio.size;
324                 if (switch_output_size(rec))
325                         trigger_hit(&switch_output_trigger);
326         } else {
327                 /*
328                  * Drop the map->refcount taken in record__aio_pushfn() if the
329                  * record__aio_write() operation failed to start; otherwise
330                  * map->refcount is decremented in record__aio_complete() after
331                  * the aio write operation finishes successfully.
332                  */
333                 perf_mmap__put(map);
334         }
335
336         return ret;
337 }
338
339 static off_t record__aio_get_pos(int trace_fd)
340 {
341         return lseek(trace_fd, 0, SEEK_CUR);
342 }
343
344 static void record__aio_set_pos(int trace_fd, off_t pos)
345 {
346         lseek(trace_fd, pos, SEEK_SET);
347 }
348
349 static void record__aio_mmap_read_sync(struct record *rec)
350 {
351         int i;
352         struct evlist *evlist = rec->evlist;
353         struct perf_mmap *maps = evlist->mmap;
354
355         if (!record__aio_enabled(rec))
356                 return;
357
358         for (i = 0; i < evlist->nr_mmaps; i++) {
359                 struct perf_mmap *map = &maps[i];
360
361                 if (map->base)
362                         record__aio_sync(map, true);
363         }
364 }
365
366 static int nr_cblocks_default = 1;
367 static int nr_cblocks_max = 4;
368
369 static int record__aio_parse(const struct option *opt,
370                              const char *str,
371                              int unset)
372 {
373         struct record_opts *opts = (struct record_opts *)opt->value;
374
375         if (unset) {
376                 opts->nr_cblocks = 0;
377         } else {
378                 if (str)
379                         opts->nr_cblocks = strtol(str, NULL, 0);
380                 if (!opts->nr_cblocks)
381                         opts->nr_cblocks = nr_cblocks_default;
382         }
383
384         return 0;
385 }
386 #else /* HAVE_AIO_SUPPORT */
387 static int nr_cblocks_max = 0;
388
389 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
390                             off_t *off __maybe_unused)
391 {
392         return -1;
393 }
394
395 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
396 {
397         return -1;
398 }
399
400 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
401 {
402 }
403
404 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
405 {
406 }
407 #endif
408
409 static int record__aio_enabled(struct record *rec)
410 {
411         return rec->opts.nr_cblocks > 0;
412 }
413
414 #define MMAP_FLUSH_DEFAULT 1
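/*
 * Parse the mmap flush threshold: a plain byte count or a value with a
 * B/K/M/G suffix. It defaults to MMAP_FLUSH_DEFAULT and is capped at a
 * quarter of the mmap buffer size.
 */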
415 static int record__mmap_flush_parse(const struct option *opt,
416                                     const char *str,
417                                     int unset)
418 {
419         int flush_max;
420         struct record_opts *opts = (struct record_opts *)opt->value;
421         static struct parse_tag tags[] = {
422                         { .tag  = 'B', .mult = 1       },
423                         { .tag  = 'K', .mult = 1 << 10 },
424                         { .tag  = 'M', .mult = 1 << 20 },
425                         { .tag  = 'G', .mult = 1 << 30 },
426                         { .tag  = 0 },
427         };
428
429         if (unset)
430                 return 0;
431
432         if (str) {
433                 opts->mmap_flush = parse_tag_value(str, tags);
434                 if (opts->mmap_flush == (int)-1)
435                         opts->mmap_flush = strtol(str, NULL, 0);
436         }
437
438         if (!opts->mmap_flush)
439                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
440
441         flush_max = perf_evlist__mmap_size(opts->mmap_pages);
442         flush_max /= 4;
443         if (opts->mmap_flush > flush_max)
444                 opts->mmap_flush = flush_max;
445
446         return 0;
447 }
448
449 #ifdef HAVE_ZSTD_SUPPORT
450 static unsigned int comp_level_default = 1;
451
452 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
453 {
454         struct record_opts *opts = opt->value;
455
456         if (unset) {
457                 opts->comp_level = 0;
458         } else {
459                 if (str)
460                         opts->comp_level = strtol(str, NULL, 0);
461                 if (!opts->comp_level)
462                         opts->comp_level = comp_level_default;
463         }
464
465         return 0;
466 }
467 #endif
468 static unsigned int comp_level_max = 22;
469
470 static int record__comp_enabled(struct record *rec)
471 {
472         return rec->opts.comp_level > 0;
473 }
474
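/*
 * Tool callback used for all synthesized events: write them straight into
 * the output file.
 */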
475 static int process_synthesized_event(struct perf_tool *tool,
476                                      union perf_event *event,
477                                      struct perf_sample *sample __maybe_unused,
478                                      struct machine *machine __maybe_unused)
479 {
480         struct record *rec = container_of(tool, struct record, tool);
481         return record__write(rec, NULL, event, event->header.size);
482 }
483
484 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
485 {
486         struct record *rec = to;
487
488         if (record__comp_enabled(rec)) {
489                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
490                 bf   = map->data;
491         }
492
493         rec->samples++;
494         return record__write(rec, map, bf, size);
495 }
496
497 static volatile int done;
498 static volatile int signr = -1;
499 static volatile int child_finished;
500
501 static void sig_handler(int sig)
502 {
503         if (sig == SIGCHLD)
504                 child_finished = 1;
505         else
506                 signr = sig;
507
508         done = 1;
509 }
510
511 static void sigsegv_handler(int sig)
512 {
513         perf_hooks__recover();
514         sighandler_dump_stack(sig);
515 }
516
517 static void record__sig_exit(void)
518 {
519         if (signr == -1)
520                 return;
521
522         signal(signr, SIG_DFL);
523         raise(signr);
524 }
525
526 #ifdef HAVE_AUXTRACE_SUPPORT
527
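/*
 * Write one AUX area trace chunk: index it for non-pipe, non-directory
 * output, then emit the auxtrace event followed by up to two data
 * fragments and padding to an 8-byte boundary.
 */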
528 static int record__process_auxtrace(struct perf_tool *tool,
529                                     struct perf_mmap *map,
530                                     union perf_event *event, void *data1,
531                                     size_t len1, void *data2, size_t len2)
532 {
533         struct record *rec = container_of(tool, struct record, tool);
534         struct perf_data *data = &rec->data;
535         size_t padding;
536         u8 pad[8] = {0};
537
538         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
539                 off_t file_offset;
540                 int fd = perf_data__fd(data);
541                 int err;
542
543                 file_offset = lseek(fd, 0, SEEK_CUR);
544                 if (file_offset == -1)
545                         return -1;
546                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
547                                                      event, file_offset);
548                 if (err)
549                         return err;
550         }
551
552         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
553         padding = (len1 + len2) & 7;
554         if (padding)
555                 padding = 8 - padding;
556
557         record__write(rec, map, event, event->header.size);
558         record__write(rec, map, data1, len1);
559         if (len2)
560                 record__write(rec, map, data2, len2);
561         record__write(rec, map, &pad, padding);
562
563         return 0;
564 }
565
566 static int record__auxtrace_mmap_read(struct record *rec,
567                                       struct perf_mmap *map)
568 {
569         int ret;
570
571         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
572                                   record__process_auxtrace);
573         if (ret < 0)
574                 return ret;
575
576         if (ret)
577                 rec->samples++;
578
579         return 0;
580 }
581
582 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
583                                                struct perf_mmap *map)
584 {
585         int ret;
586
587         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
588                                            record__process_auxtrace,
589                                            rec->opts.auxtrace_snapshot_size);
590         if (ret < 0)
591                 return ret;
592
593         if (ret)
594                 rec->samples++;
595
596         return 0;
597 }
598
599 static int record__auxtrace_read_snapshot_all(struct record *rec)
600 {
601         int i;
602         int rc = 0;
603
604         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
605                 struct perf_mmap *map = &rec->evlist->mmap[i];
606
607                 if (!map->auxtrace_mmap.base)
608                         continue;
609
610                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
611                         rc = -1;
612                         goto out;
613                 }
614         }
615 out:
616         return rc;
617 }
618
619 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
620 {
621         pr_debug("Recording AUX area tracing snapshot\n");
622         if (record__auxtrace_read_snapshot_all(rec) < 0) {
623                 trigger_error(&auxtrace_snapshot_trigger);
624         } else {
625                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
626                         trigger_error(&auxtrace_snapshot_trigger);
627                 else
628                         trigger_ready(&auxtrace_snapshot_trigger);
629         }
630 }
631
632 static int record__auxtrace_snapshot_exit(struct record *rec)
633 {
634         if (trigger_is_error(&auxtrace_snapshot_trigger))
635                 return 0;
636
637         if (!auxtrace_record__snapshot_started &&
638             auxtrace_record__snapshot_start(rec->itr))
639                 return -1;
640
641         record__read_auxtrace_snapshot(rec, true);
642         if (trigger_is_error(&auxtrace_snapshot_trigger))
643                 return -1;
644
645         return 0;
646 }
647
648 static int record__auxtrace_init(struct record *rec)
649 {
650         int err;
651
652         if (!rec->itr) {
653                 rec->itr = auxtrace_record__init(rec->evlist, &err);
654                 if (err)
655                         return err;
656         }
657
658         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
659                                               rec->opts.auxtrace_snapshot_opts);
660         if (err)
661                 return err;
662
663         return auxtrace_parse_filters(rec->evlist);
664 }
665
666 #else
667
668 static inline
669 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
670                                struct perf_mmap *map __maybe_unused)
671 {
672         return 0;
673 }
674
675 static inline
676 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
677                                     bool on_exit __maybe_unused)
678 {
679 }
680
681 static inline
682 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
683 {
684         return 0;
685 }
686
687 static inline
688 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
689 {
690         return 0;
691 }
692
693 static int record__auxtrace_init(struct record *rec __maybe_unused)
694 {
695         return 0;
696 }
697
698 #endif
699
700 static int record__mmap_evlist(struct record *rec,
701                                struct evlist *evlist)
702 {
703         struct record_opts *opts = &rec->opts;
704         char msg[512];
705
706         if (opts->affinity != PERF_AFFINITY_SYS)
707                 cpu__setup_cpunode_map();
708
709         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
710                                  opts->auxtrace_mmap_pages,
711                                  opts->auxtrace_snapshot_mode,
712                                  opts->nr_cblocks, opts->affinity,
713                                  opts->mmap_flush, opts->comp_level) < 0) {
714                 if (errno == EPERM) {
715                         pr_err("Permission error mapping pages.\n"
716                                "Consider increasing "
717                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
718                                "or try again with a smaller value of -m/--mmap_pages.\n"
719                                "(current value: %u,%u)\n",
720                                opts->mmap_pages, opts->auxtrace_mmap_pages);
721                         return -errno;
722                 } else {
723                         pr_err("failed to mmap with %d (%s)\n", errno,
724                                 str_error_r(errno, msg, sizeof(msg)));
725                         if (errno)
726                                 return -errno;
727                         else
728                                 return -EINVAL;
729                 }
730         }
731         return 0;
732 }
733
734 static int record__mmap(struct record *rec)
735 {
736         return record__mmap_evlist(rec, rec->evlist);
737 }
738
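/*
 * Open all events in the evlist (adding a dummy tracking event when an
 * initial delay is used), apply event filters and mmap the ring buffers.
 */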
739 static int record__open(struct record *rec)
740 {
741         char msg[BUFSIZ];
742         struct evsel *pos;
743         struct evlist *evlist = rec->evlist;
744         struct perf_session *session = rec->session;
745         struct record_opts *opts = &rec->opts;
746         int rc = 0;
747
748         /*
749          * For initial_delay we need to add a dummy event so that we can track
750          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
751          * real events, the ones asked for by the user.
752          */
753         if (opts->initial_delay) {
754                 if (perf_evlist__add_dummy(evlist))
755                         return -ENOMEM;
756
757                 pos = perf_evlist__first(evlist);
758                 pos->tracking = 0;
759                 pos = perf_evlist__last(evlist);
760                 pos->tracking = 1;
761                 pos->core.attr.enable_on_exec = 1;
762         }
763
764         perf_evlist__config(evlist, opts, &callchain_param);
765
766         evlist__for_each_entry(evlist, pos) {
767 try_again:
768                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
769                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
770                                 if (verbose > 0)
771                                         ui__warning("%s\n", msg);
772                                 goto try_again;
773                         }
774                         if ((errno == EINVAL || errno == EBADF) &&
775                             pos->leader != pos &&
776                             pos->weak_group) {
777                                 pos = perf_evlist__reset_weak_group(evlist, pos);
778                                 goto try_again;
779                         }
780                         rc = -errno;
781                         perf_evsel__open_strerror(pos, &opts->target,
782                                                   errno, msg, sizeof(msg));
783                         ui__error("%s\n", msg);
784                         goto out;
785                 }
786
787                 pos->supported = true;
788         }
789
790         if (perf_evlist__apply_filters(evlist, &pos)) {
791                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
792                         pos->filter, perf_evsel__name(pos), errno,
793                         str_error_r(errno, msg, sizeof(msg)));
794                 rc = -1;
795                 goto out;
796         }
797
798         rc = record__mmap(rec);
799         if (rc)
800                 goto out;
801
802         session->evlist = evlist;
803         perf_session__set_id_hdr_size(session);
804 out:
805         return rc;
806 }
807
808 static int process_sample_event(struct perf_tool *tool,
809                                 union perf_event *event,
810                                 struct perf_sample *sample,
811                                 struct evsel *evsel,
812                                 struct machine *machine)
813 {
814         struct record *rec = container_of(tool, struct record, tool);
815
816         if (rec->evlist->first_sample_time == 0)
817                 rec->evlist->first_sample_time = sample->time;
818
819         rec->evlist->last_sample_time = sample->time;
820
821         if (rec->buildid_all)
822                 return 0;
823
824         rec->samples++;
825         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
826 }
827
828 static int process_buildids(struct record *rec)
829 {
830         struct perf_session *session = rec->session;
831
832         if (perf_data__size(&rec->data) == 0)
833                 return 0;
834
835         /*
836          * During this process, it'll load the kernel map and replace
837          * dso->long_name with the real pathname it found.  In this case
838          * we prefer the vmlinux path like
839          *   /lib/modules/3.16.4/build/vmlinux
840          *
841          * rather than build-id path (in debug directory).
842          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
843          */
844         symbol_conf.ignore_vmlinux_buildid = true;
845
846         /*
847          * If --buildid-all is given, it marks all DSOs regardless of hits,
848          * so there is no need to process samples. But if timestamp_boundary is
849          * enabled, it still needs to walk all samples to get the timestamps of
850          * the first/last samples.
851          */
852         if (rec->buildid_all && !rec->timestamp_boundary)
853                 rec->tool.sample = NULL;
854
855         return perf_session__process_events(session);
856 }
857
858 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
859 {
860         int err;
861         struct perf_tool *tool = data;
862         /*
863          * As for the guest kernel, when processing the record & report
864          * subcommands we arrange the module mmaps prior to the guest kernel
865          * mmap and trigger a dso preload, because by default guest module
866          * symbols are loaded from guest kallsyms instead of
867          * /lib/modules/XXX/XXX. This method avoids missing symbols when the
868          * first addr is in a module instead of in the guest kernel.
869          */
870         err = perf_event__synthesize_modules(tool, process_synthesized_event,
871                                              machine);
872         if (err < 0)
873                 pr_err("Couldn't record guest kernel [%d]'s reference"
874                        " relocation symbol.\n", machine->pid);
875
876         /*
877          * We use _stext for the guest kernel because the guest kernel's
878          * /proc/kallsyms sometimes has no _text.
879          */
880         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
881                                                  machine);
882         if (err < 0)
883                 pr_err("Couldn't record guest kernel [%d]'s reference"
884                        " relocation symbol.\n", machine->pid);
885 }
886
887 static struct perf_event_header finished_round_event = {
888         .size = sizeof(struct perf_event_header),
889         .type = PERF_RECORD_FINISHED_ROUND,
890 };
891
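/*
 * When the affinity mode is not PERF_AFFINITY_SYS, migrate the recording
 * thread to the CPU mask associated with the map that is about to be read.
 */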
892 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
893 {
894         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
895             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
896                 CPU_ZERO(&rec->affinity_mask);
897                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
898                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
899         }
900 }
901
902 static size_t process_comp_header(void *record, size_t increment)
903 {
904         struct perf_record_compressed *event = record;
905         size_t size = sizeof(*event);
906
907         if (increment) {
908                 event->header.size += increment;
909                 return increment;
910         }
911
912         event->header.type = PERF_RECORD_COMPRESSED;
913         event->header.size = size;
914
915         return size;
916 }
917
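/*
 * Compress src into dst as one or more PERF_RECORD_COMPRESSED records and
 * update the session-wide transferred/compressed byte counters.
 */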
918 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
919                             void *src, size_t src_size)
920 {
921         size_t compressed;
922         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
923
924         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
925                                                      max_record_size, process_comp_header);
926
927         session->bytes_transferred += src_size;
928         session->bytes_compressed  += compressed;
929
930         return compressed;
931 }
932
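/*
 * Drain the per-CPU ring buffers of the evlist (regular or overwrite maps),
 * optionally forcing a full flush when synch is set, and append a
 * PERF_RECORD_FINISHED_ROUND event if anything was written.
 */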
933 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
934                                     bool overwrite, bool synch)
935 {
936         u64 bytes_written = rec->bytes_written;
937         int i;
938         int rc = 0;
939         struct perf_mmap *maps;
940         int trace_fd = rec->data.file.fd;
941         off_t off = 0;
942
943         if (!evlist)
944                 return 0;
945
946         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
947         if (!maps)
948                 return 0;
949
950         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
951                 return 0;
952
953         if (record__aio_enabled(rec))
954                 off = record__aio_get_pos(trace_fd);
955
956         for (i = 0; i < evlist->nr_mmaps; i++) {
957                 u64 flush = 0;
958                 struct perf_mmap *map = &maps[i];
959
960                 if (map->base) {
961                         record__adjust_affinity(rec, map);
962                         if (synch) {
963                                 flush = map->flush;
964                                 map->flush = 1;
965                         }
966                         if (!record__aio_enabled(rec)) {
967                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
968                                         if (synch)
969                                                 map->flush = flush;
970                                         rc = -1;
971                                         goto out;
972                                 }
973                         } else {
974                                 if (record__aio_push(rec, map, &off) < 0) {
975                                         record__aio_set_pos(trace_fd, off);
976                                         if (synch)
977                                                 map->flush = flush;
978                                         rc = -1;
979                                         goto out;
980                                 }
981                         }
982                         if (synch)
983                                 map->flush = flush;
984                 }
985
986                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
987                     record__auxtrace_mmap_read(rec, map) != 0) {
988                         rc = -1;
989                         goto out;
990                 }
991         }
992
993         if (record__aio_enabled(rec))
994                 record__aio_set_pos(trace_fd, off);
995
996         /*
997          * Mark the round finished in case we wrote
998          * at least one event.
999          */
1000         if (bytes_written != rec->bytes_written)
1001                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1002
1003         if (overwrite)
1004                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1005 out:
1006         return rc;
1007 }
1008
1009 static int record__mmap_read_all(struct record *rec, bool synch)
1010 {
1011         int err;
1012
1013         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1014         if (err)
1015                 return err;
1016
1017         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1018 }
1019
1020 static void record__init_features(struct record *rec)
1021 {
1022         struct perf_session *session = rec->session;
1023         int feat;
1024
1025         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1026                 perf_header__set_feat(&session->header, feat);
1027
1028         if (rec->no_buildid)
1029                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1030
1031         if (!have_tracepoints(&rec->evlist->core.entries))
1032                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1033
1034         if (!rec->opts.branch_stack)
1035                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1036
1037         if (!rec->opts.full_auxtrace)
1038                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1039
1040         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1041                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1042
1043         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1044         if (!record__comp_enabled(rec))
1045                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1046
1047         perf_header__clear_feat(&session->header, HEADER_STAT);
1048 }
1049
1050 static void
1051 record__finish_output(struct record *rec)
1052 {
1053         struct perf_data *data = &rec->data;
1054         int fd = perf_data__fd(data);
1055
1056         if (data->is_pipe)
1057                 return;
1058
1059         rec->session->header.data_size += rec->bytes_written;
1060         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1061
1062         if (!rec->no_buildid) {
1063                 process_buildids(rec);
1064
1065                 if (rec->buildid_all)
1066                         dsos__hit_all(rec->session);
1067         }
1068         perf_session__write_header(rec->session, rec->evlist, fd, true);
1069
1070         return;
1071 }
1072
1073 static int record__synthesize_workload(struct record *rec, bool tail)
1074 {
1075         int err;
1076         struct perf_thread_map *thread_map;
1077
1078         if (rec->opts.tail_synthesize != tail)
1079                 return 0;
1080
1081         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1082         if (thread_map == NULL)
1083                 return -1;
1084
1085         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1086                                                  process_synthesized_event,
1087                                                  &rec->session->machines.host,
1088                                                  rec->opts.sample_address);
1089         perf_thread_map__put(thread_map);
1090         return err;
1091 }
1092
1093 static int record__synthesize(struct record *rec, bool tail);
1094
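/*
 * Finish the current output file and switch to a new, timestamped one.
 * Unless called at exit, tracking events are synthesized again into the new
 * file; when a limited number of switch-output files is kept, the oldest
 * one is removed round-robin.
 */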
1095 static int
1096 record__switch_output(struct record *rec, bool at_exit)
1097 {
1098         struct perf_data *data = &rec->data;
1099         int fd, err;
1100         char *new_filename;
1101
1102         /* Same Size:      "2015122520103046"*/
1103         char timestamp[] = "InvalidTimestamp";
1104
1105         record__aio_mmap_read_sync(rec);
1106
1107         record__synthesize(rec, true);
1108         if (target__none(&rec->opts.target))
1109                 record__synthesize_workload(rec, true);
1110
1111         rec->samples = 0;
1112         record__finish_output(rec);
1113         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1114         if (err) {
1115                 pr_err("Failed to get current timestamp\n");
1116                 return -EINVAL;
1117         }
1118
1119         fd = perf_data__switch(data, timestamp,
1120                                     rec->session->header.data_offset,
1121                                     at_exit, &new_filename);
1122         if (fd >= 0 && !at_exit) {
1123                 rec->bytes_written = 0;
1124                 rec->session->header.data_size = 0;
1125         }
1126
1127         if (!quiet)
1128                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1129                         data->path, timestamp);
1130
1131         if (rec->switch_output.num_files) {
1132                 int n = rec->switch_output.cur_file + 1;
1133
1134                 if (n >= rec->switch_output.num_files)
1135                         n = 0;
1136                 rec->switch_output.cur_file = n;
1137                 if (rec->switch_output.filenames[n]) {
1138                         remove(rec->switch_output.filenames[n]);
1139                         zfree(&rec->switch_output.filenames[n]);
1140                 }
1141                 rec->switch_output.filenames[n] = new_filename;
1142         } else {
1143                 free(new_filename);
1144         }
1145
1146         /* Output tracking events */
1147         if (!at_exit) {
1148                 record__synthesize(rec, false);
1149
1150                 /*
1151                  * In 'perf record --switch-output' without -a,
1152                  * record__synthesize() in record__switch_output() won't
1153                  * generate tracking events because there's no thread_map
1154                  * in the evlist, which causes the newly created perf.data to
1155                  * lack map and comm information.
1156                  * Create a fake thread_map and directly call
1157                  * perf_event__synthesize_thread_map() for those events.
1158                  */
1159                 if (target__none(&rec->opts.target))
1160                         record__synthesize_workload(rec, false);
1161         }
1162         return fd;
1163 }
1164
1165 static volatile int workload_exec_errno;
1166
1167 /*
1168  * perf_evlist__prepare_workload will send a SIGUSR1
1169  * if the fork fails, since we asked for it by setting its
1170  * want_signal to true.
1171  */
1172 static void workload_exec_failed_signal(int signo __maybe_unused,
1173                                         siginfo_t *info,
1174                                         void *ucontext __maybe_unused)
1175 {
1176         workload_exec_errno = info->si_value.sival_int;
1177         done = 1;
1178         child_finished = 1;
1179 }
1180
1181 static void snapshot_sig_handler(int sig);
1182 static void alarm_sig_handler(int sig);
1183
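/*
 * Pick any mmap'd perf_event control page; record__synthesize() passes it
 * to perf_event__synth_time_conv() to emit timestamp conversion data.
 */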
1184 static const struct perf_event_mmap_page *
1185 perf_evlist__pick_pc(struct evlist *evlist)
1186 {
1187         if (evlist) {
1188                 if (evlist->mmap && evlist->mmap[0].base)
1189                         return evlist->mmap[0].base;
1190                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1191                         return evlist->overwrite_mmap[0].base;
1192         }
1193         return NULL;
1194 }
1195
1196 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1197 {
1198         const struct perf_event_mmap_page *pc;
1199
1200         pc = perf_evlist__pick_pc(rec->evlist);
1201         if (pc)
1202                 return pc;
1203         return NULL;
1204 }
1205
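/*
 * Emit the synthetic events describing the current system state: attrs,
 * features and tracing data for pipe output, time conversion and auxtrace
 * info, kernel and module mmaps, extra attrs, thread and cpu maps, BPF
 * events and the threads of the recording target.
 */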
1206 static int record__synthesize(struct record *rec, bool tail)
1207 {
1208         struct perf_session *session = rec->session;
1209         struct machine *machine = &session->machines.host;
1210         struct perf_data *data = &rec->data;
1211         struct record_opts *opts = &rec->opts;
1212         struct perf_tool *tool = &rec->tool;
1213         int fd = perf_data__fd(data);
1214         int err = 0;
1215
1216         if (rec->opts.tail_synthesize != tail)
1217                 return 0;
1218
1219         if (data->is_pipe) {
1220                 /*
1221                  * We need to synthesize events first, because some
1222                  * features work on top of them (on the report side).
1223                  */
1224                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1225                                                    process_synthesized_event);
1226                 if (err < 0) {
1227                         pr_err("Couldn't synthesize attrs.\n");
1228                         goto out;
1229                 }
1230
1231                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1232                                                       process_synthesized_event);
1233                 if (err < 0) {
1234                         pr_err("Couldn't synthesize features.\n");
1235                         return err;
1236                 }
1237
1238                 if (have_tracepoints(&rec->evlist->core.entries)) {
1239                         /*
1240                          * FIXME err <= 0 here actually means that
1241                          * there were no tracepoints, so it's not really
1242                          * an error, just that we don't need to
1243                          * synthesize anything.  We really have to
1244                          * return this more properly and also
1245                          * propagate errors that currently call die().
1246                          */
1247                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1248                                                                   process_synthesized_event);
1249                         if (err <= 0) {
1250                                 pr_err("Couldn't record tracing data.\n");
1251                                 goto out;
1252                         }
1253                         rec->bytes_written += err;
1254                 }
1255         }
1256
1257         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1258                                           process_synthesized_event, machine);
1259         if (err)
1260                 goto out;
1261
1262         if (rec->opts.full_auxtrace) {
1263                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1264                                         session, process_synthesized_event);
1265                 if (err)
1266                         goto out;
1267         }
1268
1269         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1270                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1271                                                          machine);
1272                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1273                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1274                                    "Check /proc/kallsyms permission or run as root.\n");
1275
1276                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1277                                                      machine);
1278                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1279                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1280                                    "Check /proc/modules permission or run as root.\n");
1281         }
1282
1283         if (perf_guest) {
1284                 machines__process_guests(&session->machines,
1285                                          perf_event__synthesize_guest_os, tool);
1286         }
1287
1288         err = perf_event__synthesize_extra_attr(&rec->tool,
1289                                                 rec->evlist,
1290                                                 process_synthesized_event,
1291                                                 data->is_pipe);
1292         if (err)
1293                 goto out;
1294
1295         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1296                                                  process_synthesized_event,
1297                                                 NULL);
1298         if (err < 0) {
1299                 pr_err("Couldn't synthesize thread map.\n");
1300                 return err;
1301         }
1302
1303         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1304                                              process_synthesized_event, NULL);
1305         if (err < 0) {
1306                 pr_err("Couldn't synthesize cpu map.\n");
1307                 return err;
1308         }
1309
1310         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1311                                                 machine, opts);
1312         if (err < 0)
1313                 pr_warning("Couldn't synthesize bpf events.\n");
1314
1315         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1316                                             process_synthesized_event, opts->sample_address,
1317                                             1);
1318 out:
1319         return err;
1320 }
1321
1322 static int __cmd_record(struct record *rec, int argc, const char **argv)
1323 {
1324         int err;
1325         int status = 0;
1326         unsigned long waking = 0;
1327         const bool forks = argc > 0;
1328         struct perf_tool *tool = &rec->tool;
1329         struct record_opts *opts = &rec->opts;
1330         struct perf_data *data = &rec->data;
1331         struct perf_session *session;
1332         bool disabled = false, draining = false;
1333         struct evlist *sb_evlist = NULL;
1334         int fd;
1335         float ratio = 0;
1336
1337         atexit(record__sig_exit);
1338         signal(SIGCHLD, sig_handler);
1339         signal(SIGINT, sig_handler);
1340         signal(SIGTERM, sig_handler);
1341         signal(SIGSEGV, sigsegv_handler);
1342
1343         if (rec->opts.record_namespaces)
1344                 tool->namespace_events = true;
1345
1346         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1347                 signal(SIGUSR2, snapshot_sig_handler);
1348                 if (rec->opts.auxtrace_snapshot_mode)
1349                         trigger_on(&auxtrace_snapshot_trigger);
1350                 if (rec->switch_output.enabled)
1351                         trigger_on(&switch_output_trigger);
1352         } else {
1353                 signal(SIGUSR2, SIG_IGN);
1354         }
1355
1356         session = perf_session__new(data, false, tool);
1357         if (session == NULL) {
1358                 pr_err("Perf session creation failed.\n");
1359                 return -1;
1360         }
1361
1362         fd = perf_data__fd(data);
1363         rec->session = session;
1364
1365         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1366                 pr_err("Compression initialization failed.\n");
1367                 return -1;
1368         }
1369
1370         session->header.env.comp_type  = PERF_COMP_ZSTD;
1371         session->header.env.comp_level = rec->opts.comp_level;
1372
1373         record__init_features(rec);
1374
1375         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1376                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1377
1378         if (forks) {
1379                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1380                                                     argv, data->is_pipe,
1381                                                     workload_exec_failed_signal);
1382                 if (err < 0) {
1383                         pr_err("Couldn't run the workload!\n");
1384                         status = err;
1385                         goto out_delete_session;
1386                 }
1387         }
1388
1389         /*
1390          * If we have just a single event and are sending data
1391          * through a pipe, we need to force id allocation,
1392          * because we synthesize the event name through the pipe
1393          * and need the id for that.
1394          */
1395         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1396                 rec->opts.sample_id = true;
1397
1398         if (record__open(rec) != 0) {
1399                 err = -1;
1400                 goto out_child;
1401         }
1402         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1403
1404         err = bpf__apply_obj_config();
1405         if (err) {
1406                 char errbuf[BUFSIZ];
1407
1408                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1409                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1410                          errbuf);
1411                 goto out_child;
1412         }
1413
1414         /*
1415          * Normally perf_session__new would do this, but it doesn't have the
1416          * evlist.
1417          */
1418         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1419                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1420                 rec->tool.ordered_events = false;
1421         }
1422
1423         if (!rec->evlist->nr_groups)
1424                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1425
1426         if (data->is_pipe) {
1427                 err = perf_header__write_pipe(fd);
1428                 if (err < 0)
1429                         goto out_child;
1430         } else {
1431                 err = perf_session__write_header(session, rec->evlist, fd, false);
1432                 if (err < 0)
1433                         goto out_child;
1434         }
1435
1436         if (!rec->no_buildid
1437             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1438                 pr_err("Couldn't generate buildids. "
1439                        "Use --no-buildid to profile anyway.\n");
1440                 err = -1;
1441                 goto out_child;
1442         }
1443
1444         if (!opts->no_bpf_event)
1445                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1446
1447         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1448                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1449                 opts->no_bpf_event = true;
1450         }
1451
1452         err = record__synthesize(rec, false);
1453         if (err < 0)
1454                 goto out_child;
1455
1456         if (rec->realtime_prio) {
1457                 struct sched_param param;
1458
1459                 param.sched_priority = rec->realtime_prio;
1460                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1461                         pr_err("Could not set realtime priority.\n");
1462                         err = -1;
1463                         goto out_child;
1464                 }
1465         }
1466
1467         /*
1468          * When perf is starting the traced process, all the events
1469          * (apart from group members) have enable_on_exec=1 set,
1470          * so don't spoil it by prematurely enabling them.
1471          */
1472         if (!target__none(&opts->target) && !opts->initial_delay)
1473                 evlist__enable(rec->evlist);
1474
1475         /*
1476          * Let the child rip
1477          */
1478         if (forks) {
1479                 struct machine *machine = &session->machines.host;
1480                 union perf_event *event;
1481                 pid_t tgid;
1482
1483                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1484                 if (event == NULL) {
1485                         err = -ENOMEM;
1486                         goto out_child;
1487                 }
1488
1489                 /*
1490                  * Some H/W events are generated before the COMM event,
1491                  * which is emitted during exec(), so perf script
1492                  * cannot see a correct process name for those events.
1493                  * Synthesize a COMM event to prevent that.
1494                  */
1495                 tgid = perf_event__synthesize_comm(tool, event,
1496                                                    rec->evlist->workload.pid,
1497                                                    process_synthesized_event,
1498                                                    machine);
1499                 free(event);
1500
1501                 if (tgid == -1)
1502                         goto out_child;
1503
1504                 event = malloc(sizeof(event->namespaces) +
1505                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1506                                machine->id_hdr_size);
1507                 if (event == NULL) {
1508                         err = -ENOMEM;
1509                         goto out_child;
1510                 }
1511
1512                 /*
1513                  * Synthesize NAMESPACES event for the command specified.
1514                  */
1515                 perf_event__synthesize_namespaces(tool, event,
1516                                                   rec->evlist->workload.pid,
1517                                                   tgid, process_synthesized_event,
1518                                                   machine);
1519                 free(event);
1520
1521                 perf_evlist__start_workload(rec->evlist);
1522         }
1523
1524         if (opts->initial_delay) {
1525                 usleep(opts->initial_delay * USEC_PER_MSEC);
1526                 evlist__enable(rec->evlist);
1527         }
1528
1529         trigger_ready(&auxtrace_snapshot_trigger);
1530         trigger_ready(&switch_output_trigger);
1531         perf_hooks__invoke_record_start();
1532         for (;;) {
1533                 unsigned long long hits = rec->samples;
1534
1535                 /*
1536                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1537                  * here: when done == true and hits != rec->samples
1538                  * in the previous round.
1539                  *
1540                  * perf_evlist__toggle_bkw_mmap() ensures we never
1541                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1542                  */
1543                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1544                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1545
1546                 if (record__mmap_read_all(rec, false) < 0) {
1547                         trigger_error(&auxtrace_snapshot_trigger);
1548                         trigger_error(&switch_output_trigger);
1549                         err = -1;
1550                         goto out_child;
1551                 }
1552
1553                 if (auxtrace_record__snapshot_started) {
1554                         auxtrace_record__snapshot_started = 0;
1555                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1556                                 record__read_auxtrace_snapshot(rec, false);
1557                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1558                                 pr_err("AUX area tracing snapshot failed\n");
1559                                 err = -1;
1560                                 goto out_child;
1561                         }
1562                 }
1563
1564                 if (trigger_is_hit(&switch_output_trigger)) {
1565                         /*
1566                          * If switch_output_trigger is hit, the data in the
1567                          * overwritable ring buffer should have been collected,
1568                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1569                          *
1570                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1571                          * record__mmap_read_all() didn't collect data from the
1572                          * overwritable ring buffer. Read again.
1573                          */
1574                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1575                                 continue;
1576                         trigger_ready(&switch_output_trigger);
1577
1578                         /*
1579                          * Reenable events in overwrite ring buffer after
1580                          * record__mmap_read_all(): we should have collected
1581                          * data from it.
1582                          */
1583                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1584
1585                         if (!quiet)
1586                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1587                                         waking);
1588                         waking = 0;
1589                         fd = record__switch_output(rec, false);
1590                         if (fd < 0) {
1591                                 pr_err("Failed to switch to new file\n");
1592                                 trigger_error(&switch_output_trigger);
1593                                 err = fd;
1594                                 goto out_child;
1595                         }
1596
1597                         /* re-arm the alarm */
1598                         if (rec->switch_output.time)
1599                                 alarm(rec->switch_output.time);
1600                 }
1601
1602                 if (hits == rec->samples) {
1603                         if (done || draining)
1604                                 break;
1605                         err = perf_evlist__poll(rec->evlist, -1);
1606                         /*
1607                          * Propagate the error only if there is one. Ignore a positive
1608                          * number of returned events and interrupt errors.
1609                          */
1610                         if (err > 0 || (err < 0 && errno == EINTR))
1611                                 err = 0;
1612                         waking++;
1613
1614                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1615                                 draining = true;
1616                 }
1617
1618                 /*
1619                  * When perf is starting the traced process, the events die
1620                  * with the process at the end and we wait for that, so there
1621                  * is no need to disable them in this case.
1622                  */
1623                 if (done && !disabled && !target__none(&opts->target)) {
1624                         trigger_off(&auxtrace_snapshot_trigger);
1625                         evlist__disable(rec->evlist);
1626                         disabled = true;
1627                 }
1628         }
1629
1630         trigger_off(&auxtrace_snapshot_trigger);
1631         trigger_off(&switch_output_trigger);
1632
1633         if (opts->auxtrace_snapshot_on_exit)
1634                 record__auxtrace_snapshot_exit(rec);
1635
1636         if (forks && workload_exec_errno) {
1637                 char msg[STRERR_BUFSIZE];
1638                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1639                 pr_err("Workload failed: %s\n", emsg);
1640                 err = -1;
1641                 goto out_child;
1642         }
1643
1644         if (!quiet)
1645                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1646
1647         if (target__none(&rec->opts.target))
1648                 record__synthesize_workload(rec, true);
1649
1650 out_child:
1651         record__mmap_read_all(rec, true);
1652         record__aio_mmap_read_sync(rec);
1653
1654         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1655                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1656                 session->header.env.comp_ratio = ratio + 0.5;
1657         }
1658
1659         if (forks) {
1660                 int exit_status;
1661
1662                 if (!child_finished)
1663                         kill(rec->evlist->workload.pid, SIGTERM);
1664
1665                 wait(&exit_status);
1666
1667                 if (err < 0)
1668                         status = err;
1669                 else if (WIFEXITED(exit_status))
1670                         status = WEXITSTATUS(exit_status);
1671                 else if (WIFSIGNALED(exit_status))
1672                         signr = WTERMSIG(exit_status);
1673         } else
1674                 status = err;
1675
1676         record__synthesize(rec, true);
1677         /* this will be recalculated during process_buildids() */
1678         rec->samples = 0;
1679
1680         if (!err) {
1681                 if (!rec->timestamp_filename) {
1682                         record__finish_output(rec);
1683                 } else {
1684                         fd = record__switch_output(rec, true);
1685                         if (fd < 0) {
1686                                 status = fd;
1687                                 goto out_delete_session;
1688                         }
1689                 }
1690         }
1691
1692         perf_hooks__invoke_record_end();
1693
1694         if (!err && !quiet) {
1695                 char samples[128];
1696                 const char *postfix = rec->timestamp_filename ?
1697                                         ".<timestamp>" : "";
1698
1699                 if (rec->samples && !rec->opts.full_auxtrace)
1700                         scnprintf(samples, sizeof(samples),
1701                                   " (%" PRIu64 " samples)", rec->samples);
1702                 else
1703                         samples[0] = '\0';
1704
1705                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1706                         perf_data__size(data) / 1024.0 / 1024.0,
1707                         data->path, postfix, samples);
1708                 if (ratio) {
1709                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1710                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1711                                         ratio);
1712                 }
1713                 fprintf(stderr, " ]\n");
1714         }
1715
1716 out_delete_session:
1717         zstd_fini(&session->zstd_data);
1718         perf_session__delete(session);
1719
1720         if (!opts->no_bpf_event)
1721                 perf_evlist__stop_sb_thread(sb_evlist);
1722         return status;
1723 }
1724
1725 static void callchain_debug(struct callchain_param *callchain)
1726 {
1727         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1728
1729         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1730
1731         if (callchain->record_mode == CALLCHAIN_DWARF)
1732                 pr_debug("callchain: stack dump size %d\n",
1733                          callchain->dump_size);
1734 }
1735
1736 int record_opts__parse_callchain(struct record_opts *record,
1737                                  struct callchain_param *callchain,
1738                                  const char *arg, bool unset)
1739 {
1740         int ret;
1741         callchain->enabled = !unset;
1742
1743         /* --no-call-graph */
1744         if (unset) {
1745                 callchain->record_mode = CALLCHAIN_NONE;
1746                 pr_debug("callchain: disabled\n");
1747                 return 0;
1748         }
1749
1750         ret = parse_callchain_record_opt(arg, callchain);
1751         if (!ret) {
1752                 /* Enable data address sampling for DWARF unwind. */
1753                 if (callchain->record_mode == CALLCHAIN_DWARF)
1754                         record->sample_address = true;
1755                 callchain_debug(callchain);
1756         }
1757
1758         return ret;
1759 }
1760
1761 int record_parse_callchain_opt(const struct option *opt,
1762                                const char *arg,
1763                                int unset)
1764 {
1765         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1766 }
1767
1768 int record_callchain_opt(const struct option *opt,
1769                          const char *arg __maybe_unused,
1770                          int unset __maybe_unused)
1771 {
1772         struct callchain_param *callchain = opt->value;
1773
1774         callchain->enabled = true;
1775
1776         if (callchain->record_mode == CALLCHAIN_NONE)
1777                 callchain->record_mode = CALLCHAIN_FP;
1778
1779         callchain_debug(callchain);
1780         return 0;
1781 }
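/*
 * Illustrative usage of the two callchain options wired up below (a sketch,
 * not part of the upstream source): '-g' just enables call-graph recording
 * with the FP default, while '--call-graph <mode>[,<size>]' picks the unwind
 * method explicitly, e.g.:
 *
 *   perf record -g -- ./workload
 *   perf record --call-graph dwarf,8192 -- ./workload
 *
 * Selecting dwarf also turns on sample_address, see
 * record_opts__parse_callchain() above.
 */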
1782
1783 static int perf_record_config(const char *var, const char *value, void *cb)
1784 {
1785         struct record *rec = cb;
1786
1787         if (!strcmp(var, "record.build-id")) {
1788                 if (!strcmp(value, "cache"))
1789                         rec->no_buildid_cache = false;
1790                 else if (!strcmp(value, "no-cache"))
1791                         rec->no_buildid_cache = true;
1792                 else if (!strcmp(value, "skip"))
1793                         rec->no_buildid = true;
1794                 else
1795                         return -1;
1796                 return 0;
1797         }
1798         if (!strcmp(var, "record.call-graph")) {
1799                 var = "call-graph.record-mode";
1800                 return perf_default_config(var, value, cb);
1801         }
1802 #ifdef HAVE_AIO_SUPPORT
1803         if (!strcmp(var, "record.aio")) {
1804                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1805                 if (!rec->opts.nr_cblocks)
1806                         rec->opts.nr_cblocks = nr_cblocks_default;
1807         }
1808 #endif
1809
1810         return 0;
1811 }
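/*
 * Illustrative ~/.perfconfig snippet handled by perf_record_config() above
 * (a sketch, not part of the upstream source; keys follow the "record.*"
 * strcmp()s in that function):
 *
 *   [record]
 *           build-id = cache        # or: no-cache, skip
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 2                 # only honoured with HAVE_AIO_SUPPORT
 */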
1812
1813 struct clockid_map {
1814         const char *name;
1815         int clockid;
1816 };
1817
1818 #define CLOCKID_MAP(n, c)       \
1819         { .name = n, .clockid = (c), }
1820
1821 #define CLOCKID_END     { .name = NULL, }
1822
1823
1824 /*
1825  * Add the missing ones; we need to build on many distros...
1826  */
1827 #ifndef CLOCK_MONOTONIC_RAW
1828 #define CLOCK_MONOTONIC_RAW 4
1829 #endif
1830 #ifndef CLOCK_BOOTTIME
1831 #define CLOCK_BOOTTIME 7
1832 #endif
1833 #ifndef CLOCK_TAI
1834 #define CLOCK_TAI 11
1835 #endif
1836
1837 static const struct clockid_map clockids[] = {
1838         /* available for all events, NMI safe */
1839         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1840         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1841
1842         /* available for some events */
1843         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1844         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1845         CLOCKID_MAP("tai", CLOCK_TAI),
1846
1847         /* available for the lazy */
1848         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1849         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1850         CLOCKID_MAP("real", CLOCK_REALTIME),
1851         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1852
1853         CLOCKID_END,
1854 };
1855
1856 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1857 {
1858         struct timespec res;
1859
1860         *res_ns = 0;
1861         if (!clock_getres(clk_id, &res))
1862                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1863         else
1864                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1865
1866         return 0;
1867 }
1868
1869 static int parse_clockid(const struct option *opt, const char *str, int unset)
1870 {
1871         struct record_opts *opts = (struct record_opts *)opt->value;
1872         const struct clockid_map *cm;
1873         const char *ostr = str;
1874
1875         if (unset) {
1876                 opts->use_clockid = 0;
1877                 return 0;
1878         }
1879
1880         /* no arg passed */
1881         if (!str)
1882                 return 0;
1883
1884         /* no setting it twice */
1885         if (opts->use_clockid)
1886                 return -1;
1887
1888         opts->use_clockid = true;
1889
1890         /* if it's a number, we're done */
1891         if (sscanf(str, "%d", &opts->clockid) == 1)
1892                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1893
1894         /* allow a "CLOCK_" prefix to the name */
1895         if (!strncasecmp(str, "CLOCK_", 6))
1896                 str += 6;
1897
1898         for (cm = clockids; cm->name; cm++) {
1899                 if (!strcasecmp(str, cm->name)) {
1900                         opts->clockid = cm->clockid;
1901                         return get_clockid_res(opts->clockid,
1902                                                &opts->clockid_res_ns);
1903                 }
1904         }
1905
1906         opts->use_clockid = false;
1907         ui__warning("unknown clockid %s, check man page\n", ostr);
1908         return -1;
1909 }
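/*
 * Illustrative -k/--clockid values accepted by parse_clockid() above (a
 * sketch, not part of the upstream source): any clockids[] name, with or
 * without a "CLOCK_" prefix, or a raw numeric clockid, e.g.:
 *
 *   perf record -k monotonic_raw -- sleep 1
 *   perf record --clockid CLOCK_BOOTTIME -a sleep 1
 */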
1910
1911 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1912 {
1913         struct record_opts *opts = (struct record_opts *)opt->value;
1914
1915         if (unset || !str)
1916                 return 0;
1917
1918         if (!strcasecmp(str, "node"))
1919                 opts->affinity = PERF_AFFINITY_NODE;
1920         else if (!strcasecmp(str, "cpu"))
1921                 opts->affinity = PERF_AFFINITY_CPU;
1922
1923         return 0;
1924 }
1925
1926 static int record__parse_mmap_pages(const struct option *opt,
1927                                     const char *str,
1928                                     int unset __maybe_unused)
1929 {
1930         struct record_opts *opts = opt->value;
1931         char *s, *p;
1932         unsigned int mmap_pages;
1933         int ret;
1934
1935         if (!str)
1936                 return -EINVAL;
1937
1938         s = strdup(str);
1939         if (!s)
1940                 return -ENOMEM;
1941
1942         p = strchr(s, ',');
1943         if (p)
1944                 *p = '\0';
1945
1946         if (*s) {
1947                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1948                 if (ret)
1949                         goto out_free;
1950                 opts->mmap_pages = mmap_pages;
1951         }
1952
1953         if (!p) {
1954                 ret = 0;
1955                 goto out_free;
1956         }
1957
1958         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1959         if (ret)
1960                 goto out_free;
1961
1962         opts->auxtrace_mmap_pages = mmap_pages;
1963
1964 out_free:
1965         free(s);
1966         return ret;
1967 }
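/*
 * Illustrative -m/--mmap-pages arguments parsed above (a sketch, not part
 * of the upstream source): the value before the comma sizes the data mmaps,
 * the value after it the AUX area tracing mmaps; each part is handed to
 * __perf_evlist__parse_mmap_pages(), which is assumed here to take a page
 * count or a size with a B/K/M/G suffix, e.g.:
 *
 *   perf record -m 512 ...        # 512 data pages
 *   perf record -m 16M,64K ...    # data size, AUX area size
 *   perf record -m ,128 ...       # AUX area pages only
 */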
1968
1969 static void switch_output_size_warn(struct record *rec)
1970 {
1971         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1972         struct switch_output *s = &rec->switch_output;
1973
1974         wakeup_size /= 2;
1975
1976         if (s->size < wakeup_size) {
1977                 char buf[100];
1978
1979                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1980                 pr_warning("WARNING: switch-output data size lower than "
1981                            "wakeup kernel buffer size (%s), "
1982                            "expect bigger perf.data sizes\n", buf);
1983         }
1984 }
1985
1986 static int switch_output_setup(struct record *rec)
1987 {
1988         struct switch_output *s = &rec->switch_output;
1989         static struct parse_tag tags_size[] = {
1990                 { .tag  = 'B', .mult = 1       },
1991                 { .tag  = 'K', .mult = 1 << 10 },
1992                 { .tag  = 'M', .mult = 1 << 20 },
1993                 { .tag  = 'G', .mult = 1 << 30 },
1994                 { .tag  = 0 },
1995         };
1996         static struct parse_tag tags_time[] = {
1997                 { .tag  = 's', .mult = 1        },
1998                 { .tag  = 'm', .mult = 60       },
1999                 { .tag  = 'h', .mult = 60*60    },
2000                 { .tag  = 'd', .mult = 60*60*24 },
2001                 { .tag  = 0 },
2002         };
2003         unsigned long val;
2004
2005         if (!s->set)
2006                 return 0;
2007
2008         if (!strcmp(s->str, "signal")) {
2009                 s->signal = true;
2010                 pr_debug("switch-output with SIGUSR2 signal\n");
2011                 goto enabled;
2012         }
2013
2014         val = parse_tag_value(s->str, tags_size);
2015         if (val != (unsigned long) -1) {
2016                 s->size = val;
2017                 pr_debug("switch-output with %s size threshold\n", s->str);
2018                 goto enabled;
2019         }
2020
2021         val = parse_tag_value(s->str, tags_time);
2022         if (val != (unsigned long) -1) {
2023                 s->time = val;
2024                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2025                          s->str, s->time);
2026                 goto enabled;
2027         }
2028
2029         return -1;
2030
2031 enabled:
2032         rec->timestamp_filename = true;
2033         s->enabled              = true;
2034
2035         if (s->size && !rec->opts.no_buffering)
2036                 switch_output_size_warn(rec);
2037
2038         return 0;
2039 }
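/*
 * Illustrative --switch-output arguments handled by switch_output_setup()
 * above (a sketch, not part of the upstream source; suffixes per
 * tags_size[]/tags_time[]):
 *
 *   perf record --switch-output ...          # rotate on SIGUSR2 (default)
 *   perf record --switch-output=100M ...     # rotate past a size threshold
 *   perf record --switch-output=30s ...      # rotate on a time threshold
 */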
2040
2041 static const char * const __record_usage[] = {
2042         "perf record [<options>] [<command>]",
2043         "perf record [<options>] -- <command> [<options>]",
2044         NULL
2045 };
2046 const char * const *record_usage = __record_usage;
2047
2048 /*
2049  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2050  * because we need to have access to it in record__exit(), which is called
2051  * after cmd_record() exits, but since record_options needs to be accessible to
2052  * builtin-script, leave it here.
2053  *
2054  * At least we don't touch it in all the other functions here directly.
2055  *
2056  * Just say no to tons of global variables, sigh.
2057  */
2058 static struct record record = {
2059         .opts = {
2060                 .sample_time         = true,
2061                 .mmap_pages          = UINT_MAX,
2062                 .user_freq           = UINT_MAX,
2063                 .user_interval       = ULLONG_MAX,
2064                 .freq                = 4000,
2065                 .target              = {
2066                         .uses_mmap   = true,
2067                         .default_per_cpu = true,
2068                 },
2069                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2070         },
2071         .tool = {
2072                 .sample         = process_sample_event,
2073                 .fork           = perf_event__process_fork,
2074                 .exit           = perf_event__process_exit,
2075                 .comm           = perf_event__process_comm,
2076                 .namespaces     = perf_event__process_namespaces,
2077                 .mmap           = perf_event__process_mmap,
2078                 .mmap2          = perf_event__process_mmap2,
2079                 .ordered_events = true,
2080         },
2081 };
2082
2083 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2084         "\n\t\t\t\tDefault: fp";
2085
2086 static bool dry_run;
2087
2088 /*
2089  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2090  * with it and switch to using the library functions in perf_evlist that came
2091  * from builtin-record.c, i.e. use record_opts,
2092  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2093  * using pipes, etc.
2094  */
2095 static struct option __record_options[] = {
2096         OPT_CALLBACK('e', "event", &record.evlist, "event",
2097                      "event selector. use 'perf list' to list available events",
2098                      parse_events_option),
2099         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2100                      "event filter", parse_filter),
2101         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2102                            NULL, "don't record events from perf itself",
2103                            exclude_perf),
2104         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2105                     "record events on existing process id"),
2106         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2107                     "record events on existing thread id"),
2108         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2109                     "collect data with this RT SCHED_FIFO priority"),
2110         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2111                     "collect data without buffering"),
2112         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2113                     "collect raw sample records from all opened counters"),
2114         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2115                             "system-wide collection from all CPUs"),
2116         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2117                     "list of cpus to monitor"),
2118         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2119         OPT_STRING('o', "output", &record.data.path, "file",
2120                     "output file name"),
2121         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2122                         &record.opts.no_inherit_set,
2123                         "child tasks do not inherit counters"),
2124         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2125                     "synthesize non-sample events at the end of output"),
2126         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2127         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2128         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2129                     "Fail if the specified frequency can't be used"),
2130         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2131                      "profile at this frequency",
2132                       record__parse_freq),
2133         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2134                      "number of mmap data pages and AUX area tracing mmap pages",
2135                      record__parse_mmap_pages),
2136         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2137                      "Minimum number of bytes extracted from mmap data pages (default: 1)",
2138                      record__mmap_flush_parse),
2139         OPT_BOOLEAN(0, "group", &record.opts.group,
2140                     "put the counters into a counter group"),
2141         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2142                            NULL, "enables call-graph recording" ,
2143                            &record_callchain_opt),
2144         OPT_CALLBACK(0, "call-graph", &record.opts,
2145                      "record_mode[,record_size]", record_callchain_help,
2146                      &record_parse_callchain_opt),
2147         OPT_INCR('v', "verbose", &verbose,
2148                     "be more verbose (show counter open errors, etc)"),
2149         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2150         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2151                     "per thread counts"),
2152         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2153         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2154                     "Record the sample physical addresses"),
2155         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2156         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2157                         &record.opts.sample_time_set,
2158                         "Record the sample timestamps"),
2159         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2160                         "Record the sample period"),
2161         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2162                     "don't sample"),
2163         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2164                         &record.no_buildid_cache_set,
2165                         "do not update the buildid cache"),
2166         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2167                         &record.no_buildid_set,
2168                         "do not collect buildids in perf.data"),
2169         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2170                      "monitor event in cgroup name only",
2171                      parse_cgroups),
2172         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2173                   "ms to wait before starting measurement after program start"),
2174         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2175                    "user to profile"),
2176
2177         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2178                      "branch any", "sample any taken branches",
2179                      parse_branch_stack),
2180
2181         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2182                      "branch filter mask", "branch stack filter modes",
2183                      parse_branch_stack),
2184         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2185                     "sample by weight (on special events only)"),
2186         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2187                     "sample transaction flags (special events only)"),
2188         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2189                     "use per-thread mmaps"),
2190         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2191                     "sample selected machine registers on interrupt,"
2192                     " use '-I?' to list register names", parse_intr_regs),
2193         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2194                     "sample selected machine registers in user space,"
2195                     " use '--user-regs=?' to list register names", parse_user_regs),
2196         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2197                     "Record running/enabled time of read (:S) events"),
2198         OPT_CALLBACK('k', "clockid", &record.opts,
2199         "clockid", "clockid to use for events, see clock_gettime()",
2200         parse_clockid),
2201         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2202                           "opts", "AUX area tracing Snapshot Mode", ""),
2203         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2204                         "per thread proc mmap processing timeout in ms"),
2205         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2206                     "Record namespaces events"),
2207         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2208                     "Record context switch events"),
2209         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2210                          "Configure all used events to run in kernel space.",
2211                          PARSE_OPT_EXCLUSIVE),
2212         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2213                          "Configure all used events to run in user space.",
2214                          PARSE_OPT_EXCLUSIVE),
2215         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2216                     "collect kernel callchains"),
2217         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2218                     "collect user callchains"),
2219         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2220                    "clang binary to use for compiling BPF scriptlets"),
2221         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2222                    "options passed to clang when compiling BPF scriptlets"),
2223         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2224                    "file", "vmlinux pathname"),
2225         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2226                     "Record build-id of all DSOs regardless of hits"),
2227         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2228                     "append timestamp to output filename"),
2229         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2230                     "Record timestamp boundary (time of first/last samples)"),
2231         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2232                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2233                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2234                           "signal"),
2235         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2236                    "Limit the number of files generated by switch output"),
2237         OPT_BOOLEAN(0, "dry-run", &dry_run,
2238                     "Parse options then exit"),
2239 #ifdef HAVE_AIO_SUPPORT
2240         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2241                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2242                      record__aio_parse),
2243 #endif
2244         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2245                      "Set the affinity mask of the trace reading thread to the NUMA node CPU mask or the CPU of the processed mmap buffer",
2246                      record__parse_affinity),
2247 #ifdef HAVE_ZSTD_SUPPORT
2248         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2249                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2250                             record__parse_comp_level),
2251 #endif
2252         OPT_END()
2253 };
2254
2255 struct option *record_options = __record_options;
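/*
 * Putting a few of the options above together, an illustrative invocation
 * (a sketch, not part of the upstream source) might look like:
 *
 *   perf record -F 99 -g --switch-output=1G -o perf.data -- ./workload
 *
 * cmd_record() below parses such a command line with parse_options() and
 * then hands off to __cmd_record().
 */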
2256
2257 int cmd_record(int argc, const char **argv)
2258 {
2259         int err;
2260         struct record *rec = &record;
2261         char errbuf[BUFSIZ];
2262
2263         setlocale(LC_ALL, "");
2264
2265 #ifndef HAVE_LIBBPF_SUPPORT
2266 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2267         set_nobuild('\0', "clang-path", true);
2268         set_nobuild('\0', "clang-opt", true);
2269 # undef set_nobuild
2270 #endif
2271
2272 #ifndef HAVE_BPF_PROLOGUE
2273 # if !defined (HAVE_DWARF_SUPPORT)
2274 #  define REASON  "NO_DWARF=1"
2275 # elif !defined (HAVE_LIBBPF_SUPPORT)
2276 #  define REASON  "NO_LIBBPF=1"
2277 # else
2278 #  define REASON  "this architecture doesn't support BPF prologue"
2279 # endif
2280 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2281         set_nobuild('\0', "vmlinux", true);
2282 # undef set_nobuild
2283 # undef REASON
2284 #endif
2285
2286         CPU_ZERO(&rec->affinity_mask);
2287         rec->opts.affinity = PERF_AFFINITY_SYS;
2288
2289         rec->evlist = evlist__new();
2290         if (rec->evlist == NULL)
2291                 return -ENOMEM;
2292
2293         err = perf_config(perf_record_config, rec);
2294         if (err)
2295                 return err;
2296
2297         argc = parse_options(argc, argv, record_options, record_usage,
2298                             PARSE_OPT_STOP_AT_NON_OPTION);
2299         if (quiet)
2300                 perf_quiet_option();
2301
2302         /* Make system wide (-a) the default target. */
2303         if (!argc && target__none(&rec->opts.target))
2304                 rec->opts.target.system_wide = true;
2305
2306         if (nr_cgroups && !rec->opts.target.system_wide) {
2307                 usage_with_options_msg(record_usage, record_options,
2308                         "cgroup monitoring only available in system-wide mode");
2309
2310         }
2311
2312         if (rec->opts.comp_level != 0) {
2313                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2314                 rec->no_buildid = true;
2315         }
2316
2317         if (rec->opts.record_switch_events &&
2318             !perf_can_record_switch_events()) {
2319                 ui__error("kernel does not support recording context switch events\n");
2320                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2321                 return -EINVAL;
2322         }
2323
2324         if (switch_output_setup(rec)) {
2325                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2326                 return -EINVAL;
2327         }
2328
2329         if (rec->switch_output.time) {
2330                 signal(SIGALRM, alarm_sig_handler);
2331                 alarm(rec->switch_output.time);
2332         }
2333
2334         if (rec->switch_output.num_files) {
2335                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2336                                                       sizeof(char *));
2337                 if (!rec->switch_output.filenames)
2338                         return -EINVAL;
2339         }
2340
2341         /*
2342          * Allow aliases to facilitate the lookup of symbols for address
2343          * filters. Refer to auxtrace_parse_filters().
2344          */
2345         symbol_conf.allow_aliases = true;
2346
2347         symbol__init(NULL);
2348
2349         err = record__auxtrace_init(rec);
2350         if (err)
2351                 goto out;
2352
2353         if (dry_run)
2354                 goto out;
2355
2356         err = bpf__setup_stdout(rec->evlist);
2357         if (err) {
2358                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2359                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2360                          errbuf);
2361                 goto out;
2362         }
2363
2364         err = -ENOMEM;
2365
2366         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2367                 pr_warning(
2368 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2369 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
2370 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2371 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2372 "Samples in kernel modules won't be resolved at all.\n\n"
2373 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2374 "even with a suitable vmlinux or kallsyms file.\n\n");
2375
2376         if (rec->no_buildid_cache || rec->no_buildid) {
2377                 disable_buildid_cache();
2378         } else if (rec->switch_output.enabled) {
2379                 /*
2380                  * In 'perf record --switch-output', disable buildid
2381                  * generation by default to reduce data file switching
2382                  * overhead. Still generate buildids if they are required
2383                  * explicitly using
2384                  *
2385                  *  perf record --switch-output --no-no-buildid \
2386                  *              --no-no-buildid-cache
2387                  *
2388                  * Following code equals to:
2389                  *
2390                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2391                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2392                  *         disable_buildid_cache();
2393                  */
2394                 bool disable = true;
2395
2396                 if (rec->no_buildid_set && !rec->no_buildid)
2397                         disable = false;
2398                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2399                         disable = false;
2400                 if (disable) {
2401                         rec->no_buildid = true;
2402                         rec->no_buildid_cache = true;
2403                         disable_buildid_cache();
2404                 }
2405         }
2406
2407         if (record.opts.overwrite)
2408                 record.opts.tail_synthesize = true;
2409
2410         if (rec->evlist->core.nr_entries == 0 &&
2411             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2412                 pr_err("Not enough memory for event selector list\n");
2413                 goto out;
2414         }
2415
2416         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2417                 rec->opts.no_inherit = true;
2418
2419         err = target__validate(&rec->opts.target);
2420         if (err) {
2421                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2422                 ui__warning("%s\n", errbuf);
2423         }
2424
2425         err = target__parse_uid(&rec->opts.target);
2426         if (err) {
2427                 int saved_errno = errno;
2428
2429                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2430                 ui__error("%s", errbuf);
2431
2432                 err = -saved_errno;
2433                 goto out;
2434         }
2435
2436         /* Enable ignoring missing threads when -u/-p option is defined. */
2437         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2438
2439         err = -ENOMEM;
2440         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2441                 usage_with_options(record_usage, record_options);
2442
2443         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2444         if (err)
2445                 goto out;
2446
2447         /*
2448          * We take all buildids when the file contains
2449          * AUX area tracing data because we do not decode the
2450          * trace because it would take too long.
2451          */
2452         if (rec->opts.full_auxtrace)
2453                 rec->buildid_all = true;
2454
2455         if (record_opts__config(&rec->opts)) {
2456                 err = -EINVAL;
2457                 goto out;
2458         }
2459
2460         if (rec->opts.nr_cblocks > nr_cblocks_max)
2461                 rec->opts.nr_cblocks = nr_cblocks_max;
2462         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2463
2464         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2465         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2466
2467         if (rec->opts.comp_level > comp_level_max)
2468                 rec->opts.comp_level = comp_level_max;
2469         pr_debug("comp level: %d\n", rec->opts.comp_level);
2470
2471         err = __cmd_record(&record, argc, argv);
2472 out:
2473         evlist__delete(rec->evlist);
2474         symbol__exit();
2475         auxtrace_record__free(rec->itr);
2476         return err;
2477 }
2478
2479 static void snapshot_sig_handler(int sig __maybe_unused)
2480 {
2481         struct record *rec = &record;
2482
2483         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2484                 trigger_hit(&auxtrace_snapshot_trigger);
2485                 auxtrace_record__snapshot_started = 1;
2486                 if (auxtrace_record__snapshot_start(record.itr))
2487                         trigger_error(&auxtrace_snapshot_trigger);
2488         }
2489
2490         if (switch_output_signal(rec))
2491                 trigger_hit(&switch_output_trigger);
2492 }
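/*
 * snapshot_sig_handler(), registered for SIGUSR2 elsewhere in this file,
 * arms the AUX area snapshot and/or switch-output triggers that the main
 * loop in __cmd_record() acts on. An illustrative way to drive it from a
 * shell (a sketch, assuming a single running perf process):
 *
 *   kill -USR2 "$(pgrep -o -x perf)"
 */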
2493
2494 static void alarm_sig_handler(int sig __maybe_unused)
2495 {
2496         struct record *rec = &record;
2497
2498         if (switch_output_time(rec))
2499                 trigger_hit(&switch_output_trigger);
2500 }