1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/time-utils.h"
42 #include "util/units.h"
43 #include "util/bpf-event.h"
44 #include "asm/bug.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56 #include <linux/zalloc.h>
57
58 struct switch_output {
59         bool             enabled;
60         bool             signal;
61         unsigned long    size;
62         unsigned long    time;
63         const char      *str;
64         bool             set;
65         char             **filenames;
66         int              num_files;
67         int              cur_file;
68 };
69
70 struct record {
71         struct perf_tool        tool;
72         struct record_opts      opts;
73         u64                     bytes_written;
74         struct perf_data        data;
75         struct auxtrace_record  *itr;
76         struct evlist   *evlist;
77         struct perf_session     *session;
78         int                     realtime_prio;
79         bool                    no_buildid;
80         bool                    no_buildid_set;
81         bool                    no_buildid_cache;
82         bool                    no_buildid_cache_set;
83         bool                    buildid_all;
84         bool                    timestamp_filename;
85         bool                    timestamp_boundary;
86         struct switch_output    switch_output;
87         unsigned long long      samples;
88         cpu_set_t               affinity_mask;
89 };
90
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96         "SYS", "NODE", "CPU"
97 };
98
99 static bool switch_output_signal(struct record *rec)
100 {
101         return rec->switch_output.signal &&
102                trigger_is_ready(&switch_output_trigger);
103 }
104
105 static bool switch_output_size(struct record *rec)
106 {
107         return rec->switch_output.size &&
108                trigger_is_ready(&switch_output_trigger) &&
109                (rec->bytes_written >= rec->switch_output.size);
110 }
111
112 static bool switch_output_time(struct record *rec)
113 {
114         return rec->switch_output.time &&
115                trigger_is_ready(&switch_output_trigger);
116 }
117
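/*
 * Write a block of data to the perf.data file and account for it; once enough
 * bytes have been written, arm the switch-output trigger so the main loop can
 * rotate the output file.
 */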
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119                          void *bf, size_t size)
120 {
121         struct perf_data_file *file = &rec->session->data->file;
122
123         if (perf_data_file__write(file, bf, size) < 0) {
124                 pr_err("failed to write perf data, error: %m\n");
125                 return -1;
126         }
127
128         rec->bytes_written += size;
129
130         if (switch_output_size(rec))
131                 trigger_hit(&switch_output_trigger);
132
133         return 0;
134 }
135
136 static int record__aio_enabled(struct record *rec);
137 static int record__comp_enabled(struct record *rec);
138 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
139                             void *src, size_t src_size);
140
141 #ifdef HAVE_AIO_SUPPORT
142 static int record__aio_write(struct aiocb *cblock, int trace_fd,
143                 void *buf, size_t size, off_t off)
144 {
145         int rc;
146
147         cblock->aio_fildes = trace_fd;
148         cblock->aio_buf    = buf;
149         cblock->aio_nbytes = size;
150         cblock->aio_offset = off;
151         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
152
153         do {
154                 rc = aio_write(cblock);
155                 if (rc == 0) {
156                         break;
157                 } else if (errno != EAGAIN) {
158                         cblock->aio_fildes = -1;
159                         pr_err("failed to queue perf data, error: %m\n");
160                         break;
161                 }
162         } while (1);
163
164         return rc;
165 }
166
167 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
168 {
169         void *rem_buf;
170         off_t rem_off;
171         size_t rem_size;
172         int rc, aio_errno;
173         ssize_t aio_ret, written;
174
175         aio_errno = aio_error(cblock);
176         if (aio_errno == EINPROGRESS)
177                 return 0;
178
179         written = aio_ret = aio_return(cblock);
180         if (aio_ret < 0) {
181                 if (aio_errno != EINTR)
182                         pr_err("failed to write perf data, error: %m\n");
183                 written = 0;
184         }
185
186         rem_size = cblock->aio_nbytes - written;
187
188         if (rem_size == 0) {
189                 cblock->aio_fildes = -1;
190                 /*
191                  * md->refcount is incremented in record__aio_pushfn() for
192                  * every aio write request started in record__aio_push() so
193                  * decrement it because the request is now complete.
194                  */
195                 perf_mmap__put(md);
196                 rc = 1;
197         } else {
198                 /*
199                  * aio write request may require restart with the
200                  * The aio write request may need to be restarted with the
201                  * remainder if the kernel didn't write the whole
202                  * chunk at once.
203                 rem_off = cblock->aio_offset + written;
204                 rem_buf = (void *)(cblock->aio_buf + written);
205                 record__aio_write(cblock, cblock->aio_fildes,
206                                 rem_buf, rem_size, rem_off);
207                 rc = 0;
208         }
209
210         return rc;
211 }
212
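/*
 * Wait for in-flight aio writes on this map.  With sync_all the call returns
 * only after every control block has completed; otherwise it returns the
 * index of the first control block that is free for a new write.
 */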
213 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
214 {
215         struct aiocb **aiocb = md->aio.aiocb;
216         struct aiocb *cblocks = md->aio.cblocks;
217         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
218         int i, do_suspend;
219
220         do {
221                 do_suspend = 0;
222                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
223                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
224                                 if (sync_all)
225                                         aiocb[i] = NULL;
226                                 else
227                                         return i;
228                         } else {
229                                 /*
230                  * The started aio write is not complete yet,
231                  * so it has to be waited for before the
232                  * next allocation.
233                                  */
234                                 aiocb[i] = &cblocks[i];
235                                 do_suspend = 1;
236                         }
237                 }
238                 if (!do_suspend)
239                         return -1;
240
241                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
242                         if (!(errno == EAGAIN || errno == EINTR))
243                                 pr_err("failed to sync perf data, error: %m\n");
244                 }
245         } while (1);
246 }
247
248 struct record_aio {
249         struct record   *rec;
250         void            *data;
251         size_t          size;
252 };
253
254 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
255 {
256         struct record_aio *aio = to;
257
258         /*
259          * The map->base data pointed to by buf is copied into a free map->aio.data[]
260          * buffer to release space in the kernel buffer as fast as possible, by
261          * calling perf_mmap__consume() from the perf_mmap__push() function.
262          *
263          * That lets the kernel proceed with storing more profiling data into
264          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
265          *
266          * Copying can be done in two steps in case the chunk of profiling data
267          * crosses the upper bound of the kernel buffer. In this case we first move
268          * the data from map->start up to the upper bound and then the remainder
269          * from the beginning of the kernel buffer to the end of the data chunk.
270          */
271
272         if (record__comp_enabled(aio->rec)) {
273                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
274                                      perf_mmap__mmap_len(map) - aio->size,
275                                      buf, size);
276         } else {
277                 memcpy(aio->data + aio->size, buf, size);
278         }
279
280         if (!aio->size) {
281                 /*
282                  * Increment map->refcount to guard the map->aio.data[] buffer
283                  * from premature deallocation, because the map object can be
284                  * released before the aio write request started on the
285                  * map->aio.data[] buffer has completed.
286                  *
287                  * perf_mmap__put() is done in record__aio_complete() once the
288                  * started aio request completes, or in record__aio_push()
289                  * if the request failed to start.
290                  */
291                 perf_mmap__get(map);
292         }
293
294         aio->size += size;
295
296         return size;
297 }
298
299 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
300 {
301         int ret, idx;
302         int trace_fd = rec->session->data->file.fd;
303         struct record_aio aio = { .rec = rec, .size = 0 };
304
305         /*
306          * Call record__aio_sync() to wait until a map->aio.data[] buffer
307          * becomes available after the previous aio write operation.
308          */
309
310         idx = record__aio_sync(map, false);
311         aio.data = map->aio.data[idx];
312         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
313         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
314                 return ret;
315
316         rec->samples++;
317         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
318         if (!ret) {
319                 *off += aio.size;
320                 rec->bytes_written += aio.size;
321                 if (switch_output_size(rec))
322                         trigger_hit(&switch_output_trigger);
323         } else {
324                 /*
325                  * Decrement the map->refcount incremented in record__aio_pushfn()
326                  * if the record__aio_write() operation failed to start; otherwise
327                  * map->refcount is decremented in record__aio_complete() after
328                  * the aio write operation finishes successfully.
329                  */
330                 perf_mmap__put(map);
331         }
332
333         return ret;
334 }
335
336 static off_t record__aio_get_pos(int trace_fd)
337 {
338         return lseek(trace_fd, 0, SEEK_CUR);
339 }
340
341 static void record__aio_set_pos(int trace_fd, off_t pos)
342 {
343         lseek(trace_fd, pos, SEEK_SET);
344 }
345
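/* Wait for all outstanding aio writes on every mapped ring buffer to complete. */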
346 static void record__aio_mmap_read_sync(struct record *rec)
347 {
348         int i;
349         struct evlist *evlist = rec->evlist;
350         struct perf_mmap *maps = evlist->mmap;
351
352         if (!record__aio_enabled(rec))
353                 return;
354
355         for (i = 0; i < evlist->nr_mmaps; i++) {
356                 struct perf_mmap *map = &maps[i];
357
358                 if (map->base)
359                         record__aio_sync(map, true);
360         }
361 }
362
363 static int nr_cblocks_default = 1;
364 static int nr_cblocks_max = 4;
365
366 static int record__aio_parse(const struct option *opt,
367                              const char *str,
368                              int unset)
369 {
370         struct record_opts *opts = (struct record_opts *)opt->value;
371
372         if (unset) {
373                 opts->nr_cblocks = 0;
374         } else {
375                 if (str)
376                         opts->nr_cblocks = strtol(str, NULL, 0);
377                 if (!opts->nr_cblocks)
378                         opts->nr_cblocks = nr_cblocks_default;
379         }
380
381         return 0;
382 }
383 #else /* HAVE_AIO_SUPPORT */
384 static int nr_cblocks_max = 0;
385
386 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
387                             off_t *off __maybe_unused)
388 {
389         return -1;
390 }
391
392 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
393 {
394         return -1;
395 }
396
397 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
398 {
399 }
400
401 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
402 {
403 }
404 #endif
405
406 static int record__aio_enabled(struct record *rec)
407 {
408         return rec->opts.nr_cblocks > 0;
409 }
410
411 #define MMAP_FLUSH_DEFAULT 1
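/*
 * Parse the mmap flush threshold: the minimal number of bytes that must
 * accumulate in a ring buffer before it is flushed to the output, given either
 * as a plain byte count or with a B/K/M/G suffix, and capped at a quarter of
 * the mmap buffer size.
 */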
412 static int record__mmap_flush_parse(const struct option *opt,
413                                     const char *str,
414                                     int unset)
415 {
416         int flush_max;
417         struct record_opts *opts = (struct record_opts *)opt->value;
418         static struct parse_tag tags[] = {
419                         { .tag  = 'B', .mult = 1       },
420                         { .tag  = 'K', .mult = 1 << 10 },
421                         { .tag  = 'M', .mult = 1 << 20 },
422                         { .tag  = 'G', .mult = 1 << 30 },
423                         { .tag  = 0 },
424         };
425
426         if (unset)
427                 return 0;
428
429         if (str) {
430                 opts->mmap_flush = parse_tag_value(str, tags);
431                 if (opts->mmap_flush == (int)-1)
432                         opts->mmap_flush = strtol(str, NULL, 0);
433         }
434
435         if (!opts->mmap_flush)
436                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
437
438         flush_max = perf_evlist__mmap_size(opts->mmap_pages);
439         flush_max /= 4;
440         if (opts->mmap_flush > flush_max)
441                 opts->mmap_flush = flush_max;
442
443         return 0;
444 }
445
446 #ifdef HAVE_ZSTD_SUPPORT
447 static unsigned int comp_level_default = 1;
448
449 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
450 {
451         struct record_opts *opts = opt->value;
452
453         if (unset) {
454                 opts->comp_level = 0;
455         } else {
456                 if (str)
457                         opts->comp_level = strtol(str, NULL, 0);
458                 if (!opts->comp_level)
459                         opts->comp_level = comp_level_default;
460         }
461
462         return 0;
463 }
464 #endif
465 static unsigned int comp_level_max = 22;
466
467 static int record__comp_enabled(struct record *rec)
468 {
469         return rec->opts.comp_level > 0;
470 }
471
472 static int process_synthesized_event(struct perf_tool *tool,
473                                      union perf_event *event,
474                                      struct perf_sample *sample __maybe_unused,
475                                      struct machine *machine __maybe_unused)
476 {
477         struct record *rec = container_of(tool, struct record, tool);
478         return record__write(rec, NULL, event, event->header.size);
479 }
480
481 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
482 {
483         struct record *rec = to;
484
485         if (record__comp_enabled(rec)) {
486                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
487                 bf   = map->data;
488         }
489
490         rec->samples++;
491         return record__write(rec, map, bf, size);
492 }
493
494 static volatile int done;
495 static volatile int signr = -1;
496 static volatile int child_finished;
497
498 static void sig_handler(int sig)
499 {
500         if (sig == SIGCHLD)
501                 child_finished = 1;
502         else
503                 signr = sig;
504
505         done = 1;
506 }
507
508 static void sigsegv_handler(int sig)
509 {
510         perf_hooks__recover();
511         sighandler_dump_stack(sig);
512 }
513
514 static void record__sig_exit(void)
515 {
516         if (signr == -1)
517                 return;
518
519         signal(signr, SIG_DFL);
520         raise(signr);
521 }
522
523 #ifdef HAVE_AUXTRACE_SUPPORT
524
525 static int record__process_auxtrace(struct perf_tool *tool,
526                                     struct perf_mmap *map,
527                                     union perf_event *event, void *data1,
528                                     size_t len1, void *data2, size_t len2)
529 {
530         struct record *rec = container_of(tool, struct record, tool);
531         struct perf_data *data = &rec->data;
532         size_t padding;
533         u8 pad[8] = {0};
534
535         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
536                 off_t file_offset;
537                 int fd = perf_data__fd(data);
538                 int err;
539
540                 file_offset = lseek(fd, 0, SEEK_CUR);
541                 if (file_offset == -1)
542                         return -1;
543                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
544                                                      event, file_offset);
545                 if (err)
546                         return err;
547         }
548
549         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
550         padding = (len1 + len2) & 7;
551         if (padding)
552                 padding = 8 - padding;
553
554         record__write(rec, map, event, event->header.size);
555         record__write(rec, map, data1, len1);
556         if (len2)
557                 record__write(rec, map, data2, len2);
558         record__write(rec, map, &pad, padding);
559
560         return 0;
561 }
562
563 static int record__auxtrace_mmap_read(struct record *rec,
564                                       struct perf_mmap *map)
565 {
566         int ret;
567
568         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
569                                   record__process_auxtrace);
570         if (ret < 0)
571                 return ret;
572
573         if (ret)
574                 rec->samples++;
575
576         return 0;
577 }
578
579 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
580                                                struct perf_mmap *map)
581 {
582         int ret;
583
584         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
585                                            record__process_auxtrace,
586                                            rec->opts.auxtrace_snapshot_size);
587         if (ret < 0)
588                 return ret;
589
590         if (ret)
591                 rec->samples++;
592
593         return 0;
594 }
595
596 static int record__auxtrace_read_snapshot_all(struct record *rec)
597 {
598         int i;
599         int rc = 0;
600
601         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
602                 struct perf_mmap *map = &rec->evlist->mmap[i];
603
604                 if (!map->auxtrace_mmap.base)
605                         continue;
606
607                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
608                         rc = -1;
609                         goto out;
610                 }
611         }
612 out:
613         return rc;
614 }
615
616 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
617 {
618         pr_debug("Recording AUX area tracing snapshot\n");
619         if (record__auxtrace_read_snapshot_all(rec) < 0) {
620                 trigger_error(&auxtrace_snapshot_trigger);
621         } else {
622                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
623                         trigger_error(&auxtrace_snapshot_trigger);
624                 else
625                         trigger_ready(&auxtrace_snapshot_trigger);
626         }
627 }
628
629 static int record__auxtrace_snapshot_exit(struct record *rec)
630 {
631         if (trigger_is_error(&auxtrace_snapshot_trigger))
632                 return 0;
633
634         if (!auxtrace_record__snapshot_started &&
635             auxtrace_record__snapshot_start(rec->itr))
636                 return -1;
637
638         record__read_auxtrace_snapshot(rec, true);
639         if (trigger_is_error(&auxtrace_snapshot_trigger))
640                 return -1;
641
642         return 0;
643 }
644
645 static int record__auxtrace_init(struct record *rec)
646 {
647         int err;
648
649         if (!rec->itr) {
650                 rec->itr = auxtrace_record__init(rec->evlist, &err);
651                 if (err)
652                         return err;
653         }
654
655         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
656                                               rec->opts.auxtrace_snapshot_opts);
657         if (err)
658                 return err;
659
660         return auxtrace_parse_filters(rec->evlist);
661 }
662
663 #else
664
665 static inline
666 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
667                                struct perf_mmap *map __maybe_unused)
668 {
669         return 0;
670 }
671
672 static inline
673 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
674                                     bool on_exit __maybe_unused)
675 {
676 }
677
678 static inline
679 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
680 {
681         return 0;
682 }
683
684 static inline
685 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
686 {
687         return 0;
688 }
689
690 static int record__auxtrace_init(struct record *rec __maybe_unused)
691 {
692         return 0;
693 }
694
695 #endif
696
697 static int record__mmap_evlist(struct record *rec,
698                                struct evlist *evlist)
699 {
700         struct record_opts *opts = &rec->opts;
701         char msg[512];
702
703         if (opts->affinity != PERF_AFFINITY_SYS)
704                 cpu__setup_cpunode_map();
705
706         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
707                                  opts->auxtrace_mmap_pages,
708                                  opts->auxtrace_snapshot_mode,
709                                  opts->nr_cblocks, opts->affinity,
710                                  opts->mmap_flush, opts->comp_level) < 0) {
711                 if (errno == EPERM) {
712                         pr_err("Permission error mapping pages.\n"
713                                "Consider increasing "
714                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
715                                "or try again with a smaller value of -m/--mmap_pages.\n"
716                                "(current value: %u,%u)\n",
717                                opts->mmap_pages, opts->auxtrace_mmap_pages);
718                         return -errno;
719                 } else {
720                         pr_err("failed to mmap with %d (%s)\n", errno,
721                                 str_error_r(errno, msg, sizeof(msg)));
722                         if (errno)
723                                 return -errno;
724                         else
725                                 return -EINVAL;
726                 }
727         }
728         return 0;
729 }
730
731 static int record__mmap(struct record *rec)
732 {
733         return record__mmap_evlist(rec, rec->evlist);
734 }
735
736 static int record__open(struct record *rec)
737 {
738         char msg[BUFSIZ];
739         struct evsel *pos;
740         struct evlist *evlist = rec->evlist;
741         struct perf_session *session = rec->session;
742         struct record_opts *opts = &rec->opts;
743         int rc = 0;
744
745         /*
746          * For initial_delay we need to add a dummy event so that we can track
747          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
748          * real events, the ones asked for by the user.
749          */
750         if (opts->initial_delay) {
751                 if (perf_evlist__add_dummy(evlist))
752                         return -ENOMEM;
753
754                 pos = perf_evlist__first(evlist);
755                 pos->tracking = 0;
756                 pos = perf_evlist__last(evlist);
757                 pos->tracking = 1;
758                 pos->core.attr.enable_on_exec = 1;
759         }
760
761         perf_evlist__config(evlist, opts, &callchain_param);
762
763         evlist__for_each_entry(evlist, pos) {
764 try_again:
765                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
766                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
767                                 if (verbose > 0)
768                                         ui__warning("%s\n", msg);
769                                 goto try_again;
770                         }
771                         if ((errno == EINVAL || errno == EBADF) &&
772                             pos->leader != pos &&
773                             pos->weak_group) {
774                                 pos = perf_evlist__reset_weak_group(evlist, pos);
775                                 goto try_again;
776                         }
777                         rc = -errno;
778                         perf_evsel__open_strerror(pos, &opts->target,
779                                                   errno, msg, sizeof(msg));
780                         ui__error("%s\n", msg);
781                         goto out;
782                 }
783
784                 pos->supported = true;
785         }
786
787         if (perf_evlist__apply_filters(evlist, &pos)) {
788                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
789                         pos->filter, perf_evsel__name(pos), errno,
790                         str_error_r(errno, msg, sizeof(msg)));
791                 rc = -1;
792                 goto out;
793         }
794
795         rc = record__mmap(rec);
796         if (rc)
797                 goto out;
798
799         session->evlist = evlist;
800         perf_session__set_id_hdr_size(session);
801 out:
802         return rc;
803 }
804
805 static int process_sample_event(struct perf_tool *tool,
806                                 union perf_event *event,
807                                 struct perf_sample *sample,
808                                 struct evsel *evsel,
809                                 struct machine *machine)
810 {
811         struct record *rec = container_of(tool, struct record, tool);
812
813         if (rec->evlist->first_sample_time == 0)
814                 rec->evlist->first_sample_time = sample->time;
815
816         rec->evlist->last_sample_time = sample->time;
817
818         if (rec->buildid_all)
819                 return 0;
820
821         rec->samples++;
822         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
823 }
824
825 static int process_buildids(struct record *rec)
826 {
827         struct perf_session *session = rec->session;
828
829         if (perf_data__size(&rec->data) == 0)
830                 return 0;
831
832         /*
833          * During this process, it'll load the kernel map and replace
834          * dso->long_name with the real pathname it found.  In this case
835          * we prefer the vmlinux path like
836          *   /lib/modules/3.16.4/build/vmlinux
837          *
838          * rather than build-id path (in debug directory).
839          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
840          */
841         symbol_conf.ignore_vmlinux_buildid = true;
842
843         /*
844          * If --buildid-all is given, it marks all DSOs regardless of hits,
845          * so there is no need to process samples. But if timestamp_boundary is
846          * enabled, it still needs to walk all samples to get the timestamps of
847          * the first/last samples.
848          */
849         if (rec->buildid_all && !rec->timestamp_boundary)
850                 rec->tool.sample = NULL;
851
852         return perf_session__process_events(session);
853 }
854
855 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
856 {
857         int err;
858         struct perf_tool *tool = data;
859         /*
860          * As for the guest kernel, when processing the record&report subcommands
861          * we arrange the module mmap prior to the guest kernel mmap and trigger
862          * a preload dso, because by default guest module symbols are loaded
863          * from guest kallsyms instead of /lib/modules/XXX/XXX. This
864          * method is used to avoid missing symbols when the first address is
865          * in a module instead of in the guest kernel.
866          */
867         err = perf_event__synthesize_modules(tool, process_synthesized_event,
868                                              machine);
869         if (err < 0)
870                 pr_err("Couldn't record guest kernel [%d]'s reference"
871                        " relocation symbol.\n", machine->pid);
872
873         /*
874          * We use _stext for the guest kernel because the guest kernel's
875          * /proc/kallsyms sometimes has no _text.
876          */
877         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
878                                                  machine);
879         if (err < 0)
880                 pr_err("Couldn't record guest kernel [%d]'s reference"
881                        " relocation symbol.\n", machine->pid);
882 }
883
884 static struct perf_event_header finished_round_event = {
885         .size = sizeof(struct perf_event_header),
886         .type = PERF_RECORD_FINISHED_ROUND,
887 };
888
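/*
 * With node/cpu affinity modes, move the recording thread onto the CPUs
 * backing the ring buffer that is about to be read, to keep the copies
 * NUMA-local.
 */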
889 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
890 {
891         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
892             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
893                 CPU_ZERO(&rec->affinity_mask);
894                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
895                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
896         }
897 }
898
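/*
 * Callback for zstd_compress_stream_to_records(): the first call initializes a
 * PERF_RECORD_COMPRESSED header, subsequent calls grow the record's size by
 * the amount of newly compressed data.
 */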
899 static size_t process_comp_header(void *record, size_t increment)
900 {
901         struct perf_record_compressed *event = record;
902         size_t size = sizeof(*event);
903
904         if (increment) {
905                 event->header.size += increment;
906                 return increment;
907         }
908
909         event->header.type = PERF_RECORD_COMPRESSED;
910         event->header.size = size;
911
912         return size;
913 }
914
915 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
916                             void *src, size_t src_size)
917 {
918         size_t compressed;
919         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
920
921         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
922                                                      max_record_size, process_comp_header);
923
924         session->bytes_transferred += src_size;
925         session->bytes_compressed  += compressed;
926
927         return compressed;
928 }
929
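/*
 * Push the data accumulated in the regular or overwrite mmaps to the output,
 * through the synchronous or aio path, read any AUX area data, and emit a
 * PERF_RECORD_FINISHED_ROUND once at least one event has been written.
 */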
930 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
931                                     bool overwrite, bool synch)
932 {
933         u64 bytes_written = rec->bytes_written;
934         int i;
935         int rc = 0;
936         struct perf_mmap *maps;
937         int trace_fd = rec->data.file.fd;
938         off_t off = 0;
939
940         if (!evlist)
941                 return 0;
942
943         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
944         if (!maps)
945                 return 0;
946
947         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
948                 return 0;
949
950         if (record__aio_enabled(rec))
951                 off = record__aio_get_pos(trace_fd);
952
953         for (i = 0; i < evlist->nr_mmaps; i++) {
954                 u64 flush = 0;
955                 struct perf_mmap *map = &maps[i];
956
957                 if (map->base) {
958                         record__adjust_affinity(rec, map);
959                         if (synch) {
960                                 flush = map->flush;
961                                 map->flush = 1;
962                         }
963                         if (!record__aio_enabled(rec)) {
964                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
965                                         if (synch)
966                                                 map->flush = flush;
967                                         rc = -1;
968                                         goto out;
969                                 }
970                         } else {
971                                 if (record__aio_push(rec, map, &off) < 0) {
972                                         record__aio_set_pos(trace_fd, off);
973                                         if (synch)
974                                                 map->flush = flush;
975                                         rc = -1;
976                                         goto out;
977                                 }
978                         }
979                         if (synch)
980                                 map->flush = flush;
981                 }
982
983                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
984                     record__auxtrace_mmap_read(rec, map) != 0) {
985                         rc = -1;
986                         goto out;
987                 }
988         }
989
990         if (record__aio_enabled(rec))
991                 record__aio_set_pos(trace_fd, off);
992
993         /*
994          * Mark the round finished in case we wrote
995          * at least one event.
996          */
997         if (bytes_written != rec->bytes_written)
998                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
999
1000         if (overwrite)
1001                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1002 out:
1003         return rc;
1004 }
1005
1006 static int record__mmap_read_all(struct record *rec, bool synch)
1007 {
1008         int err;
1009
1010         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1011         if (err)
1012                 return err;
1013
1014         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1015 }
1016
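/*
 * Start with every header feature enabled and then clear the ones this
 * recording session cannot or does not need to provide.
 */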
1017 static void record__init_features(struct record *rec)
1018 {
1019         struct perf_session *session = rec->session;
1020         int feat;
1021
1022         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1023                 perf_header__set_feat(&session->header, feat);
1024
1025         if (rec->no_buildid)
1026                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1027
1028         if (!have_tracepoints(&rec->evlist->core.entries))
1029                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1030
1031         if (!rec->opts.branch_stack)
1032                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1033
1034         if (!rec->opts.full_auxtrace)
1035                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1036
1037         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1038                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1039
1040         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1041         if (!record__comp_enabled(rec))
1042                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1043
1044         perf_header__clear_feat(&session->header, HEADER_STAT);
1045 }
1046
1047 static void
1048 record__finish_output(struct record *rec)
1049 {
1050         struct perf_data *data = &rec->data;
1051         int fd = perf_data__fd(data);
1052
1053         if (data->is_pipe)
1054                 return;
1055
1056         rec->session->header.data_size += rec->bytes_written;
1057         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1058
1059         if (!rec->no_buildid) {
1060                 process_buildids(rec);
1061
1062                 if (rec->buildid_all)
1063                         dsos__hit_all(rec->session);
1064         }
1065         perf_session__write_header(rec->session, rec->evlist, fd, true);
1066
1067         return;
1068 }
1069
1070 static int record__synthesize_workload(struct record *rec, bool tail)
1071 {
1072         int err;
1073         struct perf_thread_map *thread_map;
1074
1075         if (rec->opts.tail_synthesize != tail)
1076                 return 0;
1077
1078         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1079         if (thread_map == NULL)
1080                 return -1;
1081
1082         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1083                                                  process_synthesized_event,
1084                                                  &rec->session->machines.host,
1085                                                  rec->opts.sample_address);
1086         perf_thread_map__put(thread_map);
1087         return err;
1088 }
1089
1090 static int record__synthesize(struct record *rec, bool tail);
1091
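/*
 * Finish the current perf.data file and switch to a new one named with the
 * current timestamp.  Unless we are exiting, the tracking events needed by
 * the new file are synthesized again; when a maximum number of output files
 * is configured, the oldest one is removed and its slot reused.
 */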
1092 static int
1093 record__switch_output(struct record *rec, bool at_exit)
1094 {
1095         struct perf_data *data = &rec->data;
1096         int fd, err;
1097         char *new_filename;
1098
1099         /* Same size as a real timestamp, e.g. "2015122520103046" */
1100         char timestamp[] = "InvalidTimestamp";
1101
1102         record__aio_mmap_read_sync(rec);
1103
1104         record__synthesize(rec, true);
1105         if (target__none(&rec->opts.target))
1106                 record__synthesize_workload(rec, true);
1107
1108         rec->samples = 0;
1109         record__finish_output(rec);
1110         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1111         if (err) {
1112                 pr_err("Failed to get current timestamp\n");
1113                 return -EINVAL;
1114         }
1115
1116         fd = perf_data__switch(data, timestamp,
1117                                     rec->session->header.data_offset,
1118                                     at_exit, &new_filename);
1119         if (fd >= 0 && !at_exit) {
1120                 rec->bytes_written = 0;
1121                 rec->session->header.data_size = 0;
1122         }
1123
1124         if (!quiet)
1125                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1126                         data->path, timestamp);
1127
1128         if (rec->switch_output.num_files) {
1129                 int n = rec->switch_output.cur_file + 1;
1130
1131                 if (n >= rec->switch_output.num_files)
1132                         n = 0;
1133                 rec->switch_output.cur_file = n;
1134                 if (rec->switch_output.filenames[n]) {
1135                         remove(rec->switch_output.filenames[n]);
1136                         zfree(&rec->switch_output.filenames[n]);
1137                 }
1138                 rec->switch_output.filenames[n] = new_filename;
1139         } else {
1140                 free(new_filename);
1141         }
1142
1143         /* Output tracking events */
1144         if (!at_exit) {
1145                 record__synthesize(rec, false);
1146
1147                 /*
1148                  * In 'perf record --switch-output' without -a,
1149                  * record__synthesize() in record__switch_output() won't
1150                  * generate tracking events because there's no thread_map
1151                  * in the evlist, so the newly created perf.data won't
1152                  * contain map and comm information.
1153                  * Create a fake thread_map and directly call
1154                  * perf_event__synthesize_thread_map() for those events.
1155                  */
1156                 if (target__none(&rec->opts.target))
1157                         record__synthesize_workload(rec, false);
1158         }
1159         return fd;
1160 }
1161
1162 static volatile int workload_exec_errno;
1163
1164 /*
1165  * perf_evlist__prepare_workload will send a SIGUSR1
1166  * if the fork fails, since we asked for it by setting its
1167  * want_signal to true.
1168  */
1169 static void workload_exec_failed_signal(int signo __maybe_unused,
1170                                         siginfo_t *info,
1171                                         void *ucontext __maybe_unused)
1172 {
1173         workload_exec_errno = info->si_value.sival_int;
1174         done = 1;
1175         child_finished = 1;
1176 }
1177
1178 static void snapshot_sig_handler(int sig);
1179 static void alarm_sig_handler(int sig);
1180
1181 int __weak
1182 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1183                             struct perf_tool *tool __maybe_unused,
1184                             perf_event__handler_t process __maybe_unused,
1185                             struct machine *machine __maybe_unused)
1186 {
1187         return 0;
1188 }
1189
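/*
 * Grab the control page of any mapped ring buffer; here it only serves as the
 * source of the time conversion parameters synthesized by
 * perf_event__synth_time_conv().
 */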
1190 static const struct perf_event_mmap_page *
1191 perf_evlist__pick_pc(struct evlist *evlist)
1192 {
1193         if (evlist) {
1194                 if (evlist->mmap && evlist->mmap[0].base)
1195                         return evlist->mmap[0].base;
1196                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1197                         return evlist->overwrite_mmap[0].base;
1198         }
1199         return NULL;
1200 }
1201
1202 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1203 {
1204         const struct perf_event_mmap_page *pc;
1205
1206         pc = perf_evlist__pick_pc(rec->evlist);
1207         if (pc)
1208                 return pc;
1209         return NULL;
1210 }
1211
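/*
 * Synthesize the side-band events (attrs, features, tracing data, kernel and
 * module mmaps, thread and cpu maps, ...) that describe the recorded context,
 * either at startup or, when tail synthesis is requested, right before the
 * output is finalized.
 */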
1212 static int record__synthesize(struct record *rec, bool tail)
1213 {
1214         struct perf_session *session = rec->session;
1215         struct machine *machine = &session->machines.host;
1216         struct perf_data *data = &rec->data;
1217         struct record_opts *opts = &rec->opts;
1218         struct perf_tool *tool = &rec->tool;
1219         int fd = perf_data__fd(data);
1220         int err = 0;
1221
1222         if (rec->opts.tail_synthesize != tail)
1223                 return 0;
1224
1225         if (data->is_pipe) {
1226                 /*
1227                  * We need to synthesize events first, because some
1228                  * features work on top of them (on the report side).
1229                  */
1230                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1231                                                    process_synthesized_event);
1232                 if (err < 0) {
1233                         pr_err("Couldn't synthesize attrs.\n");
1234                         goto out;
1235                 }
1236
1237                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1238                                                       process_synthesized_event);
1239                 if (err < 0) {
1240                         pr_err("Couldn't synthesize features.\n");
1241                         return err;
1242                 }
1243
1244                 if (have_tracepoints(&rec->evlist->core.entries)) {
1245                         /*
1246                          * FIXME err <= 0 here actually means that
1247                          * there were no tracepoints, so it's not really
1248                          * an error, just that we don't need to
1249                          * synthesize anything.  We really have to
1250                          * return this more properly and also
1251                          * propagate the errors that are now calling die()
1252                          */
1253                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1254                                                                   process_synthesized_event);
1255                         if (err <= 0) {
1256                                 pr_err("Couldn't record tracing data.\n");
1257                                 goto out;
1258                         }
1259                         rec->bytes_written += err;
1260                 }
1261         }
1262
1263         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1264                                           process_synthesized_event, machine);
1265         if (err)
1266                 goto out;
1267
1268         if (rec->opts.full_auxtrace) {
1269                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1270                                         session, process_synthesized_event);
1271                 if (err)
1272                         goto out;
1273         }
1274
1275         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1276                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1277                                                          machine);
1278                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1279                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1280                                    "Check /proc/kallsyms permission or run as root.\n");
1281
1282                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1283                                                      machine);
1284                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1285                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1286                                    "Check /proc/modules permission or run as root.\n");
1287         }
1288
1289         if (perf_guest) {
1290                 machines__process_guests(&session->machines,
1291                                          perf_event__synthesize_guest_os, tool);
1292         }
1293
1294         err = perf_event__synthesize_extra_attr(&rec->tool,
1295                                                 rec->evlist,
1296                                                 process_synthesized_event,
1297                                                 data->is_pipe);
1298         if (err)
1299                 goto out;
1300
1301         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1302                                                  process_synthesized_event,
1303                                                 NULL);
1304         if (err < 0) {
1305                 pr_err("Couldn't synthesize thread map.\n");
1306                 return err;
1307         }
1308
1309         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1310                                              process_synthesized_event, NULL);
1311         if (err < 0) {
1312                 pr_err("Couldn't synthesize cpu map.\n");
1313                 return err;
1314         }
1315
1316         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1317                                                 machine, opts);
1318         if (err < 0)
1319                 pr_warning("Couldn't synthesize bpf events.\n");
1320
1321         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1322                                             process_synthesized_event, opts->sample_address,
1323                                             1);
1324 out:
1325         return err;
1326 }
1327
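/*
 * Main body of 'perf record': set up the signal handlers, the session and the
 * output header, start the workload if one was given, then loop reading the
 * ring buffers until the workload exits or recording is interrupted.
 */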
1328 static int __cmd_record(struct record *rec, int argc, const char **argv)
1329 {
1330         int err;
1331         int status = 0;
1332         unsigned long waking = 0;
1333         const bool forks = argc > 0;
1334         struct perf_tool *tool = &rec->tool;
1335         struct record_opts *opts = &rec->opts;
1336         struct perf_data *data = &rec->data;
1337         struct perf_session *session;
1338         bool disabled = false, draining = false;
1339         struct evlist *sb_evlist = NULL;
1340         int fd;
1341         float ratio = 0;
1342
1343         atexit(record__sig_exit);
1344         signal(SIGCHLD, sig_handler);
1345         signal(SIGINT, sig_handler);
1346         signal(SIGTERM, sig_handler);
1347         signal(SIGSEGV, sigsegv_handler);
1348
1349         if (rec->opts.record_namespaces)
1350                 tool->namespace_events = true;
1351
1352         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1353                 signal(SIGUSR2, snapshot_sig_handler);
1354                 if (rec->opts.auxtrace_snapshot_mode)
1355                         trigger_on(&auxtrace_snapshot_trigger);
1356                 if (rec->switch_output.enabled)
1357                         trigger_on(&switch_output_trigger);
1358         } else {
1359                 signal(SIGUSR2, SIG_IGN);
1360         }
1361
1362         session = perf_session__new(data, false, tool);
1363         if (session == NULL) {
1364                 pr_err("Perf session creation failed.\n");
1365                 return -1;
1366         }
1367
1368         fd = perf_data__fd(data);
1369         rec->session = session;
1370
1371         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1372                 pr_err("Compression initialization failed.\n");
1373                 return -1;
1374         }
1375
1376         session->header.env.comp_type  = PERF_COMP_ZSTD;
1377         session->header.env.comp_level = rec->opts.comp_level;
1378
1379         record__init_features(rec);
1380
1381         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1382                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1383
1384         if (forks) {
1385                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1386                                                     argv, data->is_pipe,
1387                                                     workload_exec_failed_signal);
1388                 if (err < 0) {
1389                         pr_err("Couldn't run the workload!\n");
1390                         status = err;
1391                         goto out_delete_session;
1392                 }
1393         }
1394
1395         /*
1396          * If we have just a single event and are sending data
1397          * through a pipe, we need to force id allocation,
1398          * because we synthesize the event name through the pipe
1399          * and need the id for that.
1400          */
1401         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1402                 rec->opts.sample_id = true;
1403
1404         if (record__open(rec) != 0) {
1405                 err = -1;
1406                 goto out_child;
1407         }
1408         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1409
1410         err = bpf__apply_obj_config();
1411         if (err) {
1412                 char errbuf[BUFSIZ];
1413
1414                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1415                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1416                          errbuf);
1417                 goto out_child;
1418         }
1419
1420         /*
1421          * Normally perf_session__new would do this, but it doesn't have the
1422          * evlist.
1423          */
1424         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1425                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1426                 rec->tool.ordered_events = false;
1427         }
1428
1429         if (!rec->evlist->nr_groups)
1430                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1431
1432         if (data->is_pipe) {
1433                 err = perf_header__write_pipe(fd);
1434                 if (err < 0)
1435                         goto out_child;
1436         } else {
1437                 err = perf_session__write_header(session, rec->evlist, fd, false);
1438                 if (err < 0)
1439                         goto out_child;
1440         }
1441
1442         if (!rec->no_buildid
1443             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1444                 pr_err("Couldn't generate buildids. "
1445                        "Use --no-buildid to profile anyway.\n");
1446                 err = -1;
1447                 goto out_child;
1448         }
1449
1450         if (!opts->no_bpf_event)
1451                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1452
1453         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1454                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1455                 opts->no_bpf_event = true;
1456         }
1457
1458         err = record__synthesize(rec, false);
1459         if (err < 0)
1460                 goto out_child;
1461
1462         if (rec->realtime_prio) {
1463                 struct sched_param param;
1464
1465                 param.sched_priority = rec->realtime_prio;
1466                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1467                         pr_err("Could not set realtime priority.\n");
1468                         err = -1;
1469                         goto out_child;
1470                 }
1471         }
1472
1473         /*
1474          * When perf is starting the traced process, all the events
1475          * (apart from group members) have enable_on_exec=1 set,
1476          * so don't spoil it by prematurely enabling them.
1477          */
1478         if (!target__none(&opts->target) && !opts->initial_delay)
1479                 evlist__enable(rec->evlist);
1480
1481         /*
1482          * Let the child rip
1483          */
1484         if (forks) {
1485                 struct machine *machine = &session->machines.host;
1486                 union perf_event *event;
1487                 pid_t tgid;
1488
1489                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1490                 if (event == NULL) {
1491                         err = -ENOMEM;
1492                         goto out_child;
1493                 }
1494
1495                 /*
1496                  * Some H/W events are generated before the COMM event,
1497                  * which is emitted during exec(), so perf script
1498                  * cannot see a correct process name for those events.
1499                  * Synthesize a COMM event to prevent that.
1500                  */
1501                 tgid = perf_event__synthesize_comm(tool, event,
1502                                                    rec->evlist->workload.pid,
1503                                                    process_synthesized_event,
1504                                                    machine);
1505                 free(event);
1506
1507                 if (tgid == -1)
1508                         goto out_child;
1509
1510                 event = malloc(sizeof(event->namespaces) +
1511                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1512                                machine->id_hdr_size);
1513                 if (event == NULL) {
1514                         err = -ENOMEM;
1515                         goto out_child;
1516                 }
1517
1518                 /*
1519                  * Synthesize NAMESPACES event for the command specified.
1520                  */
1521                 perf_event__synthesize_namespaces(tool, event,
1522                                                   rec->evlist->workload.pid,
1523                                                   tgid, process_synthesized_event,
1524                                                   machine);
1525                 free(event);
1526
1527                 perf_evlist__start_workload(rec->evlist);
1528         }
1529
1530         if (opts->initial_delay) {
1531                 usleep(opts->initial_delay * USEC_PER_MSEC);
1532                 evlist__enable(rec->evlist);
1533         }
1534
1535         trigger_ready(&auxtrace_snapshot_trigger);
1536         trigger_ready(&switch_output_trigger);
1537         perf_hooks__invoke_record_start();
1538         for (;;) {
1539                 unsigned long long hits = rec->samples;
1540
1541                 /*
1542                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1543                  * here: when done == true and hits != rec->samples
1544                  * in the previous round.
1545                  *
1546                  * perf_evlist__toggle_bkw_mmap() ensures we never
1547                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1548                  */
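
                /*
                 * As a rough sketch (the evlist code holds the authoritative
                 * state machine), the overwritable mmaps cycle through:
                 *
                 *   BKW_MMAP_RUNNING      - events flow into the overwrite buffers
                 *   BKW_MMAP_DATA_PENDING - set just below when done/draining or the
                 *                           switch-output trigger is hit, so that
                 *                           record__mmap_read_all() drains them
                 *   BKW_MMAP_EMPTY        - after the data has been collected;
                 *                           toggled back to BKW_MMAP_RUNNING in
                 *                           the switch-output branch further down
                 */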
1549                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1550                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1551
1552                 if (record__mmap_read_all(rec, false) < 0) {
1553                         trigger_error(&auxtrace_snapshot_trigger);
1554                         trigger_error(&switch_output_trigger);
1555                         err = -1;
1556                         goto out_child;
1557                 }
1558
1559                 if (auxtrace_record__snapshot_started) {
1560                         auxtrace_record__snapshot_started = 0;
1561                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1562                                 record__read_auxtrace_snapshot(rec, false);
1563                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1564                                 pr_err("AUX area tracing snapshot failed\n");
1565                                 err = -1;
1566                                 goto out_child;
1567                         }
1568                 }
1569
1570                 if (trigger_is_hit(&switch_output_trigger)) {
1571                         /*
1572                          * If switch_output_trigger is hit, the data in the
1573                          * overwritable ring buffer should have been collected,
1574                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1575                          *
1576                          * If SIGUSR2 is raised after or during record__mmap_read_all(),
1577                          * record__mmap_read_all() didn't collect data from the
1578                          * overwritable ring buffer. Read again.
1579                          */
1580                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1581                                 continue;
1582                         trigger_ready(&switch_output_trigger);
1583
1584                         /*
1585                          * Reenable events in overwrite ring buffer after
1586                          * record__mmap_read_all(): we should have collected
1587                          * data from it.
1588                          */
1589                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1590
1591                         if (!quiet)
1592                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1593                                         waking);
1594                         waking = 0;
1595                         fd = record__switch_output(rec, false);
1596                         if (fd < 0) {
1597                                 pr_err("Failed to switch to new file\n");
1598                                 trigger_error(&switch_output_trigger);
1599                                 err = fd;
1600                                 goto out_child;
1601                         }
1602
1603                         /* re-arm the alarm */
1604                         if (rec->switch_output.time)
1605                                 alarm(rec->switch_output.time);
1606                 }
1607
1608                 if (hits == rec->samples) {
1609                         if (done || draining)
1610                                 break;
1611                         err = perf_evlist__poll(rec->evlist, -1);
1612                         /*
1613                          * Propagate the error only if there is one. Ignore a positive
1614                          * number of returned events and the EINTR error.
1615                          */
1616                         if (err > 0 || (err < 0 && errno == EINTR))
1617                                 err = 0;
1618                         waking++;
1619
1620                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1621                                 draining = true;
1622                 }
1623
1624                 /*
1625                  * When perf started the traced process itself, the events
1626                  * die with the process at the end and we wait for that,
1627                  * so there is no need to disable them in this case.
1628                  */
1629                 if (done && !disabled && !target__none(&opts->target)) {
1630                         trigger_off(&auxtrace_snapshot_trigger);
1631                         evlist__disable(rec->evlist);
1632                         disabled = true;
1633                 }
1634         }
1635
1636         trigger_off(&auxtrace_snapshot_trigger);
1637         trigger_off(&switch_output_trigger);
1638
1639         if (opts->auxtrace_snapshot_on_exit)
1640                 record__auxtrace_snapshot_exit(rec);
1641
1642         if (forks && workload_exec_errno) {
1643                 char msg[STRERR_BUFSIZE];
1644                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1645                 pr_err("Workload failed: %s\n", emsg);
1646                 err = -1;
1647                 goto out_child;
1648         }
1649
1650         if (!quiet)
1651                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1652
1653         if (target__none(&rec->opts.target))
1654                 record__synthesize_workload(rec, true);
1655
1656 out_child:
1657         record__mmap_read_all(rec, true);
1658         record__aio_mmap_read_sync(rec);
1659
1660         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1661                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1662                 session->header.env.comp_ratio = ratio + 0.5;
1663         }
1664
1665         if (forks) {
1666                 int exit_status;
1667
1668                 if (!child_finished)
1669                         kill(rec->evlist->workload.pid, SIGTERM);
1670
1671                 wait(&exit_status);
1672
1673                 if (err < 0)
1674                         status = err;
1675                 else if (WIFEXITED(exit_status))
1676                         status = WEXITSTATUS(exit_status);
1677                 else if (WIFSIGNALED(exit_status))
1678                         signr = WTERMSIG(exit_status);
1679         } else
1680                 status = err;
1681
1682         record__synthesize(rec, true);
1683         /* this will be recalculated during process_buildids() */
1684         rec->samples = 0;
1685
1686         if (!err) {
1687                 if (!rec->timestamp_filename) {
1688                         record__finish_output(rec);
1689                 } else {
1690                         fd = record__switch_output(rec, true);
1691                         if (fd < 0) {
1692                                 status = fd;
1693                                 goto out_delete_session;
1694                         }
1695                 }
1696         }
1697
1698         perf_hooks__invoke_record_end();
1699
1700         if (!err && !quiet) {
1701                 char samples[128];
1702                 const char *postfix = rec->timestamp_filename ?
1703                                         ".<timestamp>" : "";
1704
1705                 if (rec->samples && !rec->opts.full_auxtrace)
1706                         scnprintf(samples, sizeof(samples),
1707                                   " (%" PRIu64 " samples)", rec->samples);
1708                 else
1709                         samples[0] = '\0';
1710
1711                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1712                         perf_data__size(data) / 1024.0 / 1024.0,
1713                         data->path, postfix, samples);
1714                 if (ratio) {
1715                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1716                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1717                                         ratio);
1718                 }
1719                 fprintf(stderr, " ]\n");
1720         }
1721
1722 out_delete_session:
1723         zstd_fini(&session->zstd_data);
1724         perf_session__delete(session);
1725
1726         if (!opts->no_bpf_event)
1727                 perf_evlist__stop_sb_thread(sb_evlist);
1728         return status;
1729 }
1730
1731 static void callchain_debug(struct callchain_param *callchain)
1732 {
1733         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1734
1735         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1736
1737         if (callchain->record_mode == CALLCHAIN_DWARF)
1738                 pr_debug("callchain: stack dump size %d\n",
1739                          callchain->dump_size);
1740 }
1741
1742 int record_opts__parse_callchain(struct record_opts *record,
1743                                  struct callchain_param *callchain,
1744                                  const char *arg, bool unset)
1745 {
1746         int ret;
1747         callchain->enabled = !unset;
1748
1749         /* --no-call-graph */
1750         if (unset) {
1751                 callchain->record_mode = CALLCHAIN_NONE;
1752                 pr_debug("callchain: disabled\n");
1753                 return 0;
1754         }
1755
1756         ret = parse_callchain_record_opt(arg, callchain);
1757         if (!ret) {
1758                 /* Enable data address sampling for DWARF unwind. */
1759                 if (callchain->record_mode == CALLCHAIN_DWARF)
1760                         record->sample_address = true;
1761                 callchain_debug(callchain);
1762         }
1763
1764         return ret;
1765 }
1766
1767 int record_parse_callchain_opt(const struct option *opt,
1768                                const char *arg,
1769                                int unset)
1770 {
1771         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1772 }
1773
1774 int record_callchain_opt(const struct option *opt,
1775                          const char *arg __maybe_unused,
1776                          int unset __maybe_unused)
1777 {
1778         struct callchain_param *callchain = opt->value;
1779
1780         callchain->enabled = true;
1781
1782         if (callchain->record_mode == CALLCHAIN_NONE)
1783                 callchain->record_mode = CALLCHAIN_FP;
1784
1785         callchain_debug(callchain);
1786         return 0;
1787 }
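
/*
 * Illustrative uses of the two option callbacks above ('-g' and
 * '--call-graph record_mode[,record_size]'; see perf-record(1) for the
 * authoritative syntax, and note the sizes here are made-up examples):
 *
 *   perf record -g ./workload                        # frame pointer (FP) callchains
 *   perf record --call-graph dwarf,8192 ./workload   # DWARF unwind, 8192-byte dumps
 *   perf record --call-graph lbr ./workload          # LBR-assisted callchains
 *
 * DWARF mode also turns on sample_address, as done in
 * record_opts__parse_callchain() above.
 */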
1788
1789 static int perf_record_config(const char *var, const char *value, void *cb)
1790 {
1791         struct record *rec = cb;
1792
1793         if (!strcmp(var, "record.build-id")) {
1794                 if (!strcmp(value, "cache"))
1795                         rec->no_buildid_cache = false;
1796                 else if (!strcmp(value, "no-cache"))
1797                         rec->no_buildid_cache = true;
1798                 else if (!strcmp(value, "skip"))
1799                         rec->no_buildid = true;
1800                 else
1801                         return -1;
1802                 return 0;
1803         }
1804         if (!strcmp(var, "record.call-graph")) {
1805                 var = "call-graph.record-mode";
1806                 return perf_default_config(var, value, cb);
1807         }
1808 #ifdef HAVE_AIO_SUPPORT
1809         if (!strcmp(var, "record.aio")) {
1810                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1811                 if (!rec->opts.nr_cblocks)
1812                         rec->opts.nr_cblocks = nr_cblocks_default;
1813         }
1814 #endif
1815
1816         return 0;
1817 }
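
/*
 * For illustration, the config variables handled above correspond to an
 * INI-style perfconfig file along these lines (location and exact syntax
 * per perf-config(1); the values are made-up examples):
 *
 *   [record]
 *           build-id = no-cache     ; or "cache" / "skip"
 *           call-graph = dwarf      ; forwarded as call-graph.record-mode
 *           aio = 2                 ; only parsed with HAVE_AIO_SUPPORT
 */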
1818
1819 struct clockid_map {
1820         const char *name;
1821         int clockid;
1822 };
1823
1824 #define CLOCKID_MAP(n, c)       \
1825         { .name = n, .clockid = (c), }
1826
1827 #define CLOCKID_END     { .name = NULL, }
1828
1829
1830 /*
1831  * Add the missing ones; we need to build on many distros...
1832  */
1833 #ifndef CLOCK_MONOTONIC_RAW
1834 #define CLOCK_MONOTONIC_RAW 4
1835 #endif
1836 #ifndef CLOCK_BOOTTIME
1837 #define CLOCK_BOOTTIME 7
1838 #endif
1839 #ifndef CLOCK_TAI
1840 #define CLOCK_TAI 11
1841 #endif
1842
1843 static const struct clockid_map clockids[] = {
1844         /* available for all events, NMI safe */
1845         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1846         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1847
1848         /* available for some events */
1849         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1850         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1851         CLOCKID_MAP("tai", CLOCK_TAI),
1852
1853         /* available for the lazy */
1854         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1855         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1856         CLOCKID_MAP("real", CLOCK_REALTIME),
1857         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1858
1859         CLOCKID_END,
1860 };
1861
1862 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1863 {
1864         struct timespec res;
1865
1866         *res_ns = 0;
1867         if (!clock_getres(clk_id, &res))
1868                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1869         else
1870                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1871
1872         return 0;
1873 }
1874
1875 static int parse_clockid(const struct option *opt, const char *str, int unset)
1876 {
1877         struct record_opts *opts = (struct record_opts *)opt->value;
1878         const struct clockid_map *cm;
1879         const char *ostr = str;
1880
1881         if (unset) {
1882                 opts->use_clockid = 0;
1883                 return 0;
1884         }
1885
1886         /* no arg passed */
1887         if (!str)
1888                 return 0;
1889
1890         /* no setting it twice */
1891         if (opts->use_clockid)
1892                 return -1;
1893
1894         opts->use_clockid = true;
1895
1896         /* if it's a number, we're done */
1897         if (sscanf(str, "%d", &opts->clockid) == 1)
1898                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1899
1900         /* allow a "CLOCK_" prefix to the name */
1901         if (!strncasecmp(str, "CLOCK_", 6))
1902                 str += 6;
1903
1904         for (cm = clockids; cm->name; cm++) {
1905                 if (!strcasecmp(str, cm->name)) {
1906                         opts->clockid = cm->clockid;
1907                         return get_clockid_res(opts->clockid,
1908                                                &opts->clockid_res_ns);
1909                 }
1910         }
1911
1912         opts->use_clockid = false;
1913         ui__warning("unknown clockid %s, check man page\n", ostr);
1914         return -1;
1915 }
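
/*
 * Example arguments accepted by parse_clockid() above, via the -k/--clockid
 * option declared below (illustrative; see clock_gettime(2) for the ids):
 *
 *   perf record -k monotonic ...            # a name from the clockids[] table
 *   perf record -k CLOCK_MONOTONIC_RAW ...  # the "CLOCK_" prefix is stripped
 *   perf record -k 1 ...                    # a raw numeric clockid also works
 */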
1916
1917 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1918 {
1919         struct record_opts *opts = (struct record_opts *)opt->value;
1920
1921         if (unset || !str)
1922                 return 0;
1923
1924         if (!strcasecmp(str, "node"))
1925                 opts->affinity = PERF_AFFINITY_NODE;
1926         else if (!strcasecmp(str, "cpu"))
1927                 opts->affinity = PERF_AFFINITY_CPU;
1928
1929         return 0;
1930 }
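
/*
 * Illustrative uses of the --affinity option parsed above (the default is
 * PERF_AFFINITY_SYS):
 *
 *   perf record --affinity=node ...   # bind to the NUMA node of the mmap buffer
 *   perf record --affinity=cpu ...    # bind to the cpu of the mmap buffer
 */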
1931
1932 static int record__parse_mmap_pages(const struct option *opt,
1933                                     const char *str,
1934                                     int unset __maybe_unused)
1935 {
1936         struct record_opts *opts = opt->value;
1937         char *s, *p;
1938         unsigned int mmap_pages;
1939         int ret;
1940
1941         if (!str)
1942                 return -EINVAL;
1943
1944         s = strdup(str);
1945         if (!s)
1946                 return -ENOMEM;
1947
1948         p = strchr(s, ',');
1949         if (p)
1950                 *p = '\0';
1951
1952         if (*s) {
1953                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1954                 if (ret)
1955                         goto out_free;
1956                 opts->mmap_pages = mmap_pages;
1957         }
1958
1959         if (!p) {
1960                 ret = 0;
1961                 goto out_free;
1962         }
1963
1964         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1965         if (ret)
1966                 goto out_free;
1967
1968         opts->auxtrace_mmap_pages = mmap_pages;
1969
1970 out_free:
1971         free(s);
1972         return ret;
1973 }
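
/*
 * Illustrative -m/--mmap-pages arguments handled above; the value before the
 * comma sizes the data mmaps, the one after it the AUX area mmaps (both go
 * through __perf_evlist__parse_mmap_pages(), which may also accept size
 * suffixes - see perf-record(1)):
 *
 *   perf record -m 512 ...       # 512 data pages
 *   perf record -m 512,128 ...   # 512 data pages, 128 AUX area pages
 *   perf record -m ,64 ...       # default data pages, 64 AUX area pages
 */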
1974
1975 static void switch_output_size_warn(struct record *rec)
1976 {
1977         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1978         struct switch_output *s = &rec->switch_output;
1979
1980         wakeup_size /= 2;
1981
1982         if (s->size < wakeup_size) {
1983                 char buf[100];
1984
1985                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1986                 pr_warning("WARNING: switch-output data size is lower than the "
1987                            "wakeup kernel buffer size (%s); "
1988                            "expect bigger perf.data sizes\n", buf);
1989         }
1990 }
1991
1992 static int switch_output_setup(struct record *rec)
1993 {
1994         struct switch_output *s = &rec->switch_output;
1995         static struct parse_tag tags_size[] = {
1996                 { .tag  = 'B', .mult = 1       },
1997                 { .tag  = 'K', .mult = 1 << 10 },
1998                 { .tag  = 'M', .mult = 1 << 20 },
1999                 { .tag  = 'G', .mult = 1 << 30 },
2000                 { .tag  = 0 },
2001         };
2002         static struct parse_tag tags_time[] = {
2003                 { .tag  = 's', .mult = 1        },
2004                 { .tag  = 'm', .mult = 60       },
2005                 { .tag  = 'h', .mult = 60*60    },
2006                 { .tag  = 'd', .mult = 60*60*24 },
2007                 { .tag  = 0 },
2008         };
2009         unsigned long val;
2010
2011         if (!s->set)
2012                 return 0;
2013
2014         if (!strcmp(s->str, "signal")) {
2015                 s->signal = true;
2016                 pr_debug("switch-output with SIGUSR2 signal\n");
2017                 goto enabled;
2018         }
2019
2020         val = parse_tag_value(s->str, tags_size);
2021         if (val != (unsigned long) -1) {
2022                 s->size = val;
2023                 pr_debug("switch-output with %s size threshold\n", s->str);
2024                 goto enabled;
2025         }
2026
2027         val = parse_tag_value(s->str, tags_time);
2028         if (val != (unsigned long) -1) {
2029                 s->time = val;
2030                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2031                          s->str, s->time);
2032                 goto enabled;
2033         }
2034
2035         return -1;
2036
2037 enabled:
2038         rec->timestamp_filename = true;
2039         s->enabled              = true;
2040
2041         if (s->size && !rec->opts.no_buffering)
2042                 switch_output_size_warn(rec);
2043
2044         return 0;
2045 }
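
/*
 * Illustrative --switch-output arguments matching the tag tables above
 * (signal, size[BKMG] or time[smhd], as the option help below also says;
 * the concrete values are made-up examples):
 *
 *   perf record --switch-output=signal ...   # rotate perf.data on SIGUSR2
 *   perf record --switch-output=100M ...     # rotate after ~100MB of data
 *   perf record --switch-output=30s ...      # rotate every 30 seconds
 *
 * Each of these also sets timestamp_filename, so every output file gets a
 * timestamp suffix.
 */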
2046
2047 static const char * const __record_usage[] = {
2048         "perf record [<options>] [<command>]",
2049         "perf record [<options>] -- <command> [<options>]",
2050         NULL
2051 };
2052 const char * const *record_usage = __record_usage;
2053
2054 /*
2055  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2056  * because we need access to it in record__exit, which is called after
2057  * cmd_record() exits; but since record_options needs to be accessible to
2058  * builtin-script, leave it here.
2059  *
2060  * At least we don't touch it in all the other functions here directly.
2061  *
2062  * Just say no to tons of global variables, sigh.
2063  */
2064 static struct record record = {
2065         .opts = {
2066                 .sample_time         = true,
2067                 .mmap_pages          = UINT_MAX,
2068                 .user_freq           = UINT_MAX,
2069                 .user_interval       = ULLONG_MAX,
2070                 .freq                = 4000,
2071                 .target              = {
2072                         .uses_mmap   = true,
2073                         .default_per_cpu = true,
2074                 },
2075                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2076         },
2077         .tool = {
2078                 .sample         = process_sample_event,
2079                 .fork           = perf_event__process_fork,
2080                 .exit           = perf_event__process_exit,
2081                 .comm           = perf_event__process_comm,
2082                 .namespaces     = perf_event__process_namespaces,
2083                 .mmap           = perf_event__process_mmap,
2084                 .mmap2          = perf_event__process_mmap2,
2085                 .ordered_events = true,
2086         },
2087 };
2088
2089 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2090         "\n\t\t\t\tDefault: fp";
2091
2092 static bool dry_run;
2093
2094 /*
2095  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2096  * with it and switch to using the library functions in perf_evlist that came
2097  * from builtin-record.c, i.e. use record_opts,
2098  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2099  * using pipes, etc.
2100  */
2101 static struct option __record_options[] = {
2102         OPT_CALLBACK('e', "event", &record.evlist, "event",
2103                      "event selector. use 'perf list' to list available events",
2104                      parse_events_option),
2105         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2106                      "event filter", parse_filter),
2107         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2108                            NULL, "don't record events from perf itself",
2109                            exclude_perf),
2110         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2111                     "record events on existing process id"),
2112         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2113                     "record events on existing thread id"),
2114         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2115                     "collect data with this RT SCHED_FIFO priority"),
2116         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2117                     "collect data without buffering"),
2118         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2119                     "collect raw sample records from all opened counters"),
2120         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2121                             "system-wide collection from all CPUs"),
2122         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2123                     "list of cpus to monitor"),
2124         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2125         OPT_STRING('o', "output", &record.data.path, "file",
2126                     "output file name"),
2127         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2128                         &record.opts.no_inherit_set,
2129                         "child tasks do not inherit counters"),
2130         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2131                     "synthesize non-sample events at the end of output"),
2132         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2133         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2134         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2135                     "Fail if the specified frequency can't be used"),
2136         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2137                      "profile at this frequency",
2138                       record__parse_freq),
2139         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2140                      "number of mmap data pages and AUX area tracing mmap pages",
2141                      record__parse_mmap_pages),
2142         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2143                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2144                      record__mmap_flush_parse),
2145         OPT_BOOLEAN(0, "group", &record.opts.group,
2146                     "put the counters into a counter group"),
2147         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2148                            NULL, "enables call-graph recording" ,
2149                            &record_callchain_opt),
2150         OPT_CALLBACK(0, "call-graph", &record.opts,
2151                      "record_mode[,record_size]", record_callchain_help,
2152                      &record_parse_callchain_opt),
2153         OPT_INCR('v', "verbose", &verbose,
2154                     "be more verbose (show counter open errors, etc)"),
2155         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2156         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2157                     "per thread counts"),
2158         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2159         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2160                     "Record the sample physical addresses"),
2161         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2162         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2163                         &record.opts.sample_time_set,
2164                         "Record the sample timestamps"),
2165         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2166                         "Record the sample period"),
2167         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2168                     "don't sample"),
2169         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2170                         &record.no_buildid_cache_set,
2171                         "do not update the buildid cache"),
2172         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2173                         &record.no_buildid_set,
2174                         "do not collect buildids in perf.data"),
2175         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2176                      "monitor event in cgroup name only",
2177                      parse_cgroups),
2178         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2179                   "ms to wait before starting measurement after program start"),
2180         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2181                    "user to profile"),
2182
2183         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2184                      "branch any", "sample any taken branches",
2185                      parse_branch_stack),
2186
2187         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2188                      "branch filter mask", "branch stack filter modes",
2189                      parse_branch_stack),
2190         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2191                     "sample by weight (on special events only)"),
2192         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2193                     "sample transaction flags (special events only)"),
2194         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2195                     "use per-thread mmaps"),
2196         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2197                     "sample selected machine registers on interrupt,"
2198                     " use '-I?' to list register names", parse_intr_regs),
2199         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2200                     "sample selected machine registers on interrupt,"
2201                     " use '--user-regs=?' to list register names", parse_user_regs),
2202         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2203                     "Record running/enabled time of read (:S) events"),
2204         OPT_CALLBACK('k', "clockid", &record.opts,
2205         "clockid", "clockid to use for events, see clock_gettime()",
2206         parse_clockid),
2207         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2208                           "opts", "AUX area tracing Snapshot Mode", ""),
2209         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2210                         "per thread proc mmap processing timeout in ms"),
2211         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2212                     "Record namespaces events"),
2213         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2214                     "Record context switch events"),
2215         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2216                          "Configure all used events to run in kernel space.",
2217                          PARSE_OPT_EXCLUSIVE),
2218         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2219                          "Configure all used events to run in user space.",
2220                          PARSE_OPT_EXCLUSIVE),
2221         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2222                     "collect kernel callchains"),
2223         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2224                     "collect user callchains"),
2225         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2226                    "clang binary to use for compiling BPF scriptlets"),
2227         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2228                    "options passed to clang when compiling BPF scriptlets"),
2229         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2230                    "file", "vmlinux pathname"),
2231         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2232                     "Record build-id of all DSOs regardless of hits"),
2233         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2234                     "append timestamp to output filename"),
2235         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2236                     "Record timestamp boundary (time of first/last samples)"),
2237         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2238                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2239                           "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
2240                           "signal"),
2241         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2242                    "Limit number of switch output generated files"),
2243         OPT_BOOLEAN(0, "dry-run", &dry_run,
2244                     "Parse options then exit"),
2245 #ifdef HAVE_AIO_SUPPORT
2246         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2247                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2248                      record__aio_parse),
2249 #endif
2250         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2251                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2252                      record__parse_affinity),
2253 #ifdef HAVE_ZSTD_SUPPORT
2254         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2255                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2256                             record__parse_comp_level),
2257 #endif
2258         OPT_END()
2259 };
2260
2261 struct option *record_options = __record_options;
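
/*
 * A few representative invocations wired up by the option table above
 * (illustrative only; perf-record(1) is the reference):
 *
 *   perf record ./workload                  # profile a workload at the default freq
 *   perf record -a -e cycles sleep 10       # system-wide cycles for 10 seconds
 *   perf record -p 1234 -F 999 -g           # attach to pid 1234, 999 Hz, callchains
 */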
2262
2263 int cmd_record(int argc, const char **argv)
2264 {
2265         int err;
2266         struct record *rec = &record;
2267         char errbuf[BUFSIZ];
2268
2269         setlocale(LC_ALL, "");
2270
2271 #ifndef HAVE_LIBBPF_SUPPORT
2272 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2273         set_nobuild('\0', "clang-path", true);
2274         set_nobuild('\0', "clang-opt", true);
2275 # undef set_nobuild
2276 #endif
2277
2278 #ifndef HAVE_BPF_PROLOGUE
2279 # if !defined (HAVE_DWARF_SUPPORT)
2280 #  define REASON  "NO_DWARF=1"
2281 # elif !defined (HAVE_LIBBPF_SUPPORT)
2282 #  define REASON  "NO_LIBBPF=1"
2283 # else
2284 #  define REASON  "this architecture doesn't support BPF prologue"
2285 # endif
2286 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2287         set_nobuild('\0', "vmlinux", true);
2288 # undef set_nobuild
2289 # undef REASON
2290 #endif
2291
2292         CPU_ZERO(&rec->affinity_mask);
2293         rec->opts.affinity = PERF_AFFINITY_SYS;
2294
2295         rec->evlist = evlist__new();
2296         if (rec->evlist == NULL)
2297                 return -ENOMEM;
2298
2299         err = perf_config(perf_record_config, rec);
2300         if (err)
2301                 return err;
2302
2303         argc = parse_options(argc, argv, record_options, record_usage,
2304                             PARSE_OPT_STOP_AT_NON_OPTION);
2305         if (quiet)
2306                 perf_quiet_option();
2307
2308         /* Make system wide (-a) the default target. */
2309         if (!argc && target__none(&rec->opts.target))
2310                 rec->opts.target.system_wide = true;
2311
2312         if (nr_cgroups && !rec->opts.target.system_wide) {
2313                 usage_with_options_msg(record_usage, record_options,
2314                         "cgroup monitoring only available in system-wide mode");
2315
2316         }
2317
2318         if (rec->opts.comp_level != 0) {
2319                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2320                 rec->no_buildid = true;
2321         }
2322
2323         if (rec->opts.record_switch_events &&
2324             !perf_can_record_switch_events()) {
2325                 ui__error("kernel does not support recording context switch events\n");
2326                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2327                 return -EINVAL;
2328         }
2329
2330         if (switch_output_setup(rec)) {
2331                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2332                 return -EINVAL;
2333         }
2334
2335         if (rec->switch_output.time) {
2336                 signal(SIGALRM, alarm_sig_handler);
2337                 alarm(rec->switch_output.time);
2338         }
2339
2340         if (rec->switch_output.num_files) {
2341                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2342                                                       sizeof(char *));
2343                 if (!rec->switch_output.filenames)
2344                         return -EINVAL;
2345         }
2346
2347         /*
2348          * Allow aliases to facilitate the lookup of symbols for address
2349          * filters. Refer to auxtrace_parse_filters().
2350          */
2351         symbol_conf.allow_aliases = true;
2352
2353         symbol__init(NULL);
2354
2355         err = record__auxtrace_init(rec);
2356         if (err)
2357                 goto out;
2358
2359         if (dry_run)
2360                 goto out;
2361
2362         err = bpf__setup_stdout(rec->evlist);
2363         if (err) {
2364                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2365                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2366                          errbuf);
2367                 goto out;
2368         }
2369
2370         err = -ENOMEM;
2371
2372         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2373                 pr_warning(
2374 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2375 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
2376 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2377 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2378 "Samples in kernel modules won't be resolved at all.\n\n"
2379 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2380 "even with a suitable vmlinux or kallsyms file.\n\n");
2381
2382         if (rec->no_buildid_cache || rec->no_buildid) {
2383                 disable_buildid_cache();
2384         } else if (rec->switch_output.enabled) {
2385                 /*
2386                  * In 'perf record --switch-output', disable buildid
2387                  * generation by default to reduce data file switching
2388                  * overhead. Still generate buildids if they are
2389                  * explicitly required using
2390                  *
2391                  *  perf record --switch-output --no-no-buildid \
2392                  *              --no-no-buildid-cache
2393                  *
2394                  * Following code equals to:
2395                  *
2396                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2397                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2398                  *         disable_buildid_cache();
2399                  */
2400                 bool disable = true;
2401
2402                 if (rec->no_buildid_set && !rec->no_buildid)
2403                         disable = false;
2404                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2405                         disable = false;
2406                 if (disable) {
2407                         rec->no_buildid = true;
2408                         rec->no_buildid_cache = true;
2409                         disable_buildid_cache();
2410                 }
2411         }
2412
2413         if (record.opts.overwrite)
2414                 record.opts.tail_synthesize = true;
2415
2416         if (rec->evlist->core.nr_entries == 0 &&
2417             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2418                 pr_err("Not enough memory for event selector list\n");
2419                 goto out;
2420         }
2421
2422         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2423                 rec->opts.no_inherit = true;
2424
2425         err = target__validate(&rec->opts.target);
2426         if (err) {
2427                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2428                 ui__warning("%s\n", errbuf);
2429         }
2430
2431         err = target__parse_uid(&rec->opts.target);
2432         if (err) {
2433                 int saved_errno = errno;
2434
2435                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2436                 ui__error("%s", errbuf);
2437
2438                 err = -saved_errno;
2439                 goto out;
2440         }
2441
2442         /* Enable ignoring missing threads when -u/-p option is defined. */
2443         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2444
2445         err = -ENOMEM;
2446         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2447                 usage_with_options(record_usage, record_options);
2448
2449         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2450         if (err)
2451                 goto out;
2452
2453         /*
2454          * We take all buildids when the file contains
2455          * AUX area tracing data because we do not decode the
2456          * trace, since that would take too long.
2457          */
2458         if (rec->opts.full_auxtrace)
2459                 rec->buildid_all = true;
2460
2461         if (record_opts__config(&rec->opts)) {
2462                 err = -EINVAL;
2463                 goto out;
2464         }
2465
2466         if (rec->opts.nr_cblocks > nr_cblocks_max)
2467                 rec->opts.nr_cblocks = nr_cblocks_max;
2468         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2469
2470         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2471         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2472
2473         if (rec->opts.comp_level > comp_level_max)
2474                 rec->opts.comp_level = comp_level_max;
2475         pr_debug("comp level: %d\n", rec->opts.comp_level);
2476
2477         err = __cmd_record(&record, argc, argv);
2478 out:
2479         evlist__delete(rec->evlist);
2480         symbol__exit();
2481         auxtrace_record__free(rec->itr);
2482         return err;
2483 }
2484
2485 static void snapshot_sig_handler(int sig __maybe_unused)
2486 {
2487         struct record *rec = &record;
2488
2489         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2490                 trigger_hit(&auxtrace_snapshot_trigger);
2491                 auxtrace_record__snapshot_started = 1;
2492                 if (auxtrace_record__snapshot_start(record.itr))
2493                         trigger_error(&auxtrace_snapshot_trigger);
2494         }
2495
2496         if (switch_output_signal(rec))
2497                 trigger_hit(&switch_output_trigger);
2498 }
2499
2500 static void alarm_sig_handler(int sig __maybe_unused)
2501 {
2502         struct record *rec = &record;
2503
2504         if (switch_output_time(rec))
2505                 trigger_hit(&switch_output_trigger);
2506 }