1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <sys/types.h>
59 #include <sys/stat.h>
60 #include <fcntl.h>
61 #include <linux/err.h>
62 #include <linux/string.h>
63 #include <linux/time64.h>
64 #include <linux/zalloc.h>
65 #include <linux/bitmap.h>
66
67 struct switch_output {
68         bool             enabled;
69         bool             signal;
70         unsigned long    size;
71         unsigned long    time;
72         const char      *str;
73         bool             set;
74         char             **filenames;
75         int              num_files;
76         int              cur_file;
77 };
78
79 struct record {
80         struct perf_tool        tool;
81         struct record_opts      opts;
82         u64                     bytes_written;
83         struct perf_data        data;
84         struct auxtrace_record  *itr;
85         struct evlist   *evlist;
86         struct perf_session     *session;
87         int                     realtime_prio;
88         bool                    no_buildid;
89         bool                    no_buildid_set;
90         bool                    no_buildid_cache;
91         bool                    no_buildid_cache_set;
92         bool                    buildid_all;
93         bool                    timestamp_filename;
94         bool                    timestamp_boundary;
95         struct switch_output    switch_output;
96         unsigned long long      samples;
97         struct mmap_cpu_mask    affinity_mask;
98         unsigned long           output_max_size;        /* = 0: unlimited */
99 };
100
101 static volatile int done;
102
103 static volatile int auxtrace_record__snapshot_started;
104 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
105 static DEFINE_TRIGGER(switch_output_trigger);
106
107 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
108         "SYS", "NODE", "CPU"
109 };
110
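/*
 * Predicates deciding when the perf.data output should be rotated:
 * on a SIGUSR2 signal, once the written size crosses the configured
 * threshold, or on a timer, as set up via --switch-output.
 */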
111 static bool switch_output_signal(struct record *rec)
112 {
113         return rec->switch_output.signal &&
114                trigger_is_ready(&switch_output_trigger);
115 }
116
117 static bool switch_output_size(struct record *rec)
118 {
119         return rec->switch_output.size &&
120                trigger_is_ready(&switch_output_trigger) &&
121                (rec->bytes_written >= rec->switch_output.size);
122 }
123
124 static bool switch_output_time(struct record *rec)
125 {
126         return rec->switch_output.time &&
127                trigger_is_ready(&switch_output_trigger);
128 }
129
130 static bool record__output_max_size_exceeded(struct record *rec)
131 {
132         return rec->output_max_size &&
133                (rec->bytes_written >= rec->output_max_size);
134 }
135
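/*
 * Append @size bytes at @bf to the perf.data file, accounting them in
 * rec->bytes_written, stopping the session once the configured output
 * size limit is exceeded and firing the switch-output trigger when the
 * size-based rotation threshold is reached.
 */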
136 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
137                          void *bf, size_t size)
138 {
139         struct perf_data_file *file = &rec->session->data->file;
140
141         if (perf_data_file__write(file, bf, size) < 0) {
142                 pr_err("failed to write perf data, error: %m\n");
143                 return -1;
144         }
145
146         rec->bytes_written += size;
147
148         if (record__output_max_size_exceeded(rec) && !done) {
149                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
150                                 " stopping session ]\n",
151                                 rec->bytes_written >> 10);
152                 done = 1;
153         }
154
155         if (switch_output_size(rec))
156                 trigger_hit(&switch_output_trigger);
157
158         return 0;
159 }
160
161 static int record__aio_enabled(struct record *rec);
162 static int record__comp_enabled(struct record *rec);
163 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
164                             void *src, size_t src_size);
165
166 #ifdef HAVE_AIO_SUPPORT
167 static int record__aio_write(struct aiocb *cblock, int trace_fd,
168                 void *buf, size_t size, off_t off)
169 {
170         int rc;
171
172         cblock->aio_fildes = trace_fd;
173         cblock->aio_buf    = buf;
174         cblock->aio_nbytes = size;
175         cblock->aio_offset = off;
176         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
177
178         do {
179                 rc = aio_write(cblock);
180                 if (rc == 0) {
181                         break;
182                 } else if (errno != EAGAIN) {
183                         cblock->aio_fildes = -1;
184                         pr_err("failed to queue perf data, error: %m\n");
185                         break;
186                 }
187         } while (1);
188
189         return rc;
190 }
191
192 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
193 {
194         void *rem_buf;
195         off_t rem_off;
196         size_t rem_size;
197         int rc, aio_errno;
198         ssize_t aio_ret, written;
199
200         aio_errno = aio_error(cblock);
201         if (aio_errno == EINPROGRESS)
202                 return 0;
203
204         written = aio_ret = aio_return(cblock);
205         if (aio_ret < 0) {
206                 if (aio_errno != EINTR)
207                         pr_err("failed to write perf data, error: %m\n");
208                 written = 0;
209         }
210
211         rem_size = cblock->aio_nbytes - written;
212
213         if (rem_size == 0) {
214                 cblock->aio_fildes = -1;
215                 /*
216                  * md->refcount is incremented in record__aio_pushfn() for
217                  * every aio write request started in record__aio_push(), so
218                  * decrement it because the request is now complete.
219                  */
220                 perf_mmap__put(&md->core);
221                 rc = 1;
222         } else {
223                 /*
224                  * The aio write request may need to be restarted with the
225                  * remainder if the kernel didn't write the whole
226                  * chunk at once.
227                  */
228                 rem_off = cblock->aio_offset + written;
229                 rem_buf = (void *)(cblock->aio_buf + written);
230                 record__aio_write(cblock, cblock->aio_fildes,
231                                 rem_buf, rem_size, rem_off);
232                 rc = 0;
233         }
234
235         return rc;
236 }
237
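/*
 * Reap completed aio write requests on @md.  With @sync_all, keep
 * suspending until every in-flight request has completed; otherwise
 * return the index of the first free control block as soon as one
 * becomes available.
 */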
238 static int record__aio_sync(struct mmap *md, bool sync_all)
239 {
240         struct aiocb **aiocb = md->aio.aiocb;
241         struct aiocb *cblocks = md->aio.cblocks;
242         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
243         int i, do_suspend;
244
245         do {
246                 do_suspend = 0;
247                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
248                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
249                                 if (sync_all)
250                                         aiocb[i] = NULL;
251                                 else
252                                         return i;
253                         } else {
254                                 /*
255                  * A started aio write is not complete yet,
256                  * so it has to be waited for before the
257                  * next allocation.
258                                  */
259                                 aiocb[i] = &cblocks[i];
260                                 do_suspend = 1;
261                         }
262                 }
263                 if (!do_suspend)
264                         return -1;
265
266                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
267                         if (!(errno == EAGAIN || errno == EINTR))
268                                 pr_err("failed to sync perf data, error: %m\n");
269                 }
270         } while (1);
271 }
272
273 struct record_aio {
274         struct record   *rec;
275         void            *data;
276         size_t          size;
277 };
278
279 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
280 {
281         struct record_aio *aio = to;
282
283         /*
284          * map->core.base data pointed to by buf is copied into a free map->aio.data[]
285          * buffer to release space in the kernel buffer as fast as possible, calling
286          * perf_mmap__consume() from the perf_mmap__push() function.
287          *
288          * That lets the kernel proceed with storing more profiling data into
289          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
290          *
291          * Copying can be done in two steps in case the chunk of profiling data
292          * crosses the upper bound of the kernel buffer. In this case we first move
293          * part of the data from map->start up to the upper bound and then the
294          * remainder from the beginning of the kernel buffer up to the end of the data chunk.
295          */
296
297         if (record__comp_enabled(aio->rec)) {
298                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
299                                      mmap__mmap_len(map) - aio->size,
300                                      buf, size);
301         } else {
302                 memcpy(aio->data + aio->size, buf, size);
303         }
304
305         if (!aio->size) {
306                 /*
307                  * Increment map->refcount to guard the map->aio.data[] buffer
308                  * from premature deallocation, because the map object can be
309                  * released before the aio write request started on the
310                  * map->aio.data[] buffer completes.
311                  *
312                  * perf_mmap__put() is done at record__aio_complete()
313                  * after the started aio request completes, or at record__aio_push()
314                  * if the request failed to start.
315                  */
316                 perf_mmap__get(&map->core);
317         }
318
319         aio->size += size;
320
321         return size;
322 }
323
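/*
 * Drain the data ready in @map into a free aio buffer (compressing it
 * when compression is enabled) and queue an asynchronous write of it at
 * file offset *@off, which is advanced on success.
 */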
324 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
325 {
326         int ret, idx;
327         int trace_fd = rec->session->data->file.fd;
328         struct record_aio aio = { .rec = rec, .size = 0 };
329
330         /*
331          * Call record__aio_sync() to wait until the map->aio.data[] buffer
332          * becomes available after the previous aio write operation.
333          */
334
335         idx = record__aio_sync(map, false);
336         aio.data = map->aio.data[idx];
337         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
338         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
339                 return ret;
340
341         rec->samples++;
342         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
343         if (!ret) {
344                 *off += aio.size;
345                 rec->bytes_written += aio.size;
346                 if (switch_output_size(rec))
347                         trigger_hit(&switch_output_trigger);
348         } else {
349                 /*
350                  * Decrement the map->refcount incremented in record__aio_pushfn()
351                  * if the record__aio_write() operation failed to start; otherwise
352                  * map->refcount is decremented in record__aio_complete() after
353                  * the aio write operation finishes successfully.
354                  */
355                 perf_mmap__put(&map->core);
356         }
357
358         return ret;
359 }
360
361 static off_t record__aio_get_pos(int trace_fd)
362 {
363         return lseek(trace_fd, 0, SEEK_CUR);
364 }
365
366 static void record__aio_set_pos(int trace_fd, off_t pos)
367 {
368         lseek(trace_fd, pos, SEEK_SET);
369 }
370
371 static void record__aio_mmap_read_sync(struct record *rec)
372 {
373         int i;
374         struct evlist *evlist = rec->evlist;
375         struct mmap *maps = evlist->mmap;
376
377         if (!record__aio_enabled(rec))
378                 return;
379
380         for (i = 0; i < evlist->core.nr_mmaps; i++) {
381                 struct mmap *map = &maps[i];
382
383                 if (map->core.base)
384                         record__aio_sync(map, true);
385         }
386 }
387
388 static int nr_cblocks_default = 1;
389 static int nr_cblocks_max = 4;
390
391 static int record__aio_parse(const struct option *opt,
392                              const char *str,
393                              int unset)
394 {
395         struct record_opts *opts = (struct record_opts *)opt->value;
396
397         if (unset) {
398                 opts->nr_cblocks = 0;
399         } else {
400                 if (str)
401                         opts->nr_cblocks = strtol(str, NULL, 0);
402                 if (!opts->nr_cblocks)
403                         opts->nr_cblocks = nr_cblocks_default;
404         }
405
406         return 0;
407 }
408 #else /* HAVE_AIO_SUPPORT */
409 static int nr_cblocks_max = 0;
410
411 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
412                             off_t *off __maybe_unused)
413 {
414         return -1;
415 }
416
417 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
418 {
419         return -1;
420 }
421
422 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
423 {
424 }
425
426 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
427 {
428 }
429 #endif
430
431 static int record__aio_enabled(struct record *rec)
432 {
433         return rec->opts.nr_cblocks > 0;
434 }
435
436 #define MMAP_FLUSH_DEFAULT 1
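/*
 * Parse the mmap flush threshold: a plain number of bytes or a value
 * with a B/K/M/G suffix, defaulting to MMAP_FLUSH_DEFAULT and capped at
 * a quarter of the mmap buffer size.
 */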
437 static int record__mmap_flush_parse(const struct option *opt,
438                                     const char *str,
439                                     int unset)
440 {
441         int flush_max;
442         struct record_opts *opts = (struct record_opts *)opt->value;
443         static struct parse_tag tags[] = {
444                         { .tag  = 'B', .mult = 1       },
445                         { .tag  = 'K', .mult = 1 << 10 },
446                         { .tag  = 'M', .mult = 1 << 20 },
447                         { .tag  = 'G', .mult = 1 << 30 },
448                         { .tag  = 0 },
449         };
450
451         if (unset)
452                 return 0;
453
454         if (str) {
455                 opts->mmap_flush = parse_tag_value(str, tags);
456                 if (opts->mmap_flush == (int)-1)
457                         opts->mmap_flush = strtol(str, NULL, 0);
458         }
459
460         if (!opts->mmap_flush)
461                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
462
463         flush_max = evlist__mmap_size(opts->mmap_pages);
464         flush_max /= 4;
465         if (opts->mmap_flush > flush_max)
466                 opts->mmap_flush = flush_max;
467
468         return 0;
469 }
470
471 #ifdef HAVE_ZSTD_SUPPORT
472 static unsigned int comp_level_default = 1;
473
474 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
475 {
476         struct record_opts *opts = opt->value;
477
478         if (unset) {
479                 opts->comp_level = 0;
480         } else {
481                 if (str)
482                         opts->comp_level = strtol(str, NULL, 0);
483                 if (!opts->comp_level)
484                         opts->comp_level = comp_level_default;
485         }
486
487         return 0;
488 }
489 #endif
490 static unsigned int comp_level_max = 22;
491
492 static int record__comp_enabled(struct record *rec)
493 {
494         return rec->opts.comp_level > 0;
495 }
496
497 static int process_synthesized_event(struct perf_tool *tool,
498                                      union perf_event *event,
499                                      struct perf_sample *sample __maybe_unused,
500                                      struct machine *machine __maybe_unused)
501 {
502         struct record *rec = container_of(tool, struct record, tool);
503         return record__write(rec, NULL, event, event->header.size);
504 }
505
506 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
507 {
508         struct record *rec = to;
509
510         if (record__comp_enabled(rec)) {
511                 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
512                 bf   = map->data;
513         }
514
515         rec->samples++;
516         return record__write(rec, map, bf, size);
517 }
518
519 static volatile int signr = -1;
520 static volatile int child_finished;
521
522 static void sig_handler(int sig)
523 {
524         if (sig == SIGCHLD)
525                 child_finished = 1;
526         else
527                 signr = sig;
528
529         done = 1;
530 }
531
532 static void sigsegv_handler(int sig)
533 {
534         perf_hooks__recover();
535         sighandler_dump_stack(sig);
536 }
537
538 static void record__sig_exit(void)
539 {
540         if (signr == -1)
541                 return;
542
543         signal(signr, SIG_DFL);
544         raise(signr);
545 }
546
547 #ifdef HAVE_AUXTRACE_SUPPORT
548
549 static int record__process_auxtrace(struct perf_tool *tool,
550                                     struct mmap *map,
551                                     union perf_event *event, void *data1,
552                                     size_t len1, void *data2, size_t len2)
553 {
554         struct record *rec = container_of(tool, struct record, tool);
555         struct perf_data *data = &rec->data;
556         size_t padding;
557         u8 pad[8] = {0};
558
559         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
560                 off_t file_offset;
561                 int fd = perf_data__fd(data);
562                 int err;
563
564                 file_offset = lseek(fd, 0, SEEK_CUR);
565                 if (file_offset == -1)
566                         return -1;
567                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
568                                                      event, file_offset);
569                 if (err)
570                         return err;
571         }
572
573         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
574         padding = (len1 + len2) & 7;
575         if (padding)
576                 padding = 8 - padding;
577
578         record__write(rec, map, event, event->header.size);
579         record__write(rec, map, data1, len1);
580         if (len2)
581                 record__write(rec, map, data2, len2);
582         record__write(rec, map, &pad, padding);
583
584         return 0;
585 }
586
587 static int record__auxtrace_mmap_read(struct record *rec,
588                                       struct mmap *map)
589 {
590         int ret;
591
592         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
593                                   record__process_auxtrace);
594         if (ret < 0)
595                 return ret;
596
597         if (ret)
598                 rec->samples++;
599
600         return 0;
601 }
602
603 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
604                                                struct mmap *map)
605 {
606         int ret;
607
608         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
609                                            record__process_auxtrace,
610                                            rec->opts.auxtrace_snapshot_size);
611         if (ret < 0)
612                 return ret;
613
614         if (ret)
615                 rec->samples++;
616
617         return 0;
618 }
619
620 static int record__auxtrace_read_snapshot_all(struct record *rec)
621 {
622         int i;
623         int rc = 0;
624
625         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
626                 struct mmap *map = &rec->evlist->mmap[i];
627
628                 if (!map->auxtrace_mmap.base)
629                         continue;
630
631                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
632                         rc = -1;
633                         goto out;
634                 }
635         }
636 out:
637         return rc;
638 }
639
640 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
641 {
642         pr_debug("Recording AUX area tracing snapshot\n");
643         if (record__auxtrace_read_snapshot_all(rec) < 0) {
644                 trigger_error(&auxtrace_snapshot_trigger);
645         } else {
646                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
647                         trigger_error(&auxtrace_snapshot_trigger);
648                 else
649                         trigger_ready(&auxtrace_snapshot_trigger);
650         }
651 }
652
653 static int record__auxtrace_snapshot_exit(struct record *rec)
654 {
655         if (trigger_is_error(&auxtrace_snapshot_trigger))
656                 return 0;
657
658         if (!auxtrace_record__snapshot_started &&
659             auxtrace_record__snapshot_start(rec->itr))
660                 return -1;
661
662         record__read_auxtrace_snapshot(rec, true);
663         if (trigger_is_error(&auxtrace_snapshot_trigger))
664                 return -1;
665
666         return 0;
667 }
668
669 static int record__auxtrace_init(struct record *rec)
670 {
671         int err;
672
673         if (!rec->itr) {
674                 rec->itr = auxtrace_record__init(rec->evlist, &err);
675                 if (err)
676                         return err;
677         }
678
679         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
680                                               rec->opts.auxtrace_snapshot_opts);
681         if (err)
682                 return err;
683
684         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
685                                             rec->opts.auxtrace_sample_opts);
686         if (err)
687                 return err;
688
689         return auxtrace_parse_filters(rec->evlist);
690 }
691
692 #else
693
694 static inline
695 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
696                                struct mmap *map __maybe_unused)
697 {
698         return 0;
699 }
700
701 static inline
702 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
703                                     bool on_exit __maybe_unused)
704 {
705 }
706
707 static inline
708 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
709 {
710         return 0;
711 }
712
713 static inline
714 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
715 {
716         return 0;
717 }
718
719 static int record__auxtrace_init(struct record *rec __maybe_unused)
720 {
721         return 0;
722 }
723
724 #endif
725
726 static bool record__kcore_readable(struct machine *machine)
727 {
728         char kcore[PATH_MAX];
729         int fd;
730
731         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
732
733         fd = open(kcore, O_RDONLY);
734         if (fd < 0)
735                 return false;
736
737         close(fd);
738
739         return true;
740 }
741
742 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
743 {
744         char from_dir[PATH_MAX];
745         char kcore_dir[PATH_MAX];
746         int ret;
747
748         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
749
750         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
751         if (ret)
752                 return ret;
753
754         return kcore_copy(from_dir, kcore_dir);
755 }
756
757 static int record__mmap_evlist(struct record *rec,
758                                struct evlist *evlist)
759 {
760         struct record_opts *opts = &rec->opts;
761         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
762                                   opts->auxtrace_sample_mode;
763         char msg[512];
764
765         if (opts->affinity != PERF_AFFINITY_SYS)
766                 cpu__setup_cpunode_map();
767
768         if (evlist__mmap_ex(evlist, opts->mmap_pages,
769                                  opts->auxtrace_mmap_pages,
770                                  auxtrace_overwrite,
771                                  opts->nr_cblocks, opts->affinity,
772                                  opts->mmap_flush, opts->comp_level) < 0) {
773                 if (errno == EPERM) {
774                         pr_err("Permission error mapping pages.\n"
775                                "Consider increasing "
776                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
777                                "or try again with a smaller value of -m/--mmap_pages.\n"
778                                "(current value: %u,%u)\n",
779                                opts->mmap_pages, opts->auxtrace_mmap_pages);
780                         return -errno;
781                 } else {
782                         pr_err("failed to mmap with %d (%s)\n", errno,
783                                 str_error_r(errno, msg, sizeof(msg)));
784                         if (errno)
785                                 return -errno;
786                         else
787                                 return -EINVAL;
788                 }
789         }
790         return 0;
791 }
792
793 static int record__mmap(struct record *rec)
794 {
795         return record__mmap_evlist(rec, rec->evlist);
796 }
797
798 static int record__open(struct record *rec)
799 {
800         char msg[BUFSIZ];
801         struct evsel *pos;
802         struct evlist *evlist = rec->evlist;
803         struct perf_session *session = rec->session;
804         struct record_opts *opts = &rec->opts;
805         int rc = 0;
806
807         /*
808          * For initial_delay we need to add a dummy event so that we can track
809          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
810          * real events, the ones asked for by the user.
811          */
812         if (opts->initial_delay) {
813                 if (perf_evlist__add_dummy(evlist))
814                         return -ENOMEM;
815
816                 pos = evlist__first(evlist);
817                 pos->tracking = 0;
818                 pos = evlist__last(evlist);
819                 pos->tracking = 1;
820                 pos->core.attr.enable_on_exec = 1;
821         }
822
823         perf_evlist__config(evlist, opts, &callchain_param);
824
825         evlist__for_each_entry(evlist, pos) {
826 try_again:
827                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
828                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
829                                 if (verbose > 0)
830                                         ui__warning("%s\n", msg);
831                                 goto try_again;
832                         }
833                         if ((errno == EINVAL || errno == EBADF) &&
834                             pos->leader != pos &&
835                             pos->weak_group) {
836                                 pos = perf_evlist__reset_weak_group(evlist, pos, true);
837                                 goto try_again;
838                         }
839                         rc = -errno;
840                         perf_evsel__open_strerror(pos, &opts->target,
841                                                   errno, msg, sizeof(msg));
842                         ui__error("%s\n", msg);
843                         goto out;
844                 }
845
846                 pos->supported = true;
847         }
848
849         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
850                 pr_warning(
851 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
852 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
853 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
854 "file is not found in the buildid cache or in the vmlinux path.\n\n"
855 "Samples in kernel modules won't be resolved at all.\n\n"
856 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
857 "even with a suitable vmlinux or kallsyms file.\n\n");
858         }
859
860         if (perf_evlist__apply_filters(evlist, &pos)) {
861                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
862                         pos->filter, perf_evsel__name(pos), errno,
863                         str_error_r(errno, msg, sizeof(msg)));
864                 rc = -1;
865                 goto out;
866         }
867
868         rc = record__mmap(rec);
869         if (rc)
870                 goto out;
871
872         session->evlist = evlist;
873         perf_session__set_id_hdr_size(session);
874 out:
875         return rc;
876 }
877
878 static int process_sample_event(struct perf_tool *tool,
879                                 union perf_event *event,
880                                 struct perf_sample *sample,
881                                 struct evsel *evsel,
882                                 struct machine *machine)
883 {
884         struct record *rec = container_of(tool, struct record, tool);
885
886         if (rec->evlist->first_sample_time == 0)
887                 rec->evlist->first_sample_time = sample->time;
888
889         rec->evlist->last_sample_time = sample->time;
890
891         if (rec->buildid_all)
892                 return 0;
893
894         rec->samples++;
895         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
896 }
897
898 static int process_buildids(struct record *rec)
899 {
900         struct perf_session *session = rec->session;
901
902         if (perf_data__size(&rec->data) == 0)
903                 return 0;
904
905         /*
906          * During this process, it'll load the kernel map and replace
907          * dso->long_name with the real pathname it found.  In this case
908          * we prefer a vmlinux path like
909          *   /lib/modules/3.16.4/build/vmlinux
910          *
911          * rather than the build-id path (in the debug directory):
912          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
913          */
914         symbol_conf.ignore_vmlinux_buildid = true;
915
916         /*
917          * If --buildid-all is given, it marks all DSOs regardless of hits,
918          * so there is no need to process samples. But if timestamp_boundary is
919          * enabled, we still need to walk all samples to get the timestamps of
920          * the first/last samples.
921          */
922         if (rec->buildid_all && !rec->timestamp_boundary)
923                 rec->tool.sample = NULL;
924
925         return perf_session__process_events(session);
926 }
927
928 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
929 {
930         int err;
931         struct perf_tool *tool = data;
932         /*
933          * As for the guest kernel, when processing the record & report
934          * subcommands, we arrange the module mmap prior to the guest kernel
935          * mmap and trigger a dso preload, because the default guest module
936          * symbols are loaded from guest kallsyms instead of
937          * /lib/modules/XXX/XXX. This method avoids missing symbols when the
938          * first address is in a module instead of in the guest kernel.
939          */
940         err = perf_event__synthesize_modules(tool, process_synthesized_event,
941                                              machine);
942         if (err < 0)
943                 pr_err("Couldn't record guest kernel [%d]'s reference"
944                        " relocation symbol.\n", machine->pid);
945
946         /*
947          * We use _stext for the guest kernel because the guest kernel's
948          * /proc/kallsyms sometimes has no _text.
949          */
950         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
951                                                  machine);
952         if (err < 0)
953                 pr_err("Couldn't record guest kernel [%d]'s reference"
954                        " relocation symbol.\n", machine->pid);
955 }
956
957 static struct perf_event_header finished_round_event = {
958         .size = sizeof(struct perf_event_header),
959         .type = PERF_RECORD_FINISHED_ROUND,
960 };
961
962 static void record__adjust_affinity(struct record *rec, struct mmap *map)
963 {
964         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
965             !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
966                           rec->affinity_mask.nbits)) {
967                 bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
968                 bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
969                           map->affinity_mask.bits, rec->affinity_mask.nbits);
970                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
971                                   (cpu_set_t *)rec->affinity_mask.bits);
972                 if (verbose == 2)
973                         mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
974         }
975 }
976
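/*
 * Callback for zstd_compress_stream_to_records(): start a
 * PERF_RECORD_COMPRESSED record when @increment is zero, and grow the
 * record header size as compressed payload is appended.
 */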
977 static size_t process_comp_header(void *record, size_t increment)
978 {
979         struct perf_record_compressed *event = record;
980         size_t size = sizeof(*event);
981
982         if (increment) {
983                 event->header.size += increment;
984                 return increment;
985         }
986
987         event->header.type = PERF_RECORD_COMPRESSED;
988         event->header.size = size;
989
990         return size;
991 }
992
993 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
994                             void *src, size_t src_size)
995 {
996         size_t compressed;
997         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
998
999         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1000                                                      max_record_size, process_comp_header);
1001
1002         session->bytes_transferred += src_size;
1003         session->bytes_compressed  += compressed;
1004
1005         return compressed;
1006 }
1007
1008 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1009                                     bool overwrite, bool synch)
1010 {
1011         u64 bytes_written = rec->bytes_written;
1012         int i;
1013         int rc = 0;
1014         struct mmap *maps;
1015         int trace_fd = rec->data.file.fd;
1016         off_t off = 0;
1017
1018         if (!evlist)
1019                 return 0;
1020
1021         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1022         if (!maps)
1023                 return 0;
1024
1025         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1026                 return 0;
1027
1028         if (record__aio_enabled(rec))
1029                 off = record__aio_get_pos(trace_fd);
1030
1031         for (i = 0; i < evlist->core.nr_mmaps; i++) {
1032                 u64 flush = 0;
1033                 struct mmap *map = &maps[i];
1034
1035                 if (map->core.base) {
1036                         record__adjust_affinity(rec, map);
1037                         if (synch) {
1038                                 flush = map->core.flush;
1039                                 map->core.flush = 1;
1040                         }
1041                         if (!record__aio_enabled(rec)) {
1042                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1043                                         if (synch)
1044                                                 map->core.flush = flush;
1045                                         rc = -1;
1046                                         goto out;
1047                                 }
1048                         } else {
1049                                 if (record__aio_push(rec, map, &off) < 0) {
1050                                         record__aio_set_pos(trace_fd, off);
1051                                         if (synch)
1052                                                 map->core.flush = flush;
1053                                         rc = -1;
1054                                         goto out;
1055                                 }
1056                         }
1057                         if (synch)
1058                                 map->core.flush = flush;
1059                 }
1060
1061                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1062                     !rec->opts.auxtrace_sample_mode &&
1063                     record__auxtrace_mmap_read(rec, map) != 0) {
1064                         rc = -1;
1065                         goto out;
1066                 }
1067         }
1068
1069         if (record__aio_enabled(rec))
1070                 record__aio_set_pos(trace_fd, off);
1071
1072         /*
1073          * Mark the round finished in case we wrote
1074          * at least one event.
1075          */
1076         if (bytes_written != rec->bytes_written)
1077                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1078
1079         if (overwrite)
1080                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1081 out:
1082         return rc;
1083 }
1084
1085 static int record__mmap_read_all(struct record *rec, bool synch)
1086 {
1087         int err;
1088
1089         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1090         if (err)
1091                 return err;
1092
1093         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1094 }
1095
1096 static void record__init_features(struct record *rec)
1097 {
1098         struct perf_session *session = rec->session;
1099         int feat;
1100
1101         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1102                 perf_header__set_feat(&session->header, feat);
1103
1104         if (rec->no_buildid)
1105                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1106
1107         if (!have_tracepoints(&rec->evlist->core.entries))
1108                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1109
1110         if (!rec->opts.branch_stack)
1111                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1112
1113         if (!rec->opts.full_auxtrace)
1114                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1115
1116         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1117                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1118
1119         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1120         if (!record__comp_enabled(rec))
1121                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1122
1123         perf_header__clear_feat(&session->header, HEADER_STAT);
1124 }
1125
1126 static void
1127 record__finish_output(struct record *rec)
1128 {
1129         struct perf_data *data = &rec->data;
1130         int fd = perf_data__fd(data);
1131
1132         if (data->is_pipe)
1133                 return;
1134
1135         rec->session->header.data_size += rec->bytes_written;
1136         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1137
1138         if (!rec->no_buildid) {
1139                 process_buildids(rec);
1140
1141                 if (rec->buildid_all)
1142                         dsos__hit_all(rec->session);
1143         }
1144         perf_session__write_header(rec->session, rec->evlist, fd, true);
1145
1146         return;
1147 }
1148
1149 static int record__synthesize_workload(struct record *rec, bool tail)
1150 {
1151         int err;
1152         struct perf_thread_map *thread_map;
1153
1154         if (rec->opts.tail_synthesize != tail)
1155                 return 0;
1156
1157         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1158         if (thread_map == NULL)
1159                 return -1;
1160
1161         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1162                                                  process_synthesized_event,
1163                                                  &rec->session->machines.host,
1164                                                  rec->opts.sample_address);
1165         perf_thread_map__put(thread_map);
1166         return err;
1167 }
1168
1169 static int record__synthesize(struct record *rec, bool tail);
1170
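/*
 * Finalize the current output file and switch to a new timestamped one,
 * keeping at most switch_output.num_files rotated files around and
 * re-synthesizing the tracking events needed by the new file.
 */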
1171 static int
1172 record__switch_output(struct record *rec, bool at_exit)
1173 {
1174         struct perf_data *data = &rec->data;
1175         int fd, err;
1176         char *new_filename;
1177
1178         /* Same Size:      "2015122520103046"*/
1179         char timestamp[] = "InvalidTimestamp";
1180
1181         record__aio_mmap_read_sync(rec);
1182
1183         record__synthesize(rec, true);
1184         if (target__none(&rec->opts.target))
1185                 record__synthesize_workload(rec, true);
1186
1187         rec->samples = 0;
1188         record__finish_output(rec);
1189         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1190         if (err) {
1191                 pr_err("Failed to get current timestamp\n");
1192                 return -EINVAL;
1193         }
1194
1195         fd = perf_data__switch(data, timestamp,
1196                                     rec->session->header.data_offset,
1197                                     at_exit, &new_filename);
1198         if (fd >= 0 && !at_exit) {
1199                 rec->bytes_written = 0;
1200                 rec->session->header.data_size = 0;
1201         }
1202
1203         if (!quiet)
1204                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1205                         data->path, timestamp);
1206
1207         if (rec->switch_output.num_files) {
1208                 int n = rec->switch_output.cur_file + 1;
1209
1210                 if (n >= rec->switch_output.num_files)
1211                         n = 0;
1212                 rec->switch_output.cur_file = n;
1213                 if (rec->switch_output.filenames[n]) {
1214                         remove(rec->switch_output.filenames[n]);
1215                         zfree(&rec->switch_output.filenames[n]);
1216                 }
1217                 rec->switch_output.filenames[n] = new_filename;
1218         } else {
1219                 free(new_filename);
1220         }
1221
1222         /* Output tracking events */
1223         if (!at_exit) {
1224                 record__synthesize(rec, false);
1225
1226                 /*
1227                  * In 'perf record --switch-output' without -a,
1228                  * record__synthesize() in record__switch_output() won't
1229                  * generate tracking events because there's no thread_map
1230                  * in the evlist, which causes the newly created perf.data
1231                  * to lack map and comm information.
1232                  * Create a fake thread_map and call
1233                  * perf_event__synthesize_thread_map() directly for those events.
1234                  */
1235                 if (target__none(&rec->opts.target))
1236                         record__synthesize_workload(rec, false);
1237         }
1238         return fd;
1239 }
1240
1241 static volatile int workload_exec_errno;
1242
1243 /*
1244  * perf_evlist__prepare_workload will send a SIGUSR1
1245  * if the fork fails, since we asked for it by setting its
1246  * want_signal to true.
1247  */
1248 static void workload_exec_failed_signal(int signo __maybe_unused,
1249                                         siginfo_t *info,
1250                                         void *ucontext __maybe_unused)
1251 {
1252         workload_exec_errno = info->si_value.sival_int;
1253         done = 1;
1254         child_finished = 1;
1255 }
1256
1257 static void snapshot_sig_handler(int sig);
1258 static void alarm_sig_handler(int sig);
1259
1260 static const struct perf_event_mmap_page *
1261 perf_evlist__pick_pc(struct evlist *evlist)
1262 {
1263         if (evlist) {
1264                 if (evlist->mmap && evlist->mmap[0].core.base)
1265                         return evlist->mmap[0].core.base;
1266                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1267                         return evlist->overwrite_mmap[0].core.base;
1268         }
1269         return NULL;
1270 }
1271
1272 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1273 {
1274         const struct perf_event_mmap_page *pc;
1275
1276         pc = perf_evlist__pick_pc(rec->evlist);
1277         if (pc)
1278                 return pc;
1279         return NULL;
1280 }
1281
1282 static int record__synthesize(struct record *rec, bool tail)
1283 {
1284         struct perf_session *session = rec->session;
1285         struct machine *machine = &session->machines.host;
1286         struct perf_data *data = &rec->data;
1287         struct record_opts *opts = &rec->opts;
1288         struct perf_tool *tool = &rec->tool;
1289         int fd = perf_data__fd(data);
1290         int err = 0;
1291
1292         if (rec->opts.tail_synthesize != tail)
1293                 return 0;
1294
1295         if (data->is_pipe) {
1296                 /*
1297                  * We need to synthesize events first, because some
1298                  * features work on top of them (on the report side).
1299                  */
1300                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1301                                                    process_synthesized_event);
1302                 if (err < 0) {
1303                         pr_err("Couldn't synthesize attrs.\n");
1304                         goto out;
1305                 }
1306
1307                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1308                                                       process_synthesized_event);
1309                 if (err < 0) {
1310                         pr_err("Couldn't synthesize features.\n");
1311                         return err;
1312                 }
1313
1314                 if (have_tracepoints(&rec->evlist->core.entries)) {
1315                         /*
1316                          * FIXME err <= 0 here actually means that
1317                          * there were no tracepoints, so it's not really
1318                          * an error, just that we don't need to
1319                          * synthesize anything.  We really have to
1320                          * return this more properly and also
1321                          * propagate errors that are now calling die()
1322                          */
1323                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1324                                                                   process_synthesized_event);
1325                         if (err <= 0) {
1326                                 pr_err("Couldn't record tracing data.\n");
1327                                 goto out;
1328                         }
1329                         rec->bytes_written += err;
1330                 }
1331         }
1332
1333         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1334                                           process_synthesized_event, machine);
1335         if (err)
1336                 goto out;
1337
1338         /* Synthesize id_index before auxtrace_info */
1339         if (rec->opts.auxtrace_sample_mode) {
1340                 err = perf_event__synthesize_id_index(tool,
1341                                                       process_synthesized_event,
1342                                                       session->evlist, machine);
1343                 if (err)
1344                         goto out;
1345         }
1346
1347         if (rec->opts.full_auxtrace) {
1348                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1349                                         session, process_synthesized_event);
1350                 if (err)
1351                         goto out;
1352         }
1353
1354         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1355                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1356                                                          machine);
1357                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1358                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1359                                    "Check /proc/kallsyms permission or run as root.\n");
1360
1361                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1362                                                      machine);
1363                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1364                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1365                                    "Check /proc/modules permission or run as root.\n");
1366         }
1367
1368         if (perf_guest) {
1369                 machines__process_guests(&session->machines,
1370                                          perf_event__synthesize_guest_os, tool);
1371         }
1372
1373         err = perf_event__synthesize_extra_attr(&rec->tool,
1374                                                 rec->evlist,
1375                                                 process_synthesized_event,
1376                                                 data->is_pipe);
1377         if (err)
1378                 goto out;
1379
1380         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1381                                                  process_synthesized_event,
1382                                                 NULL);
1383         if (err < 0) {
1384                 pr_err("Couldn't synthesize thread map.\n");
1385                 return err;
1386         }
1387
1388         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1389                                              process_synthesized_event, NULL);
1390         if (err < 0) {
1391                 pr_err("Couldn't synthesize cpu map.\n");
1392                 return err;
1393         }
1394
1395         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1396                                                 machine, opts);
1397         if (err < 0)
1398                 pr_warning("Couldn't synthesize bpf events.\n");
1399
1400         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1401                                             process_synthesized_event, opts->sample_address,
1402                                             1);
1403 out:
1404         return err;
1405 }
1406
1407 static int __cmd_record(struct record *rec, int argc, const char **argv)
1408 {
1409         int err;
1410         int status = 0;
1411         unsigned long waking = 0;
1412         const bool forks = argc > 0;
1413         struct perf_tool *tool = &rec->tool;
1414         struct record_opts *opts = &rec->opts;
1415         struct perf_data *data = &rec->data;
1416         struct perf_session *session;
1417         bool disabled = false, draining = false;
1418         struct evlist *sb_evlist = NULL;
1419         int fd;
1420         float ratio = 0;
1421
1422         atexit(record__sig_exit);
1423         signal(SIGCHLD, sig_handler);
1424         signal(SIGINT, sig_handler);
1425         signal(SIGTERM, sig_handler);
1426         signal(SIGSEGV, sigsegv_handler);
1427
1428         if (rec->opts.record_namespaces)
1429                 tool->namespace_events = true;
1430
1431         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1432                 signal(SIGUSR2, snapshot_sig_handler);
1433                 if (rec->opts.auxtrace_snapshot_mode)
1434                         trigger_on(&auxtrace_snapshot_trigger);
1435                 if (rec->switch_output.enabled)
1436                         trigger_on(&switch_output_trigger);
1437         } else {
1438                 signal(SIGUSR2, SIG_IGN);
1439         }
1440
1441         session = perf_session__new(data, false, tool);
1442         if (IS_ERR(session)) {
1443                 pr_err("Perf session creation failed.\n");
1444                 return PTR_ERR(session);
1445         }
1446
1447         fd = perf_data__fd(data);
1448         rec->session = session;
1449
1450         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1451                 pr_err("Compression initialization failed.\n");
1452                 return -1;
1453         }
1454
1455         session->header.env.comp_type  = PERF_COMP_ZSTD;
1456         session->header.env.comp_level = rec->opts.comp_level;
1457
1458         if (rec->opts.kcore &&
1459             !record__kcore_readable(&session->machines.host)) {
1460                 pr_err("ERROR: kcore is not readable.\n");
1461                 return -1;
1462         }
1463
1464         record__init_features(rec);
1465
1466         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1467                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1468
1469         if (forks) {
1470                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1471                                                     argv, data->is_pipe,
1472                                                     workload_exec_failed_signal);
1473                 if (err < 0) {
1474                         pr_err("Couldn't run the workload!\n");
1475                         status = err;
1476                         goto out_delete_session;
1477                 }
1478         }
1479
1480         /*
1481          * If we have just a single event and are sending data
1482          * through a pipe, we need to force id allocation,
1483          * because we synthesize the event name through the pipe
1484          * and need the id for that.
1485          */
1486         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1487                 rec->opts.sample_id = true;
1488
1489         if (record__open(rec) != 0) {
1490                 err = -1;
1491                 goto out_child;
1492         }
1493         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1494
1495         if (rec->opts.kcore) {
1496                 err = record__kcore_copy(&session->machines.host, data);
1497                 if (err) {
1498                         pr_err("ERROR: Failed to copy kcore\n");
1499                         goto out_child;
1500                 }
1501         }
1502
1503         err = bpf__apply_obj_config();
1504         if (err) {
1505                 char errbuf[BUFSIZ];
1506
1507                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1508                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1509                          errbuf);
1510                 goto out_child;
1511         }
1512
1513         /*
1514          * Normally perf_session__new would do this, but it doesn't have the
1515          * evlist.
1516          */
1517         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1518                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1519                 rec->tool.ordered_events = false;
1520         }
1521
1522         if (!rec->evlist->nr_groups)
1523                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1524
1525         if (data->is_pipe) {
1526                 err = perf_header__write_pipe(fd);
1527                 if (err < 0)
1528                         goto out_child;
1529         } else {
1530                 err = perf_session__write_header(session, rec->evlist, fd, false);
1531                 if (err < 0)
1532                         goto out_child;
1533         }
1534
1535         if (!rec->no_buildid
1536             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1537                 pr_err("Couldn't generate buildids. "
1538                        "Use --no-buildid to profile anyway.\n");
1539                 err = -1;
1540                 goto out_child;
1541         }
1542
1543         if (!opts->no_bpf_event)
1544                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1545
1546         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1547                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1548                 opts->no_bpf_event = true;
1549         }
1550
1551         err = record__synthesize(rec, false);
1552         if (err < 0)
1553                 goto out_child;
1554
1555         if (rec->realtime_prio) {
1556                 struct sched_param param;
1557
1558                 param.sched_priority = rec->realtime_prio;
1559                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1560                         pr_err("Could not set realtime priority.\n");
1561                         err = -1;
1562                         goto out_child;
1563                 }
1564         }
1565
1566         /*
1567          * When perf is starting the traced process, all the events
1568          * (apart from group members) have enable_on_exec=1 set,
1569          * so don't spoil it by prematurely enabling them.
1570          */
1571         if (!target__none(&opts->target) && !opts->initial_delay)
1572                 evlist__enable(rec->evlist);
1573
1574         /*
1575          * Let the child rip
1576          */
1577         if (forks) {
1578                 struct machine *machine = &session->machines.host;
1579                 union perf_event *event;
1580                 pid_t tgid;
1581
1582                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1583                 if (event == NULL) {
1584                         err = -ENOMEM;
1585                         goto out_child;
1586                 }
1587
1588                 /*
1589                  * Some H/W events are generated before the COMM event,
1590                  * which is emitted during exec(), so perf script cannot
1591                  * see the correct process name for those events.
1592                  * Synthesize a COMM event to prevent that.
1593                  */
1594                 tgid = perf_event__synthesize_comm(tool, event,
1595                                                    rec->evlist->workload.pid,
1596                                                    process_synthesized_event,
1597                                                    machine);
1598                 free(event);
1599
1600                 if (tgid == -1)
1601                         goto out_child;
1602
1603                 event = malloc(sizeof(event->namespaces) +
1604                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1605                                machine->id_hdr_size);
1606                 if (event == NULL) {
1607                         err = -ENOMEM;
1608                         goto out_child;
1609                 }
1610
1611                 /*
1612                  * Synthesize NAMESPACES event for the command specified.
1613                  */
1614                 perf_event__synthesize_namespaces(tool, event,
1615                                                   rec->evlist->workload.pid,
1616                                                   tgid, process_synthesized_event,
1617                                                   machine);
1618                 free(event);
1619
1620                 perf_evlist__start_workload(rec->evlist);
1621         }
1622
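             /*
              * Illustrative usage: 'perf record --delay 200 -- ./workload'
              * waits ~200ms after the workload has been started before the
              * events are enabled here.
              */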
1623         if (opts->initial_delay) {
1624                 usleep(opts->initial_delay * USEC_PER_MSEC);
1625                 evlist__enable(rec->evlist);
1626         }
1627
1628         trigger_ready(&auxtrace_snapshot_trigger);
1629         trigger_ready(&switch_output_trigger);
1630         perf_hooks__invoke_record_start();
1631         for (;;) {
1632                 unsigned long long hits = rec->samples;
1633
1634                 /*
1635                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1636                  * here: when done == true and hits != rec->samples
1637                  * in the previous round.
1638                  *
1639                  * perf_evlist__toggle_bkw_mmap() ensures we never
1640                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1641                  */
1642                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1643                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1644
1645                 if (record__mmap_read_all(rec, false) < 0) {
1646                         trigger_error(&auxtrace_snapshot_trigger);
1647                         trigger_error(&switch_output_trigger);
1648                         err = -1;
1649                         goto out_child;
1650                 }
1651
1652                 if (auxtrace_record__snapshot_started) {
1653                         auxtrace_record__snapshot_started = 0;
1654                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1655                                 record__read_auxtrace_snapshot(rec, false);
1656                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1657                                 pr_err("AUX area tracing snapshot failed\n");
1658                                 err = -1;
1659                                 goto out_child;
1660                         }
1661                 }
1662
1663                 if (trigger_is_hit(&switch_output_trigger)) {
1664                         /*
1665                          * If switch_output_trigger is hit, the data in the
1666                          * overwritable ring buffer should have been collected,
1667                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1668                          *
1669                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
1670                          * it didn't collect data from the overwritable ring
1671                          * buffer, so read it again.
1672                          */
1673                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1674                                 continue;
1675                         trigger_ready(&switch_output_trigger);
1676
1677                         /*
1678                          * Reenable events in overwrite ring buffer after
1679                          * record__mmap_read_all(): we should have collected
1680                          * data from it.
1681                          */
1682                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1683
1684                         if (!quiet)
1685                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1686                                         waking);
1687                         waking = 0;
1688                         fd = record__switch_output(rec, false);
1689                         if (fd < 0) {
1690                                 pr_err("Failed to switch to new file\n");
1691                                 trigger_error(&switch_output_trigger);
1692                                 err = fd;
1693                                 goto out_child;
1694                         }
1695
1696                         /* re-arm the alarm */
1697                         if (rec->switch_output.time)
1698                                 alarm(rec->switch_output.time);
1699                 }
1700
1701                 if (hits == rec->samples) {
1702                         if (done || draining)
1703                                 break;
1704                         err = evlist__poll(rec->evlist, -1);
1705                         /*
1706                          * Propagate the error only if there is one. Ignore a
1707                          * positive number of returned events and EINTR.
1708                          */
1709                         if (err > 0 || (err < 0 && errno == EINTR))
1710                                 err = 0;
1711                         waking++;
1712
1713                         if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1714                                 draining = true;
1715                 }
1716
1717                 /*
1718                  * When perf is starting the traced process, the events die
1719                  * with it at the end and we wait for that, so there is no
1720                  * need to disable them in that case.
1721                  */
1722                 if (done && !disabled && !target__none(&opts->target)) {
1723                         trigger_off(&auxtrace_snapshot_trigger);
1724                         evlist__disable(rec->evlist);
1725                         disabled = true;
1726                 }
1727         }
1728
1729         trigger_off(&auxtrace_snapshot_trigger);
1730         trigger_off(&switch_output_trigger);
1731
1732         if (opts->auxtrace_snapshot_on_exit)
1733                 record__auxtrace_snapshot_exit(rec);
1734
1735         if (forks && workload_exec_errno) {
1736                 char msg[STRERR_BUFSIZE];
1737                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1738                 pr_err("Workload failed: %s\n", emsg);
1739                 err = -1;
1740                 goto out_child;
1741         }
1742
1743         if (!quiet)
1744                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1745
1746         if (target__none(&rec->opts.target))
1747                 record__synthesize_workload(rec, true);
1748
1749 out_child:
1750         record__mmap_read_all(rec, true);
1751         record__aio_mmap_read_sync(rec);
1752
1753         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1754                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1755                 session->header.env.comp_ratio = ratio + 0.5;
1756         }
1757
1758         if (forks) {
1759                 int exit_status;
1760
1761                 if (!child_finished)
1762                         kill(rec->evlist->workload.pid, SIGTERM);
1763
1764                 wait(&exit_status);
1765
1766                 if (err < 0)
1767                         status = err;
1768                 else if (WIFEXITED(exit_status))
1769                         status = WEXITSTATUS(exit_status);
1770                 else if (WIFSIGNALED(exit_status))
1771                         signr = WTERMSIG(exit_status);
1772         } else
1773                 status = err;
1774
1775         record__synthesize(rec, true);
1776         /* this will be recalculated during process_buildids() */
1777         rec->samples = 0;
1778
1779         if (!err) {
1780                 if (!rec->timestamp_filename) {
1781                         record__finish_output(rec);
1782                 } else {
1783                         fd = record__switch_output(rec, true);
1784                         if (fd < 0) {
1785                                 status = fd;
1786                                 goto out_delete_session;
1787                         }
1788                 }
1789         }
1790
1791         perf_hooks__invoke_record_end();
1792
1793         if (!err && !quiet) {
1794                 char samples[128];
1795                 const char *postfix = rec->timestamp_filename ?
1796                                         ".<timestamp>" : "";
1797
1798                 if (rec->samples && !rec->opts.full_auxtrace)
1799                         scnprintf(samples, sizeof(samples),
1800                                   " (%" PRIu64 " samples)", rec->samples);
1801                 else
1802                         samples[0] = '\0';
1803
1804                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1805                         perf_data__size(data) / 1024.0 / 1024.0,
1806                         data->path, postfix, samples);
1807                 if (ratio) {
1808                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1809                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1810                                         ratio);
1811                 }
1812                 fprintf(stderr, " ]\n");
1813         }
1814
1815 out_delete_session:
1816         zstd_fini(&session->zstd_data);
1817         perf_session__delete(session);
1818
1819         if (!opts->no_bpf_event)
1820                 perf_evlist__stop_sb_thread(sb_evlist);
1821         return status;
1822 }
1823
1824 static void callchain_debug(struct callchain_param *callchain)
1825 {
1826         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1827
1828         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1829
1830         if (callchain->record_mode == CALLCHAIN_DWARF)
1831                 pr_debug("callchain: stack dump size %d\n",
1832                          callchain->dump_size);
1833 }
1834
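     /*
      * Parses the --call-graph argument, "record_mode[,record_size]",
      * e.g. (illustrative) 'fp', 'lbr' or 'dwarf,8192'.  DWARF mode
      * additionally enables data address sampling, see below.
      */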
1835 int record_opts__parse_callchain(struct record_opts *record,
1836                                  struct callchain_param *callchain,
1837                                  const char *arg, bool unset)
1838 {
1839         int ret;
1840         callchain->enabled = !unset;
1841
1842         /* --no-call-graph */
1843         if (unset) {
1844                 callchain->record_mode = CALLCHAIN_NONE;
1845                 pr_debug("callchain: disabled\n");
1846                 return 0;
1847         }
1848
1849         ret = parse_callchain_record_opt(arg, callchain);
1850         if (!ret) {
1851                 /* Enable data address sampling for DWARF unwind. */
1852                 if (callchain->record_mode == CALLCHAIN_DWARF)
1853                         record->sample_address = true;
1854                 callchain_debug(callchain);
1855         }
1856
1857         return ret;
1858 }
1859
1860 int record_parse_callchain_opt(const struct option *opt,
1861                                const char *arg,
1862                                int unset)
1863 {
1864         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1865 }
1866
1867 int record_callchain_opt(const struct option *opt,
1868                          const char *arg __maybe_unused,
1869                          int unset __maybe_unused)
1870 {
1871         struct callchain_param *callchain = opt->value;
1872
1873         callchain->enabled = true;
1874
1875         if (callchain->record_mode == CALLCHAIN_NONE)
1876                 callchain->record_mode = CALLCHAIN_FP;
1877
1878         callchain_debug(callchain);
1879         return 0;
1880 }
1881
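     /*
      * Handles the 'record.*' perfconfig keys, e.g. (illustrative):
      *
      *   [record]
      *           build-id = no-cache
      *           call-graph = dwarf
      *           aio = 4
      *
      * 'record.build-id' accepts cache, no-cache or skip;
      * 'record.call-graph' is forwarded to the 'call-graph.record-mode'
      * handling in perf_default_config(), and 'record.aio' is only
      * honoured when built with HAVE_AIO_SUPPORT.
      */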
1882 static int perf_record_config(const char *var, const char *value, void *cb)
1883 {
1884         struct record *rec = cb;
1885
1886         if (!strcmp(var, "record.build-id")) {
1887                 if (!strcmp(value, "cache"))
1888                         rec->no_buildid_cache = false;
1889                 else if (!strcmp(value, "no-cache"))
1890                         rec->no_buildid_cache = true;
1891                 else if (!strcmp(value, "skip"))
1892                         rec->no_buildid = true;
1893                 else
1894                         return -1;
1895                 return 0;
1896         }
1897         if (!strcmp(var, "record.call-graph")) {
1898                 var = "call-graph.record-mode";
1899                 return perf_default_config(var, value, cb);
1900         }
1901 #ifdef HAVE_AIO_SUPPORT
1902         if (!strcmp(var, "record.aio")) {
1903                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1904                 if (!rec->opts.nr_cblocks)
1905                         rec->opts.nr_cblocks = nr_cblocks_default;
1906         }
1907 #endif
1908
1909         return 0;
1910 }
1911
1912 struct clockid_map {
1913         const char *name;
1914         int clockid;
1915 };
1916
1917 #define CLOCKID_MAP(n, c)       \
1918         { .name = n, .clockid = (c), }
1919
1920 #define CLOCKID_END     { .name = NULL, }
1921
1922
1923 /*
1924  * Add the missing ones, we need to build on many distros...
1925  */
1926 #ifndef CLOCK_MONOTONIC_RAW
1927 #define CLOCK_MONOTONIC_RAW 4
1928 #endif
1929 #ifndef CLOCK_BOOTTIME
1930 #define CLOCK_BOOTTIME 7
1931 #endif
1932 #ifndef CLOCK_TAI
1933 #define CLOCK_TAI 11
1934 #endif
1935
1936 static const struct clockid_map clockids[] = {
1937         /* available for all events, NMI safe */
1938         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1939         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1940
1941         /* available for some events */
1942         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1943         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1944         CLOCKID_MAP("tai", CLOCK_TAI),
1945
1946         /* available for the lazy */
1947         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1948         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1949         CLOCKID_MAP("real", CLOCK_REALTIME),
1950         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1951
1952         CLOCKID_END,
1953 };
1954
1955 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1956 {
1957         struct timespec res;
1958
1959         *res_ns = 0;
1960         if (!clock_getres(clk_id, &res))
1961                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1962         else
1963                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1964
1965         return 0;
1966 }
1967
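     /*
      * Parses -k/--clockid.  The argument may be a raw clockid number or
      * one of the names in clockids[] above, with an optional "CLOCK_"
      * prefix, e.g. (illustrative) '-k monotonic_raw' or '-k CLOCK_BOOTTIME'.
      */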
1968 static int parse_clockid(const struct option *opt, const char *str, int unset)
1969 {
1970         struct record_opts *opts = (struct record_opts *)opt->value;
1971         const struct clockid_map *cm;
1972         const char *ostr = str;
1973
1974         if (unset) {
1975                 opts->use_clockid = 0;
1976                 return 0;
1977         }
1978
1979         /* no arg passed */
1980         if (!str)
1981                 return 0;
1982
1983         /* no setting it twice */
1984         if (opts->use_clockid)
1985                 return -1;
1986
1987         opts->use_clockid = true;
1988
1989         /* if it's a number, we're done */
1990         if (sscanf(str, "%d", &opts->clockid) == 1)
1991                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1992
1993         /* allow a "CLOCK_" prefix to the name */
1994         if (!strncasecmp(str, "CLOCK_", 6))
1995                 str += 6;
1996
1997         for (cm = clockids; cm->name; cm++) {
1998                 if (!strcasecmp(str, cm->name)) {
1999                         opts->clockid = cm->clockid;
2000                         return get_clockid_res(opts->clockid,
2001                                                &opts->clockid_res_ns);
2002                 }
2003         }
2004
2005         opts->use_clockid = false;
2006         ui__warning("unknown clockid %s, check man page\n", ostr);
2007         return -1;
2008 }
2009
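     /*
      * Parses --affinity=node|cpu; any other value leaves the
      * PERF_AFFINITY_SYS default set up in cmd_record() in place.
      */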
2010 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2011 {
2012         struct record_opts *opts = (struct record_opts *)opt->value;
2013
2014         if (unset || !str)
2015                 return 0;
2016
2017         if (!strcasecmp(str, "node"))
2018                 opts->affinity = PERF_AFFINITY_NODE;
2019         else if (!strcasecmp(str, "cpu"))
2020                 opts->affinity = PERF_AFFINITY_CPU;
2021
2022         return 0;
2023 }
2024
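     /*
      * Parses --max-size, e.g. (illustrative) '--max-size=2G' to limit
      * the output file to roughly 2GB (0 means unlimited).
      */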
2025 static int parse_output_max_size(const struct option *opt,
2026                                  const char *str, int unset)
2027 {
2028         unsigned long *s = (unsigned long *)opt->value;
2029         static struct parse_tag tags_size[] = {
2030                 { .tag  = 'B', .mult = 1       },
2031                 { .tag  = 'K', .mult = 1 << 10 },
2032                 { .tag  = 'M', .mult = 1 << 20 },
2033                 { .tag  = 'G', .mult = 1 << 30 },
2034                 { .tag  = 0 },
2035         };
2036         unsigned long val;
2037
2038         if (unset) {
2039                 *s = 0;
2040                 return 0;
2041         }
2042
2043         val = parse_tag_value(str, tags_size);
2044         if (val != (unsigned long) -1) {
2045                 *s = val;
2046                 return 0;
2047         }
2048
2049         return -1;
2050 }
2051
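     /*
      * Parses -m/--mmap-pages as "pages[,pages]", e.g. (illustrative)
      * '-m 512,128' for 512 data mmap pages plus 128 AUX area tracing
      * mmap pages.
      */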
2052 static int record__parse_mmap_pages(const struct option *opt,
2053                                     const char *str,
2054                                     int unset __maybe_unused)
2055 {
2056         struct record_opts *opts = opt->value;
2057         char *s, *p;
2058         unsigned int mmap_pages;
2059         int ret;
2060
2061         if (!str)
2062                 return -EINVAL;
2063
2064         s = strdup(str);
2065         if (!s)
2066                 return -ENOMEM;
2067
2068         p = strchr(s, ',');
2069         if (p)
2070                 *p = '\0';
2071
2072         if (*s) {
2073                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2074                 if (ret)
2075                         goto out_free;
2076                 opts->mmap_pages = mmap_pages;
2077         }
2078
2079         if (!p) {
2080                 ret = 0;
2081                 goto out_free;
2082         }
2083
2084         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2085         if (ret)
2086                 goto out_free;
2087
2088         opts->auxtrace_mmap_pages = mmap_pages;
2089
2090 out_free:
2091         free(s);
2092         return ret;
2093 }
2094
2095 static void switch_output_size_warn(struct record *rec)
2096 {
2097         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2098         struct switch_output *s = &rec->switch_output;
2099
2100         wakeup_size /= 2;
2101
2102         if (s->size < wakeup_size) {
2103                 char buf[100];
2104
2105                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2106                 pr_warning("WARNING: switch-output data size is lower than "
2107                            "the wakeup kernel buffer size (%s), "
2108                            "expect bigger perf.data sizes\n", buf);
2109         }
2110 }
2111
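     /*
      * Interprets the --switch-output argument: "signal", a size such
      * as (illustrative) "100M", or a time such as "30s" or "2h", per
      * the tag tables below.  Any of these also turns on timestamped
      * output file names.
      */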
2112 static int switch_output_setup(struct record *rec)
2113 {
2114         struct switch_output *s = &rec->switch_output;
2115         static struct parse_tag tags_size[] = {
2116                 { .tag  = 'B', .mult = 1       },
2117                 { .tag  = 'K', .mult = 1 << 10 },
2118                 { .tag  = 'M', .mult = 1 << 20 },
2119                 { .tag  = 'G', .mult = 1 << 30 },
2120                 { .tag  = 0 },
2121         };
2122         static struct parse_tag tags_time[] = {
2123                 { .tag  = 's', .mult = 1        },
2124                 { .tag  = 'm', .mult = 60       },
2125                 { .tag  = 'h', .mult = 60*60    },
2126                 { .tag  = 'd', .mult = 60*60*24 },
2127                 { .tag  = 0 },
2128         };
2129         unsigned long val;
2130
2131         if (!s->set)
2132                 return 0;
2133
2134         if (!strcmp(s->str, "signal")) {
2135                 s->signal = true;
2136                 pr_debug("switch-output with SIGUSR2 signal\n");
2137                 goto enabled;
2138         }
2139
2140         val = parse_tag_value(s->str, tags_size);
2141         if (val != (unsigned long) -1) {
2142                 s->size = val;
2143                 pr_debug("switch-output with %s size threshold\n", s->str);
2144                 goto enabled;
2145         }
2146
2147         val = parse_tag_value(s->str, tags_time);
2148         if (val != (unsigned long) -1) {
2149                 s->time = val;
2150                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2151                          s->str, s->time);
2152                 goto enabled;
2153         }
2154
2155         return -1;
2156
2157 enabled:
2158         rec->timestamp_filename = true;
2159         s->enabled              = true;
2160
2161         if (s->size && !rec->opts.no_buffering)
2162                 switch_output_size_warn(rec);
2163
2164         return 0;
2165 }
2166
2167 static const char * const __record_usage[] = {
2168         "perf record [<options>] [<command>]",
2169         "perf record [<options>] -- <command> [<options>]",
2170         NULL
2171 };
2172 const char * const *record_usage = __record_usage;
2173
2174 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2175                                   struct perf_sample *sample, struct machine *machine)
2176 {
2177         /*
2178          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2179          * so there is no need to add them twice.
2180          */
2181         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2182                 return 0;
2183         return perf_event__process_mmap(tool, event, sample, machine);
2184 }
2185
2186 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2187                                    struct perf_sample *sample, struct machine *machine)
2188 {
2189         /*
2190          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2191          * so there is no need to add them twice.
2192          */
2193         if (!(event->header.misc & PERF_RECORD_MISC_USER))
2194                 return 0;
2195
2196         return perf_event__process_mmap2(tool, event, sample, machine);
2197 }
2198
2199 /*
2200  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2201  * because we need access to it in record__exit(), which is called after
2202  * cmd_record() exits, but since record_options needs to be accessible to
2203  * builtin-script, leave it here.
2204  *
2205  * At least we don't touch it in all the other functions here directly.
2206  *
2207  * Just say no to tons of global variables, sigh.
2208  */
2209 static struct record record = {
2210         .opts = {
2211                 .sample_time         = true,
2212                 .mmap_pages          = UINT_MAX,
2213                 .user_freq           = UINT_MAX,
2214                 .user_interval       = ULLONG_MAX,
2215                 .freq                = 4000,
2216                 .target              = {
2217                         .uses_mmap   = true,
2218                         .default_per_cpu = true,
2219                 },
2220                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2221         },
2222         .tool = {
2223                 .sample         = process_sample_event,
2224                 .fork           = perf_event__process_fork,
2225                 .exit           = perf_event__process_exit,
2226                 .comm           = perf_event__process_comm,
2227                 .namespaces     = perf_event__process_namespaces,
2228                 .mmap           = build_id__process_mmap,
2229                 .mmap2          = build_id__process_mmap2,
2230                 .ordered_events = true,
2231         },
2232 };
2233
2234 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2235         "\n\t\t\t\tDefault: fp";
2236
2237 static bool dry_run;
2238
2239 /*
2240  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2241  * with it and switch to using the library functions in perf_evlist that came
2242  * from builtin-record.c, i.e. use record_opts,
2243  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2244  * using pipes, etc.
2245  */
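     /*
      * A combined example built from the options below (illustrative):
      *
      *   perf record -F 4000 -g --switch-output=1G -o perf.data -- ./workload
      */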
2246 static struct option __record_options[] = {
2247         OPT_CALLBACK('e', "event", &record.evlist, "event",
2248                      "event selector. use 'perf list' to list available events",
2249                      parse_events_option),
2250         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2251                      "event filter", parse_filter),
2252         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2253                            NULL, "don't record events from perf itself",
2254                            exclude_perf),
2255         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2256                     "record events on existing process id"),
2257         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2258                     "record events on existing thread id"),
2259         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2260                     "collect data with this RT SCHED_FIFO priority"),
2261         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2262                     "collect data without buffering"),
2263         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2264                     "collect raw sample records from all opened counters"),
2265         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2266                             "system-wide collection from all CPUs"),
2267         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2268                     "list of cpus to monitor"),
2269         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2270         OPT_STRING('o', "output", &record.data.path, "file",
2271                     "output file name"),
2272         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2273                         &record.opts.no_inherit_set,
2274                         "child tasks do not inherit counters"),
2275         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2276                     "synthesize non-sample events at the end of output"),
2277         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2278         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2279         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2280                     "Fail if the specified frequency can't be used"),
2281         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2282                      "profile at this frequency",
2283                       record__parse_freq),
2284         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2285                      "number of mmap data pages and AUX area tracing mmap pages",
2286                      record__parse_mmap_pages),
2287         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2288                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2289                      record__mmap_flush_parse),
2290         OPT_BOOLEAN(0, "group", &record.opts.group,
2291                     "put the counters into a counter group"),
2292         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2293                            NULL, "enables call-graph recording" ,
2294                            &record_callchain_opt),
2295         OPT_CALLBACK(0, "call-graph", &record.opts,
2296                      "record_mode[,record_size]", record_callchain_help,
2297                      &record_parse_callchain_opt),
2298         OPT_INCR('v', "verbose", &verbose,
2299                     "be more verbose (show counter open errors, etc)"),
2300         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2301         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2302                     "per thread counts"),
2303         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2304         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2305                     "Record the sample physical addresses"),
2306         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2307         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2308                         &record.opts.sample_time_set,
2309                         "Record the sample timestamps"),
2310         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2311                         "Record the sample period"),
2312         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2313                     "don't sample"),
2314         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2315                         &record.no_buildid_cache_set,
2316                         "do not update the buildid cache"),
2317         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2318                         &record.no_buildid_set,
2319                         "do not collect buildids in perf.data"),
2320         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2321                      "monitor event in cgroup name only",
2322                      parse_cgroups),
2323         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2324                   "ms to wait before starting measurement after program start"),
2325         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2326         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2327                    "user to profile"),
2328
2329         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2330                      "branch any", "sample any taken branches",
2331                      parse_branch_stack),
2332
2333         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2334                      "branch filter mask", "branch stack filter modes",
2335                      parse_branch_stack),
2336         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2337                     "sample by weight (on special events only)"),
2338         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2339                     "sample transaction flags (special events only)"),
2340         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2341                     "use per-thread mmaps"),
2342         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2343                     "sample selected machine registers on interrupt,"
2344                     " use '-I?' to list register names", parse_intr_regs),
2345         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2346                     "sample selected machine registers on interrupt,"
2347                     " use '--user-regs=?' to list register names", parse_user_regs),
2348         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2349                     "Record running/enabled time of read (:S) events"),
2350         OPT_CALLBACK('k', "clockid", &record.opts,
2351         "clockid", "clockid to use for events, see clock_gettime()",
2352         parse_clockid),
2353         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2354                           "opts", "AUX area tracing Snapshot Mode", ""),
2355         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2356                           "opts", "sample AUX area", ""),
2357         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2358                         "per thread proc mmap processing timeout in ms"),
2359         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2360                     "Record namespaces events"),
2361         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2362                     "Record context switch events"),
2363         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2364                          "Configure all used events to run in kernel space.",
2365                          PARSE_OPT_EXCLUSIVE),
2366         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2367                          "Configure all used events to run in user space.",
2368                          PARSE_OPT_EXCLUSIVE),
2369         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2370                     "collect kernel callchains"),
2371         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2372                     "collect user callchains"),
2373         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2374                    "clang binary to use for compiling BPF scriptlets"),
2375         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2376                    "options passed to clang when compiling BPF scriptlets"),
2377         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2378                    "file", "vmlinux pathname"),
2379         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2380                     "Record build-id of all DSOs regardless of hits"),
2381         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2382                     "append timestamp to output filename"),
2383         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2384                     "Record timestamp boundary (time of first/last samples)"),
2385         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2386                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2387                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2388                           "signal"),
2389         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2390                    "Limit number of switch output generated files"),
2391         OPT_BOOLEAN(0, "dry-run", &dry_run,
2392                     "Parse options then exit"),
2393 #ifdef HAVE_AIO_SUPPORT
2394         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2395                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2396                      record__aio_parse),
2397 #endif
2398         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2399                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2400                      record__parse_affinity),
2401 #ifdef HAVE_ZSTD_SUPPORT
2402         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2403                             "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2404                             record__parse_comp_level),
2405 #endif
2406         OPT_CALLBACK(0, "max-size", &record.output_max_size,
2407                      "size", "Limit the maximum size of the output file", parse_output_max_size),
2408         OPT_END()
2409 };
2410
2411 struct option *record_options = __record_options;
2412
2413 int cmd_record(int argc, const char **argv)
2414 {
2415         int err;
2416         struct record *rec = &record;
2417         char errbuf[BUFSIZ];
2418
2419         setlocale(LC_ALL, "");
2420
2421 #ifndef HAVE_LIBBPF_SUPPORT
2422 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2423         set_nobuild('\0', "clang-path", true);
2424         set_nobuild('\0', "clang-opt", true);
2425 # undef set_nobuild
2426 #endif
2427
2428 #ifndef HAVE_BPF_PROLOGUE
2429 # if !defined (HAVE_DWARF_SUPPORT)
2430 #  define REASON  "NO_DWARF=1"
2431 # elif !defined (HAVE_LIBBPF_SUPPORT)
2432 #  define REASON  "NO_LIBBPF=1"
2433 # else
2434 #  define REASON  "this architecture doesn't support BPF prologue"
2435 # endif
2436 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2437         set_nobuild('\0', "vmlinux", true);
2438 # undef set_nobuild
2439 # undef REASON
2440 #endif
2441
2442         rec->opts.affinity = PERF_AFFINITY_SYS;
2443
2444         rec->evlist = evlist__new();
2445         if (rec->evlist == NULL)
2446                 return -ENOMEM;
2447
2448         err = perf_config(perf_record_config, rec);
2449         if (err)
2450                 return err;
2451
2452         argc = parse_options(argc, argv, record_options, record_usage,
2453                             PARSE_OPT_STOP_AT_NON_OPTION);
2454         if (quiet)
2455                 perf_quiet_option();
2456
2457         /* Make system wide (-a) the default target. */
2458         if (!argc && target__none(&rec->opts.target))
2459                 rec->opts.target.system_wide = true;
2460
2461         if (nr_cgroups && !rec->opts.target.system_wide) {
2462                 usage_with_options_msg(record_usage, record_options,
2463                         "cgroup monitoring only available in system-wide mode");
2464
2465         }
2466
2467         if (rec->opts.kcore)
2468                 rec->data.is_dir = true;
2469
2470         if (rec->opts.comp_level != 0) {
2471                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2472                 rec->no_buildid = true;
2473         }
2474
2475         if (rec->opts.record_switch_events &&
2476             !perf_can_record_switch_events()) {
2477                 ui__error("kernel does not support recording context switch events\n");
2478                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2479                 return -EINVAL;
2480         }
2481
2482         if (switch_output_setup(rec)) {
2483                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2484                 return -EINVAL;
2485         }
2486
2487         if (rec->switch_output.time) {
2488                 signal(SIGALRM, alarm_sig_handler);
2489                 alarm(rec->switch_output.time);
2490         }
2491
2492         if (rec->switch_output.num_files) {
2493                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2494                                                       sizeof(char *));
2495                 if (!rec->switch_output.filenames)
2496                         return -EINVAL;
2497         }
2498
2499         /*
2500          * Allow aliases to facilitate the lookup of symbols for address
2501          * filters. Refer to auxtrace_parse_filters().
2502          */
2503         symbol_conf.allow_aliases = true;
2504
2505         symbol__init(NULL);
2506
2507         if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2508                 rec->affinity_mask.nbits = cpu__max_cpu();
2509                 rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2510                 if (!rec->affinity_mask.bits) {
2511                         pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2512                         return -ENOMEM;
2513                 }
2514                 pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2515         }
2516
2517         err = record__auxtrace_init(rec);
2518         if (err)
2519                 goto out;
2520
2521         if (dry_run)
2522                 goto out;
2523
2524         err = bpf__setup_stdout(rec->evlist);
2525         if (err) {
2526                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2527                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2528                          errbuf);
2529                 goto out;
2530         }
2531
2532         err = -ENOMEM;
2533
2534         if (rec->no_buildid_cache || rec->no_buildid) {
2535                 disable_buildid_cache();
2536         } else if (rec->switch_output.enabled) {
2537                 /*
2538                  * In 'perf record --switch-output', disable buildid
2539                  * generation by default to reduce data file switching
2540                  * overhead. Still generate buildids if they are required
2541                  * explicitly using
2542                  *
2543                  *  perf record --switch-output --no-no-buildid \
2544                  *              --no-no-buildid-cache
2545                  *
2546                  * Following code equals to:
2547                  *
2548                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2549                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2550                  *         disable_buildid_cache();
2551                  */
2552                 bool disable = true;
2553
2554                 if (rec->no_buildid_set && !rec->no_buildid)
2555                         disable = false;
2556                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2557                         disable = false;
2558                 if (disable) {
2559                         rec->no_buildid = true;
2560                         rec->no_buildid_cache = true;
2561                         disable_buildid_cache();
2562                 }
2563         }
2564
2565         if (record.opts.overwrite)
2566                 record.opts.tail_synthesize = true;
2567
2568         if (rec->evlist->core.nr_entries == 0 &&
2569             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2570                 pr_err("Not enough memory for event selector list\n");
2571                 goto out;
2572         }
2573
2574         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2575                 rec->opts.no_inherit = true;
2576
2577         err = target__validate(&rec->opts.target);
2578         if (err) {
2579                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2580                 ui__warning("%s\n", errbuf);
2581         }
2582
2583         err = target__parse_uid(&rec->opts.target);
2584         if (err) {
2585                 int saved_errno = errno;
2586
2587                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2588                 ui__error("%s", errbuf);
2589
2590                 err = -saved_errno;
2591                 goto out;
2592         }
2593
2594         /* Enable ignoring missing threads when -u/-p option is defined. */
2595         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2596
2597         err = -ENOMEM;
2598         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2599                 usage_with_options(record_usage, record_options);
2600
2601         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2602         if (err)
2603                 goto out;
2604
2605         /*
2606          * We take all buildids when the file contains
2607          * AUX area tracing data because we do not decode the
2608          * trace, as that would take too long.
2609          */
2610         if (rec->opts.full_auxtrace)
2611                 rec->buildid_all = true;
2612
2613         if (record_opts__config(&rec->opts)) {
2614                 err = -EINVAL;
2615                 goto out;
2616         }
2617
2618         if (rec->opts.nr_cblocks > nr_cblocks_max)
2619                 rec->opts.nr_cblocks = nr_cblocks_max;
2620         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2621
2622         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2623         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2624
2625         if (rec->opts.comp_level > comp_level_max)
2626                 rec->opts.comp_level = comp_level_max;
2627         pr_debug("comp level: %d\n", rec->opts.comp_level);
2628
2629         err = __cmd_record(&record, argc, argv);
2630 out:
2631         bitmap_free(rec->affinity_mask.bits);
2632         evlist__delete(rec->evlist);
2633         symbol__exit();
2634         auxtrace_record__free(rec->itr);
2635         return err;
2636 }
2637
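     /*
      * SIGUSR2 does double duty: it requests an AUX area snapshot when
      * snapshot mode is enabled and an output file switch when
      * --switch-output=signal is in effect; both triggers are armed in
      * __cmd_record().
      */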
2638 static void snapshot_sig_handler(int sig __maybe_unused)
2639 {
2640         struct record *rec = &record;
2641
2642         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2643                 trigger_hit(&auxtrace_snapshot_trigger);
2644                 auxtrace_record__snapshot_started = 1;
2645                 if (auxtrace_record__snapshot_start(record.itr))
2646                         trigger_error(&auxtrace_snapshot_trigger);
2647         }
2648
2649         if (switch_output_signal(rec))
2650                 trigger_hit(&switch_output_trigger);
2651 }
2652
2653 static void alarm_sig_handler(int sig __maybe_unused)
2654 {
2655         struct record *rec = &record;
2656
2657         if (switch_output_time(rec))
2658                 trigger_hit(&switch_output_trigger);
2659 }