tools/perf/util/mmap.c (linux.git, at commit "libperf: Add 'overwrite' to 'struct perf_mmap'")
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further
 * copyright notes.
 */

#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
#include <linux/zalloc.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_LIBNUMA_SUPPORT
#include <numaif.h>
#endif
#include "cpumap.h"
#include "debug.h"
#include "event.h"
#include "mmap.h"
#include "../perf.h"
#include "util.h" /* page_size */

size_t perf_mmap__mmap_len(struct mmap *map)
{
        return map->core.mask + 1 + page_size;
}

/* When check_messup is true, 'end' must point to a good entry */
static union perf_event *perf_mmap__read(struct mmap *map,
                                         u64 *startp, u64 end)
{
        unsigned char *data = map->core.base + page_size;
        union perf_event *event = NULL;
        int diff = end - *startp;

        if (diff >= (int)sizeof(event->header)) {
                size_t size;

                event = (union perf_event *)&data[*startp & map->core.mask];
                size = event->header.size;

                if (size < sizeof(event->header) || diff < (int)size)
                        return NULL;

                /*
                 * Event straddles the mmap boundary -- header should always
                 * be inside due to u64 alignment of output.
                 */
                if ((*startp & map->core.mask) + size != ((*startp + size) & map->core.mask)) {
                        unsigned int offset = *startp;
                        unsigned int len = min(sizeof(*event), size), cpy;
                        void *dst = map->event_copy;

                        do {
                                cpy = min(map->core.mask + 1 - (offset & map->core.mask), len);
                                memcpy(dst, &data[offset & map->core.mask], cpy);
                                offset += cpy;
                                dst += cpy;
                                len -= cpy;
                        } while (len);

                        event = (union perf_event *)map->event_copy;
                }

                *startp += size;
        }

        return event;
}
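
/*
 * Illustrative sketch (not part of this file): the same masked two-chunk
 * copy that perf_mmap__read() uses above when a record straddles the end of
 * the power-of-two ring buffer. All names below are invented for the
 * example; only the technique matches the loop above.
 */
static void example_read_wrapped(const unsigned char *ring, u64 mask,
                                 u64 start, void *out, size_t len)
{
        unsigned char *dst = out;

        do {
                /* bytes available before the physical end of the ring */
                size_t cpy = mask + 1 - (start & mask);

                if (cpy > len)
                        cpy = len;
                memcpy(dst, &ring[start & mask], cpy);
                start += cpy;
                dst += cpy;
                len -= cpy;
        } while (len);
}
/*
 * E.g. with mask == 0xffff (a 64 KiB ring) and start == 0xffe0, a 64-byte
 * record is copied as 32 bytes from the end plus 32 bytes from the start.
 */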

/*
 * Read events from the ring buffer one at a time.
 * Each call returns at most one event.
 *
 * Usage:
 * perf_mmap__read_init()
 * while(event = perf_mmap__read_event()) {
 *      //process the event
 *      perf_mmap__consume()
 * }
 * perf_mmap__read_done()
 *
 * (An illustrative sketch of this loop follows perf_mmap__read_event() below.)
 */
union perf_event *perf_mmap__read_event(struct mmap *map)
{
        union perf_event *event;

        /*
         * Check if event was unmapped due to a POLLHUP/POLLERR.
         */
        if (!refcount_read(&map->core.refcnt))
                return NULL;

        /* non-overwrite mode doesn't pause the ring buffer */
        if (!map->core.overwrite)
                map->core.end = perf_mmap__read_head(map);

        event = perf_mmap__read(map, &map->core.start, map->core.end);

        if (!map->core.overwrite)
                map->core.prev = map->core.start;

        return event;
}
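
/*
 * Illustrative sketch (not part of this file): the consumer loop described
 * in the usage comment above, for a normal (non-overwrite) ring buffer. The
 * 'process' callback is hypothetical; everything else is the API defined in
 * this file.
 */
static void example_read_loop(struct mmap *map,
                              void (*process)(union perf_event *event))
{
        union perf_event *event;

        if (perf_mmap__read_init(map) < 0)
                return;         /* unmapped, or not enough data yet */

        while ((event = perf_mmap__read_event(map)) != NULL) {
                process(event);
                perf_mmap__consume(map);        /* advance the tail pointer */
        }

        perf_mmap__read_done(map);
}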

static bool perf_mmap__empty(struct mmap *map)
{
        return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base;
}

void perf_mmap__get(struct mmap *map)
{
        refcount_inc(&map->core.refcnt);
}

void perf_mmap__put(struct mmap *map)
{
        BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0);

        if (refcount_dec_and_test(&map->core.refcnt))
                perf_mmap__munmap(map);
}

void perf_mmap__consume(struct mmap *map)
{
        if (!map->core.overwrite) {
                u64 old = map->core.prev;

                perf_mmap__write_tail(map, old);
        }

        if (refcount_read(&map->core.refcnt) == 1 && perf_mmap__empty(map))
                perf_mmap__put(map);
}

int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
                               struct auxtrace_mmap_params *mp __maybe_unused,
                               void *userpg __maybe_unused,
                               int fd __maybe_unused)
{
        return 0;
}

void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
{
}

void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused,
                                       off_t auxtrace_offset __maybe_unused,
                                       unsigned int auxtrace_pages __maybe_unused,
                                       bool auxtrace_overwrite __maybe_unused)
{
}

void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused,
                                          struct evlist *evlist __maybe_unused,
                                          int idx __maybe_unused,
                                          bool per_cpu __maybe_unused)
{
}

#ifdef HAVE_AIO_SUPPORT
static int perf_mmap__aio_enabled(struct mmap *map)
{
        return map->aio.nr_cblocks > 0;
}

#ifdef HAVE_LIBNUMA_SUPPORT
static int perf_mmap__aio_alloc(struct mmap *map, int idx)
{
        map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
                                  MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
        if (map->aio.data[idx] == MAP_FAILED) {
                map->aio.data[idx] = NULL;
                return -1;
        }

        return 0;
}

static void perf_mmap__aio_free(struct mmap *map, int idx)
{
        if (map->aio.data[idx]) {
                munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
                map->aio.data[idx] = NULL;
        }
}

static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity)
{
        void *data;
        size_t mmap_len;
        unsigned long node_mask;

        if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
                data = map->aio.data[idx];
                mmap_len = perf_mmap__mmap_len(map);
                node_mask = 1UL << cpu__get_node(cpu);
                if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
                        pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
                                data, data + mmap_len, cpu__get_node(cpu));
                        return -1;
                }
        }

        return 0;
}
#else /* !HAVE_LIBNUMA_SUPPORT */
static int perf_mmap__aio_alloc(struct mmap *map, int idx)
{
        map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
        if (map->aio.data[idx] == NULL)
                return -1;

        return 0;
}

static void perf_mmap__aio_free(struct mmap *map, int idx)
{
        zfree(&(map->aio.data[idx]));
}

static int perf_mmap__aio_bind(struct mmap *map __maybe_unused, int idx __maybe_unused,
                int cpu __maybe_unused, int affinity __maybe_unused)
{
        return 0;
}
#endif

static int perf_mmap__aio_mmap(struct mmap *map, struct mmap_params *mp)
{
        int delta_max, i, prio, ret;

        map->aio.nr_cblocks = mp->nr_cblocks;
        if (map->aio.nr_cblocks) {
                map->aio.aiocb = calloc(map->aio.nr_cblocks, sizeof(struct aiocb *));
                if (!map->aio.aiocb) {
                        pr_debug2("failed to allocate aiocb for data buffer, error %m\n");
                        return -1;
                }
                map->aio.cblocks = calloc(map->aio.nr_cblocks, sizeof(struct aiocb));
                if (!map->aio.cblocks) {
                        pr_debug2("failed to allocate cblocks for data buffer, error %m\n");
                        return -1;
                }
                map->aio.data = calloc(map->aio.nr_cblocks, sizeof(void *));
                if (!map->aio.data) {
                        pr_debug2("failed to allocate data buffer, error %m\n");
                        return -1;
                }
                delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
                for (i = 0; i < map->aio.nr_cblocks; ++i) {
                        ret = perf_mmap__aio_alloc(map, i);
                        if (ret == -1) {
                                pr_debug2("failed to allocate data buffer area, error %m\n");
                                return -1;
                        }
                        ret = perf_mmap__aio_bind(map, i, map->core.cpu, mp->affinity);
                        if (ret == -1)
                                return -1;
                        /*
                         * Use a cblock.aio_fildes value different from -1
                         * to denote a started aio write operation on the
                         * cblock, so an explicit record__aio_sync() call is
                         * required before the cblock may be reused again.
                         */
                        map->aio.cblocks[i].aio_fildes = -1;
                        /*
                         * Allocate cblocks with a priority delta to get
                         * faster aio write system calls, because queued
                         * requests are kept in separate per-prio queues and
                         * adding a new request iterates through a shorter
                         * per-prio list. Blocks with numbers higher than
                         * _SC_AIO_PRIO_DELTA_MAX go with priority 0.
                         */
                        prio = delta_max - i;
                        map->aio.cblocks[i].aio_reqprio = prio >= 0 ? prio : 0;
                }
        }

        return 0;
}
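
/*
 * Illustrative sketch (not part of this file) of the reuse convention noted
 * above: a cblock with aio_fildes == -1 is free; a started aio_write() leaves
 * a real fd in it until the request has been reaped, as record__aio_sync()
 * does in builtin-record.c. The helper below is hypothetical.
 */
static int example_cblock_is_free(struct aiocb *cblock)
{
        if (cblock->aio_fildes == -1)
                return 1;                       /* never used or already reaped */

        if (aio_error(cblock) == EINPROGRESS)
                return 0;                       /* write still in flight */

        aio_return(cblock);                     /* reap the completed request */
        cblock->aio_fildes = -1;                /* mark the cblock reusable */
        return 1;
}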

static void perf_mmap__aio_munmap(struct mmap *map)
{
        int i;

        for (i = 0; i < map->aio.nr_cblocks; ++i)
                perf_mmap__aio_free(map, i);
        if (map->aio.data)
                zfree(&map->aio.data);
        zfree(&map->aio.cblocks);
        zfree(&map->aio.aiocb);
}
#else /* !HAVE_AIO_SUPPORT */
static int perf_mmap__aio_enabled(struct mmap *map __maybe_unused)
{
        return 0;
}

static int perf_mmap__aio_mmap(struct mmap *map __maybe_unused,
                               struct mmap_params *mp __maybe_unused)
{
        return 0;
}

static void perf_mmap__aio_munmap(struct mmap *map __maybe_unused)
{
}
#endif

void perf_mmap__munmap(struct mmap *map)
{
        perf_mmap__aio_munmap(map);
        if (map->data != NULL) {
                munmap(map->data, perf_mmap__mmap_len(map));
                map->data = NULL;
        }
        if (map->core.base != NULL) {
                munmap(map->core.base, perf_mmap__mmap_len(map));
                map->core.base = NULL;
                map->core.fd = -1;
                refcount_set(&map->core.refcnt, 0);
        }
        auxtrace_mmap__munmap(&map->auxtrace_mmap);
}

static void build_node_mask(int node, cpu_set_t *mask)
{
        int c, cpu, nr_cpus;
        const struct perf_cpu_map *cpu_map = NULL;

        cpu_map = cpu_map__online();
        if (!cpu_map)
                return;

        nr_cpus = perf_cpu_map__nr(cpu_map);
        for (c = 0; c < nr_cpus; c++) {
                cpu = cpu_map->map[c]; /* map c index to online cpu index */
                if (cpu__get_node(cpu) == node)
                        CPU_SET(cpu, mask);
        }
}

static void perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *mp)
{
        CPU_ZERO(&map->affinity_mask);
        if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
                build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask);
        else if (mp->affinity == PERF_AFFINITY_CPU)
                CPU_SET(map->core.cpu, &map->affinity_mask);
}

int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu)
{
        /*
         * The last reference is dropped at perf_mmap__consume(), so that we
         * make sure we don't prevent tools from consuming every last event in
         * the ring buffer.
         *
         * I.e. we can get the POLLHUP meaning that the fd doesn't exist
         * anymore, but the last events for it are still in the ring buffer,
         * waiting to be consumed.
         *
         * Tools can choose to ignore this at their own discretion, but the
         * evlist layer can't just drop it when filtering events in
         * perf_evlist__filter_pollfd().
         */
        refcount_set(&map->core.refcnt, 2);
        map->core.prev = 0;
        map->core.mask = mp->mask;
        map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
                         MAP_SHARED, fd, 0);
        if (map->core.base == MAP_FAILED) {
                pr_debug2("failed to mmap perf event ring buffer, error %d\n",
                          errno);
                map->core.base = NULL;
                return -1;
        }
        map->core.fd = fd;
        map->core.cpu = cpu;

        perf_mmap__setup_affinity_mask(map, mp);

        map->flush = mp->flush;

        map->comp_level = mp->comp_level;

        if (map->comp_level && !perf_mmap__aio_enabled(map)) {
                map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
                                 MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
                if (map->data == MAP_FAILED) {
                        pr_debug2("failed to mmap data buffer, error %d\n",
                                        errno);
                        map->data = NULL;
                        return -1;
                }
        }

        if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
                                &mp->auxtrace_mp, map->core.base, fd))
                return -1;

        return perf_mmap__aio_mmap(map, mp);
}
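
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the two
 * references set above are released through perf_mmap__put(), one by the
 * owner (e.g. when the fd reports POLLHUP) and one by perf_mmap__consume()
 * once the buffer has been drained, which is what finally unmaps it.
 */
static void example_hold_and_release(struct mmap *map)
{
        perf_mmap__get(map);    /* extra reference while we keep a pointer */
        /* ... read and consume events ... */
        perf_mmap__put(map);    /* perf_mmap__munmap() runs on the last put */
}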

static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
        struct perf_event_header *pheader;
        u64 evt_head = *start;
        int size = mask + 1;

        pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
        pheader = (struct perf_event_header *)(buf + (*start & mask));
        while (true) {
                if (evt_head - *start >= (unsigned int)size) {
                        pr_debug("Finished reading overwrite ring buffer: rewind\n");
                        if (evt_head - *start > (unsigned int)size)
                                evt_head -= pheader->size;
                        *end = evt_head;
                        return 0;
                }

                pheader = (struct perf_event_header *)(buf + (evt_head & mask));

                if (pheader->size == 0) {
                        pr_debug("Finished reading overwrite ring buffer: get start\n");
                        *end = evt_head;
                        return 0;
                }

                evt_head += pheader->size;
                pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
        }
        WARN_ONCE(1, "Shouldn't get here\n");
        return -1;
}

/*
 * Report the start and end of the available data in the ring buffer.
 */
static int __perf_mmap__read_init(struct mmap *md)
{
        u64 head = perf_mmap__read_head(md);
        u64 old = md->core.prev;
        unsigned char *data = md->core.base + page_size;
        unsigned long size;

        md->core.start = md->core.overwrite ? head : old;
        md->core.end = md->core.overwrite ? old : head;

        if ((md->core.end - md->core.start) < md->flush)
                return -EAGAIN;

        size = md->core.end - md->core.start;
        if (size > (unsigned long)(md->core.mask) + 1) {
                if (!md->core.overwrite) {
                        WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");

                        md->core.prev = head;
                        perf_mmap__consume(md);
                        return -EAGAIN;
                }

                /*
                 * The backward ring buffer is full. We still have a chance to
                 * read most of the data from it.
                 */
                if (overwrite_rb_find_range(data, md->core.mask, &md->core.start, &md->core.end))
                        return -EINVAL;
        }

        return 0;
}

int perf_mmap__read_init(struct mmap *map)
{
        /*
         * Check if event was unmapped due to a POLLHUP/POLLERR.
         */
        if (!refcount_read(&map->core.refcnt))
                return -ENOENT;

        return __perf_mmap__read_init(map);
}

int perf_mmap__push(struct mmap *md, void *to,
                    int push(struct mmap *map, void *to, void *buf, size_t size))
{
        u64 head = perf_mmap__read_head(md);
        unsigned char *data = md->core.base + page_size;
        unsigned long size;
        void *buf;
        int rc = 0;

        rc = perf_mmap__read_init(md);
        if (rc < 0)
                return (rc == -EAGAIN) ? 1 : -1;

        size = md->core.end - md->core.start;

        if ((md->core.start & md->core.mask) + size != (md->core.end & md->core.mask)) {
                buf = &data[md->core.start & md->core.mask];
                size = md->core.mask + 1 - (md->core.start & md->core.mask);
                md->core.start += size;

                if (push(md, to, buf, size) < 0) {
                        rc = -1;
                        goto out;
                }
        }

        buf = &data[md->core.start & md->core.mask];
        size = md->core.end - md->core.start;
        md->core.start += size;

        if (push(md, to, buf, size) < 0) {
                rc = -1;
                goto out;
        }

        md->core.prev = head;
        perf_mmap__consume(md);
out:
        return rc;
}

/*
 * Mandatory for overwrite mode.
 * Reads in overwrite mode go backward: the last perf_mmap__read() leaves the
 * tail at map->core.prev, so map->core.prev has to be corrected to head,
 * which is where the next read will end.
 */
void perf_mmap__read_done(struct mmap *map)
{
        /*
         * Check if event was unmapped due to a POLLHUP/POLLERR.
         */
        if (!refcount_read(&map->core.refcnt))
                return;

        map->core.prev = perf_mmap__read_head(map);
}
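
/*
 * Illustrative sketch (not part of this file): one read round over an
 * overwritable (backward) ring buffer, assuming the caller has already
 * paused the ring buffer so the kernel cannot overwrite data mid-read.
 * perf_mmap__read_done() records the current head in map->core.prev, which
 * is where the next round will stop. The 'process' callback is hypothetical.
 */
static void example_overwrite_round(struct mmap *map,
                                    void (*process)(union perf_event *event))
{
        union perf_event *event;

        if (perf_mmap__read_init(map) < 0)
                return;         /* unmapped, empty, or below the flush threshold */

        while ((event = perf_mmap__read_event(map)) != NULL) {
                process(event);
                perf_mmap__consume(map);        /* no tail update in overwrite mode */
        }

        perf_mmap__read_done(map);              /* prev = head for the next round */
}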