1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * thread-stack.c: Synthesize a thread's stack using call / return events
4  * Copyright (c) 2014, Intel Corporation.
5  */
6
7 #include <linux/rbtree.h>
8 #include <linux/list.h>
9 #include <linux/log2.h>
10 #include <errno.h>
11 #include "thread.h"
12 #include "event.h"
13 #include "machine.h"
14 #include "env.h"
15 #include "util.h"
16 #include "debug.h"
17 #include "symbol.h"
18 #include "comm.h"
19 #include "call-path.h"
20 #include "thread-stack.h"
21
22 #define STACK_GROWTH 2048
23
24 /*
25  * State of retpoline detection.
26  *
27  * RETPOLINE_NONE: no retpoline detection
28  * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
29  * X86_RETPOLINE_DETECTED: x86 retpoline detected
30  */
31 enum retpoline_state_t {
32         RETPOLINE_NONE,
33         X86_RETPOLINE_POSSIBLE,
34         X86_RETPOLINE_DETECTED,
35 };
36
37 /**
38  * struct thread_stack_entry - thread stack entry.
39  * @ret_addr: return address
40  * @timestamp: timestamp (if known)
41  * @ref: external reference (e.g. db_id of sample)
42  * @branch_count: the branch count when the entry was created
43  * @insn_count: the instruction count when the entry was created
44  * @cyc_count: the cycle count when the entry was created
45  * @db_id: id used for db-export
46  * @cp: call path
47  * @no_call: a 'call' was not seen
48  * @trace_end: a 'call' but trace ended
49  * @non_call: a branch but not a 'call' to the start of a different symbol
50  */
51 struct thread_stack_entry {
52         u64 ret_addr;
53         u64 timestamp;
54         u64 ref;
55         u64 branch_count;
56         u64 insn_count;
57         u64 cyc_count;
58         u64 db_id;
59         struct call_path *cp;
60         bool no_call;
61         bool trace_end;
62         bool non_call;
63 };
64
65 /**
66  * struct thread_stack - thread stack constructed from 'call' and 'return'
67  *                       branch samples.
68  * @stack: array that holds the stack
69  * @cnt: number of entries in the stack
70  * @sz: current maximum stack size
71  * @trace_nr: current trace number
72  * @branch_count: running branch count
73  * @insn_count: running instruction count
74  * @cyc_count: running cycle count
75  * @kernel_start: kernel start address
76  * @last_time: last timestamp
77  * @crp: call/return processor
78  * @comm: current comm
79  * @arr_sz: size of array if this is the first element of an array
80  * @rstate: used to detect retpolines
81  */
82 struct thread_stack {
83         struct thread_stack_entry *stack;
84         size_t cnt;
85         size_t sz;
86         u64 trace_nr;
87         u64 branch_count;
88         u64 insn_count;
89         u64 cyc_count;
90         u64 kernel_start;
91         u64 last_time;
92         struct call_return_processor *crp;
93         struct comm *comm;
94         unsigned int arr_sz;
95         enum retpoline_state_t rstate;
96 };
97
98 /*
99  * Assume pid == tid == 0 identifies the idle task as defined by
100  * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
101  * and therefore requires a stack for each cpu.
102  */
103 static inline bool thread_stack__per_cpu(struct thread *thread)
104 {
105         return !(thread->tid || thread->pid_);
106 }
107
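/* Grow the stack array by STACK_GROWTH entries, preserving its contents. */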
108 static int thread_stack__grow(struct thread_stack *ts)
109 {
110         struct thread_stack_entry *new_stack;
111         size_t sz, new_sz;
112
113         new_sz = ts->sz + STACK_GROWTH;
114         sz = new_sz * sizeof(struct thread_stack_entry);
115
116         new_stack = realloc(ts->stack, sz);
117         if (!new_stack)
118                 return -ENOMEM;
119
120         ts->stack = new_stack;
121         ts->sz = new_sz;
122
123         return 0;
124 }
125
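/*
 * Allocate the initial stack array and record the machine's kernel start
 * address.  On x86, mark retpoline detection as possible.
 */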
126 static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
127                               struct call_return_processor *crp)
128 {
129         int err;
130
131         err = thread_stack__grow(ts);
132         if (err)
133                 return err;
134
135         if (thread->mg && thread->mg->machine) {
136                 struct machine *machine = thread->mg->machine;
137                 const char *arch = perf_env__arch(machine->env);
138
139                 ts->kernel_start = machine__kernel_start(machine);
140                 if (!strcmp(arch, "x86"))
141                         ts->rstate = X86_RETPOLINE_POSSIBLE;
142         } else {
143                 ts->kernel_start = 1ULL << 63;
144         }
145         ts->crp = crp;
146
147         return 0;
148 }
149
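/*
 * Return the stack for 'cpu', allocating or growing the per-thread array as
 * needed.  The idle task gets one stack per cpu; other threads get a single
 * stack.
 */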
150 static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
151                                               struct call_return_processor *crp)
152 {
153         struct thread_stack *ts = thread->ts, *new_ts;
154         unsigned int old_sz = ts ? ts->arr_sz : 0;
155         unsigned int new_sz = 1;
156
157         if (thread_stack__per_cpu(thread) && cpu > 0)
158                 new_sz = roundup_pow_of_two(cpu + 1);
159
160         if (!ts || new_sz > old_sz) {
161                 new_ts = calloc(new_sz, sizeof(*ts));
162                 if (!new_ts)
163                         return NULL;
164                 if (ts)
165                         memcpy(new_ts, ts, old_sz * sizeof(*ts));
166                 new_ts->arr_sz = new_sz;
167                 zfree(&thread->ts);
168                 thread->ts = new_ts;
169                 ts = new_ts;
170         }
171
172         if (thread_stack__per_cpu(thread) && cpu > 0 &&
173             (unsigned int)cpu < ts->arr_sz)
174                 ts += cpu;
175
176         if (!ts->stack &&
177             thread_stack__init(ts, thread, crp))
178                 return NULL;
179
180         return ts;
181 }
182
183 static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
184 {
185         struct thread_stack *ts = thread->ts;
186
187         if (cpu < 0)
188                 cpu = 0;
189
190         if (!ts || (unsigned int)cpu >= ts->arr_sz)
191                 return NULL;
192
193         ts += cpu;
194
195         if (!ts->stack)
196                 return NULL;
197
198         return ts;
199 }
200
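/* Select the idle task's per-cpu stack, otherwise the thread's single stack. */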
201 static inline struct thread_stack *thread__stack(struct thread *thread,
202                                                     int cpu)
203 {
204         if (!thread)
205                 return NULL;
206
207         if (thread_stack__per_cpu(thread))
208                 return thread__cpu_stack(thread, cpu);
209
210         return thread->ts;
211 }
212
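/*
 * Push a return address.  If growing the stack fails, the existing entries are
 * discarded but the push still goes ahead, and -ENOMEM is returned.
 */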
213 static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
214                               bool trace_end)
215 {
216         int err = 0;
217
218         if (ts->cnt == ts->sz) {
219                 err = thread_stack__grow(ts);
220                 if (err) {
221                         pr_warning("Out of memory: discarding thread stack\n");
222                         ts->cnt = 0;
223                 }
224         }
225
226         ts->stack[ts->cnt].trace_end = trace_end;
227         ts->stack[ts->cnt++].ret_addr = ret_addr;
228
229         return err;
230 }
231
232 static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
233 {
234         size_t i;
235
236         /*
237          * In some cases there may be functions which are not seen to return.
238          * For example, when setjmp / longjmp has been used, or when the perf
239          * context switch in the kernel does not stop and start tracing in exactly
240          * the same code path.  When that happens the return address will be
241          * further down the stack.  If the return address is not found at all,
242          * we assume the opposite (i.e. this is a return for a call that wasn't
243          * seen for some reason) and leave the stack alone.
244          */
245         for (i = ts->cnt; i; ) {
246                 if (ts->stack[--i].ret_addr == ret_addr) {
247                         ts->cnt = i;
248                         return;
249                 }
250         }
251 }
252
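/* Pop consecutive 'trace end' entries from the top of the stack. */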
253 static void thread_stack__pop_trace_end(struct thread_stack *ts)
254 {
255         size_t i;
256
257         for (i = ts->cnt; i; ) {
258                 if (ts->stack[--i].trace_end)
259                         ts->cnt = i;
260                 else
261                         return;
262         }
263 }
264
265 static bool thread_stack__in_kernel(struct thread_stack *ts)
266 {
267         if (!ts->cnt)
268                 return false;
269
270         return ts->stack[ts->cnt - 1].cp->in_kernel;
271 }
272
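/*
 * Fill in a struct call_return for the entry at 'idx' and hand it to the
 * call/return processor.  Branch, instruction and cycle counts are reported as
 * deltas since the entry was created.
 */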
273 static int thread_stack__call_return(struct thread *thread,
274                                      struct thread_stack *ts, size_t idx,
275                                      u64 timestamp, u64 ref, bool no_return)
276 {
277         struct call_return_processor *crp = ts->crp;
278         struct thread_stack_entry *tse;
279         struct call_return cr = {
280                 .thread = thread,
281                 .comm = ts->comm,
282                 .db_id = 0,
283         };
284         u64 *parent_db_id;
285
286         tse = &ts->stack[idx];
287         cr.cp = tse->cp;
288         cr.call_time = tse->timestamp;
289         cr.return_time = timestamp;
290         cr.branch_count = ts->branch_count - tse->branch_count;
291         cr.insn_count = ts->insn_count - tse->insn_count;
292         cr.cyc_count = ts->cyc_count - tse->cyc_count;
293         cr.db_id = tse->db_id;
294         cr.call_ref = tse->ref;
295         cr.return_ref = ref;
296         if (tse->no_call)
297                 cr.flags |= CALL_RETURN_NO_CALL;
298         if (no_return)
299                 cr.flags |= CALL_RETURN_NO_RETURN;
300         if (tse->non_call)
301                 cr.flags |= CALL_RETURN_NON_CALL;
302
303         /*
304          * The parent db_id must be assigned before exporting the child. Note
305          * it is not possible to export the parent first because its information
306          * is not complete until its 'return' has been processed.
307          */
308         parent_db_id = idx ? &(tse - 1)->db_id : NULL;
309
310         return crp->process(&cr, parent_db_id, crp->data);
311 }
312
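/*
 * Empty the stack.  When a call/return processor is in use, each remaining
 * entry is reported as a call that did not return.
 */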
313 static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
314 {
315         struct call_return_processor *crp = ts->crp;
316         int err;
317
318         if (!crp) {
319                 ts->cnt = 0;
320                 return 0;
321         }
322
323         while (ts->cnt) {
324                 err = thread_stack__call_return(thread, ts, --ts->cnt,
325                                                 ts->last_time, 0, true);
326                 if (err) {
327                         pr_err("Error flushing thread stack!\n");
328                         ts->cnt = 0;
329                         return err;
330                 }
331         }
332
333         return 0;
334 }
335
336 int thread_stack__flush(struct thread *thread)
337 {
338         struct thread_stack *ts = thread->ts;
339         unsigned int pos;
340         int err = 0;
341
342         if (ts) {
343                 for (pos = 0; pos < ts->arr_sz; pos++) {
344                         int ret = __thread_stack__flush(thread, ts + pos);
345
346                         if (ret)
347                                 err = ret;
348                 }
349         }
350
351         return err;
352 }
353
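/*
 * Update the stack from a branch sample: push the return address for a 'call'
 * and pop it for a 'return'.  The resulting stack is what
 * thread_stack__sample() reports.  Stops early if thread_stack__process() is
 * in use (i.e. a call/return processor has been set up).
 */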
354 int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
355                         u64 to_ip, u16 insn_len, u64 trace_nr)
356 {
357         struct thread_stack *ts = thread__stack(thread, cpu);
358
359         if (!thread)
360                 return -EINVAL;
361
362         if (!ts) {
363                 ts = thread_stack__new(thread, cpu, NULL);
364                 if (!ts) {
365                         pr_warning("Out of memory: no thread stack\n");
366                         return -ENOMEM;
367                 }
368                 ts->trace_nr = trace_nr;
369         }
370
371         /*
372          * When the trace is discontinuous, the trace_nr changes.  In that case
373          * the stack might be completely invalid.  Better to report nothing than
374          * to report something misleading, so flush the stack.
375          */
376         if (trace_nr != ts->trace_nr) {
377                 if (ts->trace_nr)
378                         __thread_stack__flush(thread, ts);
379                 ts->trace_nr = trace_nr;
380         }
381
382         /* Stop here if thread_stack__process() is in use */
383         if (ts->crp)
384                 return 0;
385
386         if (flags & PERF_IP_FLAG_CALL) {
387                 u64 ret_addr;
388
389                 if (!to_ip)
390                         return 0;
391                 ret_addr = from_ip + insn_len;
392                 if (ret_addr == to_ip)
393                         return 0; /* Zero-length calls are excluded */
394                 return thread_stack__push(ts, ret_addr,
395                                           flags & PERF_IP_FLAG_TRACE_END);
396         } else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
397                 /*
398                  * If the caller did not change the trace number (which would
399                  * have flushed the stack) then try to make sense of the stack.
400                  * Possibly, tracing began after returning to the current
401                  * address, so try to pop that. Also, a call made when the trace
402                  * ended is not expected to return, so pop that too.
403                  */
404                 thread_stack__pop(ts, to_ip);
405                 thread_stack__pop_trace_end(ts);
406         } else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
407                 thread_stack__pop(ts, to_ip);
408         }
409
410         return 0;
411 }
412
413 void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
414 {
415         struct thread_stack *ts = thread__stack(thread, cpu);
416
417         if (!ts)
418                 return;
419
420         if (trace_nr != ts->trace_nr) {
421                 if (ts->trace_nr)
422                         __thread_stack__flush(thread, ts);
423                 ts->trace_nr = trace_nr;
424         }
425 }
426
427 static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
428 {
429         __thread_stack__flush(thread, ts);
430         zfree(&ts->stack);
431 }
432
433 static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
434 {
435         unsigned int arr_sz = ts->arr_sz;
436
437         __thread_stack__free(thread, ts);
438         memset(ts, 0, sizeof(*ts));
439         ts->arr_sz = arr_sz;
440 }
441
442 void thread_stack__free(struct thread *thread)
443 {
444         struct thread_stack *ts = thread->ts;
445         unsigned int pos;
446
447         if (ts) {
448                 for (pos = 0; pos < ts->arr_sz; pos++)
449                         __thread_stack__free(thread, ts + pos);
450                 zfree(&thread->ts);
451         }
452 }
453
454 static inline u64 callchain_context(u64 ip, u64 kernel_start)
455 {
456         return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
457 }
458
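/*
 * Synthesize a callchain from the current stack contents:
 *
 *   ips[0]   PERF_CONTEXT_USER or PERF_CONTEXT_KERNEL
 *   ips[1]   the sampled ip
 *   ips[2..] saved return addresses, innermost first, with a new context
 *            marker inserted whenever the user/kernel boundary is crossed
 */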
459 void thread_stack__sample(struct thread *thread, int cpu,
460                           struct ip_callchain *chain,
461                           size_t sz, u64 ip, u64 kernel_start)
462 {
463         struct thread_stack *ts = thread__stack(thread, cpu);
464         u64 context = callchain_context(ip, kernel_start);
465         u64 last_context;
466         size_t i, j;
467
468         if (sz < 2) {
469                 chain->nr = 0;
470                 return;
471         }
472
473         chain->ips[0] = context;
474         chain->ips[1] = ip;
475
476         if (!ts) {
477                 chain->nr = 2;
478                 return;
479         }
480
481         last_context = context;
482
483         for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
484                 ip = ts->stack[ts->cnt - j].ret_addr;
485                 context = callchain_context(ip, kernel_start);
486                 if (context != last_context) {
487                         if (i >= sz - 1)
488                                 break;
489                         chain->ips[i++] = context;
490                         last_context = context;
491                 }
492                 chain->ips[i] = ip;
493         }
494
495         chain->nr = i;
496 }
497
498 struct call_return_processor *
499 call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
500                            void *data)
501 {
502         struct call_return_processor *crp;
503
504         crp = zalloc(sizeof(struct call_return_processor));
505         if (!crp)
506                 return NULL;
507         crp->cpr = call_path_root__new();
508         if (!crp->cpr)
509                 goto out_free;
510         crp->process = process;
511         crp->data = data;
512         return crp;
513
514 out_free:
515         free(crp);
516         return NULL;
517 }
518
519 void call_return_processor__free(struct call_return_processor *crp)
520 {
521         if (crp) {
522                 call_path_root__free(crp->cpr);
523                 free(crp);
524         }
525 }
526
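/*
 * Like thread_stack__push(), but also record the call path, timestamp,
 * external reference and the running branch/instruction/cycle counts.
 */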
527 static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
528                                  u64 timestamp, u64 ref, struct call_path *cp,
529                                  bool no_call, bool trace_end)
530 {
531         struct thread_stack_entry *tse;
532         int err;
533
534         if (!cp)
535                 return -ENOMEM;
536
537         if (ts->cnt == ts->sz) {
538                 err = thread_stack__grow(ts);
539                 if (err)
540                         return err;
541         }
542
543         tse = &ts->stack[ts->cnt++];
544         tse->ret_addr = ret_addr;
545         tse->timestamp = timestamp;
546         tse->ref = ref;
547         tse->branch_count = ts->branch_count;
548         tse->insn_count = ts->insn_count;
549         tse->cyc_count = ts->cyc_count;
550         tse->cp = cp;
551         tse->no_call = no_call;
552         tse->trace_end = trace_end;
553         tse->non_call = false;
554         tse->db_id = 0;
555
556         return 0;
557 }
558
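/*
 * Pop the entry matching the return address (or, for a single remaining entry,
 * the symbol), reporting any entries above it as calls that did not return.
 * Returns 1 if no matching entry is found.
 */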
559 static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
560                                 u64 ret_addr, u64 timestamp, u64 ref,
561                                 struct symbol *sym)
562 {
563         int err;
564
565         if (!ts->cnt)
566                 return 1;
567
568         if (ts->cnt == 1) {
569                 struct thread_stack_entry *tse = &ts->stack[0];
570
571                 if (tse->cp->sym == sym)
572                         return thread_stack__call_return(thread, ts, --ts->cnt,
573                                                          timestamp, ref, false);
574         }
575
576         if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
577             !ts->stack[ts->cnt - 1].non_call) {
578                 return thread_stack__call_return(thread, ts, --ts->cnt,
579                                                  timestamp, ref, false);
580         } else {
581                 size_t i = ts->cnt - 1;
582
583                 while (i--) {
584                         if (ts->stack[i].ret_addr != ret_addr ||
585                             ts->stack[i].non_call)
586                                 continue;
587                         i += 1;
588                         while (ts->cnt > i) {
589                                 err = thread_stack__call_return(thread, ts,
590                                                                 --ts->cnt,
591                                                                 timestamp, ref,
592                                                                 true);
593                                 if (err)
594                                         return err;
595                         }
596                         return thread_stack__call_return(thread, ts, --ts->cnt,
597                                                          timestamp, ref, false);
598                 }
599         }
600
601         return 1;
602 }
603
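/*
 * The stack is empty: push the sampled symbol, marked 'no_call', to act as the
 * bottom of the call graph.
 */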
604 static int thread_stack__bottom(struct thread_stack *ts,
605                                 struct perf_sample *sample,
606                                 struct addr_location *from_al,
607                                 struct addr_location *to_al, u64 ref)
608 {
609         struct call_path_root *cpr = ts->crp->cpr;
610         struct call_path *cp;
611         struct symbol *sym;
612         u64 ip;
613
614         if (sample->ip) {
615                 ip = sample->ip;
616                 sym = from_al->sym;
617         } else if (sample->addr) {
618                 ip = sample->addr;
619                 sym = to_al->sym;
620         } else {
621                 return 0;
622         }
623
624         cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
625                                 ts->kernel_start);
626
627         return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
628                                      true, false);
629 }
630
631 static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
632                                 struct perf_sample *sample, u64 ref)
633 {
634         u64 tm = sample->time;
635         int err;
636
637         /* Return to userspace, so pop all kernel addresses */
638         while (thread_stack__in_kernel(ts)) {
639                 err = thread_stack__call_return(thread, ts, --ts->cnt,
640                                                 tm, ref, true);
641                 if (err)
642                         return err;
643         }
644
645         return 0;
646 }
647
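/*
 * A 'return' was seen that does not match the top of the stack.  Make a best
 * effort to keep the call graph consistent: handle returns from the kernel,
 * 'return' used as a jump (e.g. retpoline), and calls made before the trace
 * started.
 */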
648 static int thread_stack__no_call_return(struct thread *thread,
649                                         struct thread_stack *ts,
650                                         struct perf_sample *sample,
651                                         struct addr_location *from_al,
652                                         struct addr_location *to_al, u64 ref)
653 {
654         struct call_path_root *cpr = ts->crp->cpr;
655         struct call_path *root = &cpr->call_path;
656         struct symbol *fsym = from_al->sym;
657         struct symbol *tsym = to_al->sym;
658         struct call_path *cp, *parent;
659         u64 ks = ts->kernel_start;
660         u64 addr = sample->addr;
661         u64 tm = sample->time;
662         u64 ip = sample->ip;
663         int err;
664
665         if (ip >= ks && addr < ks) {
666                 /* Return to userspace, so pop all kernel addresses */
667                 err = thread_stack__pop_ks(thread, ts, sample, ref);
668                 if (err)
669                         return err;
670
671                 /* If the stack is empty, push the userspace address */
672                 if (!ts->cnt) {
673                         cp = call_path__findnew(cpr, root, tsym, addr, ks);
674                         return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
675                                                      false);
676                 }
677         } else if (thread_stack__in_kernel(ts) && ip < ks) {
678                 /* Return to userspace, so pop all kernel addresses */
679                 err = thread_stack__pop_ks(thread, ts, sample, ref);
680                 if (err)
681                         return err;
682         }
683
684         if (ts->cnt)
685                 parent = ts->stack[ts->cnt - 1].cp;
686         else
687                 parent = root;
688
689         if (parent->sym == from_al->sym) {
690                 /*
691                  * At the bottom of the stack, assume the missing 'call' was
692                  * before the trace started. So, pop the current symbol and push
693                  * the 'to' symbol.
694                  */
695                 if (ts->cnt == 1) {
696                         err = thread_stack__call_return(thread, ts, --ts->cnt,
697                                                         tm, ref, false);
698                         if (err)
699                                 return err;
700                 }
701
702                 if (!ts->cnt) {
703                         cp = call_path__findnew(cpr, root, tsym, addr, ks);
704
705                         return thread_stack__push_cp(ts, addr, tm, ref, cp,
706                                                      true, false);
707                 }
708
709                 /*
710                  * Otherwise assume the 'return' is being used as a jump (e.g.
711                  * retpoline) and just push the 'to' symbol.
712                  */
713                 cp = call_path__findnew(cpr, parent, tsym, addr, ks);
714
715                 err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
716                 if (!err)
717                         ts->stack[ts->cnt - 1].non_call = true;
718
719                 return err;
720         }
721
722         /*
723          * Assume 'parent' has not yet returned, so push 'to', and then push and
724          * pop 'from'.
725          */
726
727         cp = call_path__findnew(cpr, parent, tsym, addr, ks);
728
729         err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
730         if (err)
731                 return err;
732
733         cp = call_path__findnew(cpr, cp, fsym, ip, ks);
734
735         err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
736         if (err)
737                 return err;
738
739         return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
740 }
741
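/*
 * Tracing has restarted: if the top of the stack is a 'trace end' entry,
 * report it and pop it.
 */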
742 static int thread_stack__trace_begin(struct thread *thread,
743                                      struct thread_stack *ts, u64 timestamp,
744                                      u64 ref)
745 {
746         struct thread_stack_entry *tse;
747         int err;
748
749         if (!ts->cnt)
750                 return 0;
751
752         /* Pop trace end */
753         tse = &ts->stack[ts->cnt - 1];
754         if (tse->trace_end) {
755                 err = thread_stack__call_return(thread, ts, --ts->cnt,
756                                                 timestamp, ref, false);
757                 if (err)
758                         return err;
759         }
760
761         return 0;
762 }
763
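/*
 * Tracing is stopping: push a 'trace end' entry, with the would-be return
 * address, to represent the region where tracing is off.
 */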
764 static int thread_stack__trace_end(struct thread_stack *ts,
765                                    struct perf_sample *sample, u64 ref)
766 {
767         struct call_path_root *cpr = ts->crp->cpr;
768         struct call_path *cp;
769         u64 ret_addr;
770
771         /* No point having 'trace end' on the bottom of the stack */
772         if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
773                 return 0;
774
775         cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
776                                 ts->kernel_start);
777
778         ret_addr = sample->ip + sample->insn_len;
779
780         return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
781                                      false, true);
782 }
783
784 static bool is_x86_retpoline(const char *name)
785 {
786         const char *p = strstr(name, "__x86_indirect_thunk_");
787
788         return p == name || !strcmp(name, "__indirect_thunk_start");
789 }
790
791 /*
792  * x86 retpoline functions pollute the call graph. This function removes them.
793  * This does not handle function return thunks, nor is there any improvement
794  * for the handling of inline thunks or extern thunks.
795  */
796 static int thread_stack__x86_retpoline(struct thread_stack *ts,
797                                        struct perf_sample *sample,
798                                        struct addr_location *to_al)
799 {
800         struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
801         struct call_path_root *cpr = ts->crp->cpr;
802         struct symbol *sym = tse->cp->sym;
803         struct symbol *tsym = to_al->sym;
804         struct call_path *cp;
805
806         if (sym && is_x86_retpoline(sym->name)) {
807                 /*
808                  * This is an x86 retpoline function. It pollutes the call graph by
809                  * showing up everywhere there is an indirect branch, but does
810                  * not itself mean anything. Here the top-of-stack is removed,
811                  * by decrementing the stack count, and then further down, the
812                  * resulting top-of-stack is replaced with the actual target.
813                  * The result is that the retpoline functions will no longer
814                  * appear in the call graph. Note this only affects the call
815                  * graph, since all the original branches are left unchanged.
816                  */
817                 ts->cnt -= 1;
818                 sym = ts->stack[ts->cnt - 2].cp->sym;
819                 if (sym && sym == tsym && to_al->addr != tsym->start) {
820                         /*
821                          * Target is back to the middle of the symbol we came
822                          * from so assume it is an indirect jmp and forget it
823                          * altogether.
824                          */
825                         ts->cnt -= 1;
826                         return 0;
827                 }
828         } else if (sym && sym == tsym) {
829                 /*
830                  * Target is back to the symbol we came from so assume it is an
831                  * indirect jmp and forget it altogether.
832                  */
833                 ts->cnt -= 1;
834                 return 0;
835         }
836
837         cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
838                                 sample->addr, ts->kernel_start);
839         if (!cp)
840                 return -ENOMEM;
841
842         /* Replace the top-of-stack with the actual target */
843         ts->stack[ts->cnt - 1].cp = cp;
844
845         return 0;
846 }
847
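/*
 * Update the stack from a branch sample and export completed calls via the
 * call/return processor (e.g. for db-export).  Supersedes
 * thread_stack__event().
 */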
848 int thread_stack__process(struct thread *thread, struct comm *comm,
849                           struct perf_sample *sample,
850                           struct addr_location *from_al,
851                           struct addr_location *to_al, u64 ref,
852                           struct call_return_processor *crp)
853 {
854         struct thread_stack *ts = thread__stack(thread, sample->cpu);
855         enum retpoline_state_t rstate;
856         int err = 0;
857
858         if (ts && !ts->crp) {
859                 /* Supersede thread_stack__event() */
860                 thread_stack__reset(thread, ts);
861                 ts = NULL;
862         }
863
864         if (!ts) {
865                 ts = thread_stack__new(thread, sample->cpu, crp);
866                 if (!ts)
867                         return -ENOMEM;
868                 ts->comm = comm;
869         }
870
871         rstate = ts->rstate;
872         if (rstate == X86_RETPOLINE_DETECTED)
873                 ts->rstate = X86_RETPOLINE_POSSIBLE;
874
875         /* Flush stack on exec */
876         if (ts->comm != comm && thread->pid_ == thread->tid) {
877                 err = __thread_stack__flush(thread, ts);
878                 if (err)
879                         return err;
880                 ts->comm = comm;
881         }
882
883         /* If the stack is empty, put the current symbol on the stack */
884         if (!ts->cnt) {
885                 err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
886                 if (err)
887                         return err;
888         }
889
890         ts->branch_count += 1;
891         ts->insn_count += sample->insn_cnt;
892         ts->cyc_count += sample->cyc_cnt;
893         ts->last_time = sample->time;
894
895         if (sample->flags & PERF_IP_FLAG_CALL) {
896                 bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
897                 struct call_path_root *cpr = ts->crp->cpr;
898                 struct call_path *cp;
899                 u64 ret_addr;
900
901                 if (!sample->ip || !sample->addr)
902                         return 0;
903
904                 ret_addr = sample->ip + sample->insn_len;
905                 if (ret_addr == sample->addr)
906                         return 0; /* Zero-length calls are excluded */
907
908                 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
909                                         to_al->sym, sample->addr,
910                                         ts->kernel_start);
911                 err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
912                                             cp, false, trace_end);
913
914                 /*
915                  * A call to the same symbol, but not to the start of the symbol,
916                  * may be the start of an x86 retpoline.
917                  */
918                 if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
919                     from_al->sym == to_al->sym &&
920                     to_al->addr != to_al->sym->start)
921                         ts->rstate = X86_RETPOLINE_DETECTED;
922
923         } else if (sample->flags & PERF_IP_FLAG_RETURN) {
924                 if (!sample->addr) {
925                         u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
926                                                  PERF_IP_FLAG_INTERRUPT;
927
928                         if (!(sample->flags & return_from_kernel))
929                                 return 0;
930
931                         /* Pop kernel stack */
932                         return thread_stack__pop_ks(thread, ts, sample, ref);
933                 }
934
935                 if (!sample->ip)
936                         return 0;
937
938                 /* x86 retpoline 'return' doesn't match the stack */
939                 if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
940                     ts->stack[ts->cnt - 1].ret_addr != sample->addr)
941                         return thread_stack__x86_retpoline(ts, sample, to_al);
942
943                 err = thread_stack__pop_cp(thread, ts, sample->addr,
944                                            sample->time, ref, from_al->sym);
945                 if (err) {
946                         if (err < 0)
947                                 return err;
948                         err = thread_stack__no_call_return(thread, ts, sample,
949                                                            from_al, to_al, ref);
950                 }
951         } else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
952                 err = thread_stack__trace_begin(thread, ts, sample->time, ref);
953         } else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
954                 err = thread_stack__trace_end(ts, sample, ref);
955         } else if (sample->flags & PERF_IP_FLAG_BRANCH &&
956                    from_al->sym != to_al->sym && to_al->sym &&
957                    to_al->addr == to_al->sym->start) {
958                 struct call_path_root *cpr = ts->crp->cpr;
959                 struct call_path *cp;
960
961                 /*
962                  * The compiler might optimize a call/ret combination by making
963                  * it a jmp. Make that visible by recording on the stack a
964                  * branch to the start of a different symbol. Note that this means
965                  * when a ret pops the stack, all jmps must be popped off first.
966                  */
967                 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
968                                         to_al->sym, sample->addr,
969                                         ts->kernel_start);
970                 err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
971                                             false);
972                 if (!err)
973                         ts->stack[ts->cnt - 1].non_call = true;
974         }
975
976         return err;
977 }
978
979 size_t thread_stack__depth(struct thread *thread, int cpu)
980 {
981         struct thread_stack *ts = thread__stack(thread, cpu);
982
983         if (!ts)
984                 return 0;
985         return ts->cnt;
986 }