tools/perf/builtin-sched.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include "builtin.h"
   3 #include "perf.h"
   4 #include "perf-sys.h"
   5
   6 #include "util/cpumap.h"
   7 #include "util/evlist.h"
   8 #include "util/evsel.h"
   9 #include "util/symbol.h"
  10 #include "util/thread.h"
  11 #include "util/header.h"
  12 #include "util/session.h"
  13 #include "util/tool.h"
  14 #include "util/cloexec.h"
  15 #include "util/thread_map.h"
  16 #include "util/color.h"
  17 #include "util/stat.h"
  18 #include "util/string2.h"
  19 #include "util/callchain.h"
  20 #include "util/time-utils.h"
  21
  22 #include <subcmd/pager.h>
  23 #include <subcmd/parse-options.h>
  24 #include "util/trace-event.h"
  25
  26 #include "util/debug.h"
  27 #include "util/event.h"
  28
  29 #include <linux/kernel.h>
  30 #include <linux/log2.h>
  31 #include <linux/zalloc.h>
  32 #include <sys/prctl.h>
  33 #include <sys/resource.h>
  34 #include <inttypes.h>
  35
  36 #include <errno.h>
  37 #include <semaphore.h>
  38 #include <pthread.h>
  39 #include <math.h>
  40 #include <api/fs/fs.h>
  41 #include <perf/cpumap.h>
  42 #include <linux/time64.h>
  43 #include <linux/err.h>
  44
  45 #include <linux/ctype.h>
  46
/*
 * File-wide constants.
 *
 * MAX_CPUS sizes the fixed per-CPU arrays and bitmap declared below,
 * COMM_LEN sizes task_desc::comm (terminating NUL included) and MAX_PID is
 * the pid-table size used by register_pid() when kernel/pid_max cannot be
 * read.  SYM_LEN is not referenced in this part of the file.
 */
#define PR_SET_NAME             15               /* Set process name */
#define MAX_CPUS                4096
#define COMM_LEN                20
#define SYM_LEN                 129
#define MAX_PID                 1024000
  52
  53 struct sched_atom;
  54
/*
 * One task of the replay: its identity, the list of recorded scheduling
 * atoms, and the thread plus semaphores used to replay them.
 */
struct task_desc {
        unsigned long           nr;             /* index of this task in perf_sched::tasks */
        unsigned long           pid;
        char                    comm[COMM_LEN];

        unsigned long           nr_events;      /* number of entries in atoms[] */
        unsigned long           curr_event;     /* index of the atom currently being replayed */
        struct sched_atom       **atoms;        /* grown by one slot per new event */

        pthread_t               thread;         /* replay thread running thread_func() */
        sem_t                   sleep_sem;

        sem_t                   ready_for_work; /* posted by the thread at the start of each pass */
        sem_t                   work_done_sem;  /* posted by the thread once all atoms were replayed */

        u64                     cpu_usage;      /* task-clock delta measured over the last pass */
};
  72
  73 enum sched_event_type {
  74         SCHED_EVENT_RUN,
  75         SCHED_EVENT_SLEEP,
  76         SCHED_EVENT_WAKEUP,
  77         SCHED_EVENT_MIGRATION,
  78 };
  79
/* A single recorded scheduling action that a replay thread re-executes. */
struct sched_atom {
        enum sched_event_type   type;
        int                     specific_wait;  /* set to 1 once a wakeup was paired with this sleep */
        u64                     timestamp;
        u64                     duration;       /* nanoseconds to burn for SCHED_EVENT_RUN */
        unsigned long           nr;             /* index of this atom in task_desc::atoms */
        sem_t                   *wait_sem;      /* shared between a sleep atom and the wakeup targeting it */
        struct task_desc        *wakee;         /* target task of a SCHED_EVENT_WAKEUP */
};
  89
  90 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
  91
  92 /* task state bitmask, copied from include/linux/sched.h */
  93 #define TASK_RUNNING            0
  94 #define TASK_INTERRUPTIBLE      1
  95 #define TASK_UNINTERRUPTIBLE    2
  96 #define __TASK_STOPPED          4
  97 #define __TASK_TRACED           8
  98 /* in tsk->exit_state */
  99 #define EXIT_DEAD               16
 100 #define EXIT_ZOMBIE             32
 101 #define EXIT_TRACE              (EXIT_ZOMBIE | EXIT_DEAD)
 102 /* in tsk->state again */
 103 #define TASK_DEAD               64
 104 #define TASK_WAKEKILL           128
 105 #define TASK_WAKING             256
 106 #define TASK_PARKED             512
 107
 108 enum thread_state {
 109         THREAD_SLEEPING = 0,
 110         THREAD_WAIT_CPU,
 111         THREAD_SCHED_IN,
 112         THREAD_IGNORE
 113 };
 114
/*
 * One unit of latency accounting: a state plus the timestamps and runtime
 * collected for it.  NOTE(review): the code that fills these fields is not
 * visible in this part of the file.
 */
struct work_atom {
        struct list_head        list;           /* presumably linked on work_atoms::work_list - confirm */
        enum thread_state       state;
        u64                     sched_out_time;
        u64                     wake_up_time;
        u64                     sched_in_time;
        u64                     runtime;
};
 123
/*
 * Per-thread collection of work_atom entries together with aggregate
 * latency/runtime figures; carries an rb_node so it can live in an rbtree.
 * NOTE(review): the users of this struct are outside the visible chunk.
 */
struct work_atoms {
        struct list_head        work_list;
        struct thread           *thread;
        struct rb_node          node;
        u64                     max_lat;
        u64                     max_lat_at;
        u64                     total_lat;
        u64                     nb_atoms;
        u64                     total_runtime;
        int                     num_merged;
};
 135
/* Comparison callback over two work_atoms entries (stored in sort_dimension::cmp). */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
 137
 138 struct perf_sched;
 139
/*
 * Table of per-event callbacks.  struct perf_sched points at one of these
 * through its tp_handler member; every callback receives the perf_sched
 * instance and the machine, and returns an int status.
 */
struct trace_sched_handler {
        int (*switch_event)(struct perf_sched *sched, struct evsel *evsel,
                            struct perf_sample *sample, struct machine *machine);

        int (*runtime_event)(struct perf_sched *sched, struct evsel *evsel,
                             struct perf_sample *sample, struct machine *machine);

        int (*wakeup_event)(struct perf_sched *sched, struct evsel *evsel,
                            struct perf_sample *sample, struct machine *machine);

        /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
        int (*fork_event)(struct perf_sched *sched, union perf_event *event,
                          struct machine *machine);

        int (*migrate_task_event)(struct perf_sched *sched,
                                  struct evsel *evsel,
                                  struct perf_sample *sample,
                                  struct machine *machine);
};
 159
/* Colours for highlighted pids and cpus; NOTE(review): their users are outside the visible chunk. */
#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED
 162
/*
 * Map-related settings embedded in struct perf_sched as 'map'.  Each *_str
 * member sits next to a parsed map of the same name; presumably the string
 * is the raw option text the map is built from - confirm against the option
 * parsing code, which is not visible here.
 */
struct perf_sched_map {
        DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
        int                     *comp_cpus;
        bool                     comp;
        struct perf_thread_map *color_pids;
        const char              *color_pids_str;
        struct perf_cpu_map     *color_cpus;
        const char              *color_cpus_str;
        struct perf_cpu_map     *cpus;
        const char              *cpus_str;
};
 174
/*
 * Global state of the tool: the task tables and synchronisation used for
 * replay, per-CPU tracking arrays, event counters, timing statistics and
 * the option flags.
 */
struct perf_sched {
        struct perf_tool tool;
        const char       *sort_order;
        unsigned long    nr_tasks;              /* number of entries in tasks[] */
        struct task_desc **pid_to_task;         /* lookup table indexed by pid */
        struct task_desc **tasks;               /* indexed by task_desc::nr */
        const struct trace_sched_handler *tp_handler;
        pthread_mutex_t  start_work_mutex;      /* held by the main thread to hold back replay threads */
        pthread_mutex_t  work_done_wait_mutex;  /* held by the main thread while a pass is running */
        int              profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
        int              max_cpu;
        u32              curr_pid[MAX_CPUS];
        struct thread    *curr_thread[MAX_CPUS];
        char             next_shortname1;
        char             next_shortname2;
        unsigned int     replay_repeat;         /* divisor of the running averages kept below */
        unsigned long    nr_run_events;
        unsigned long    nr_sleep_events;
        unsigned long    nr_wakeup_events;
        unsigned long    nr_sleep_corrections;
        unsigned long    nr_run_events_optimized; /* RUN events merged into their predecessor */
        unsigned long    targetless_wakeups;    /* wakeups whose wakee's last atom was not a sleep */
        unsigned long    multitarget_wakeups;   /* wakeups whose target sleep already had a waker */
        unsigned long    nr_runs;
        unsigned long    nr_timestamps;
        unsigned long    nr_unordered_timestamps;
        unsigned long    nr_context_switch_bugs;
        unsigned long    nr_events;
        unsigned long    nr_lost_chunks;
        unsigned long    nr_lost_events;
        u64              run_measurement_overhead;   /* set by calibrate_run_measurement_overhead() */
        u64              sleep_measurement_overhead; /* set by calibrate_sleep_measurement_overhead() */
        u64              start_time;
        u64              cpu_usage;             /* sum of task_desc::cpu_usage over the last pass */
        u64              runavg_cpu_usage;
        u64              parent_cpu_usage;      /* getrusage() delta of the process over the last pass */
        u64              runavg_parent_cpu_usage;
        u64              sum_runtime;
        u64              sum_fluct;
        u64              run_avg;
        u64              all_runtime;
        u64              all_count;
        u64              cpu_last_switched[MAX_CPUS]; /* timestamp of the last sched_switch seen per cpu */
        struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
        struct list_head sort_list, cmp_pid;
        bool force;                             /* lets self_open_counters() raise RLIMIT_NOFILE */
        bool skip_merge;
        struct perf_sched_map map;

        /* options for timehist command */
        bool            summary;
        bool            summary_only;
        bool            idle_hist;
        bool            show_callchain;
        unsigned int    max_stack;
        bool            show_cpu_visual;
        bool            show_wakeups;
        bool            show_next;
        bool            show_migrations;
        bool            show_state;
        u64             skipped_samples;
        const char      *time_str;
        struct perf_time_interval ptime;
        struct perf_time_interval hist_time;
};
 244
/*
 * per thread run time data
 *
 * Allocated zeroed and attached to a thread as its private data by
 * thread__init_runtime().
 */
struct thread_runtime {
        u64 last_time;      /* time of previous sched in/out event */
        u64 dt_run;         /* run time */
        u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
        u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
        u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
        u64 dt_delay;       /* time between wakeup and sched-in */
        u64 ready_to_run;   /* time of wakeup */

        struct stats run_stats;
        u64 total_run_time;
        u64 total_sleep_time;
        u64 total_iowait_time;
        u64 total_preempt_time;
        u64 total_delay_time;

        int last_state;

        char shortname[3];
        bool comm_changed;

        u64 migrations;
};
 269
/*
 * per event run time data
 *
 * last_time is a per-cpu array; ncpu records how many slots it has.
 */
struct evsel_runtime {
        u64 *last_time; /* time this event was last seen per cpu */
        u32 ncpu;       /* highest cpu slot allocated */
};
 275
/*
 * per cpu idle time data
 *
 * Embeds a struct thread_runtime as its first member, followed by extra
 * bookkeeping (last thread, sorted rbtree root, callchain root and cursor).
 */
struct idle_thread_runtime {
        struct thread_runtime   tr;
        struct thread           *last_thread;
        struct rb_root_cached   sorted_root;
        struct callchain_root   callchain;
        struct callchain_cursor cursor;
};
 284
/*
 * track idle times per cpu
 *
 * idle_threads is an array of thread pointers and idle_max_cpu an int that
 * presumably bounds it (confirm against the allocation code, not visible
 * here); idle_comm holds the string "<idle>".
 */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";
 289
 290 static u64 get_nsecs(void)
 291 {
 292         struct timespec ts;
 293
 294         clock_gettime(CLOCK_MONOTONIC, &ts);
 295
 296         return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
 297 }
 298
 299 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
 300 {
 301         u64 T0 = get_nsecs(), T1;
 302
 303         do {
 304                 T1 = get_nsecs();
 305         } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
 306 }
 307
 308 static void sleep_nsecs(u64 nsecs)
 309 {
 310         struct timespec ts;
 311
 312         ts.tv_nsec = nsecs % 999999999;
 313         ts.tv_sec = nsecs / 999999999;
 314
 315         nanosleep(&ts, NULL);
 316 }
 317
 318 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
 319 {
 320         u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
 321         int i;
 322
 323         for (i = 0; i < 10; i++) {
 324                 T0 = get_nsecs();
 325                 burn_nsecs(sched, 0);
 326                 T1 = get_nsecs();
 327                 delta = T1-T0;
 328                 min_delta = min(min_delta, delta);
 329         }
 330         sched->run_measurement_overhead = min_delta;
 331
 332         printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
 333 }
 334
 335 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
 336 {
 337         u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
 338         int i;
 339
 340         for (i = 0; i < 10; i++) {
 341                 T0 = get_nsecs();
 342                 sleep_nsecs(10000);
 343                 T1 = get_nsecs();
 344                 delta = T1-T0;
 345                 min_delta = min(min_delta, delta);
 346         }
 347         min_delta -= 10000;
 348         sched->sleep_measurement_overhead = min_delta;
 349
 350         printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
 351 }
 352
 353 static struct sched_atom *
 354 get_new_event(struct task_desc *task, u64 timestamp)
 355 {
 356         struct sched_atom *event = zalloc(sizeof(*event));
 357         unsigned long idx = task->nr_events;
 358         size_t size;
 359
 360         event->timestamp = timestamp;
 361         event->nr = idx;
 362
 363         task->nr_events++;
 364         size = sizeof(struct sched_atom *) * task->nr_events;
 365         task->atoms = realloc(task->atoms, size);
 366         BUG_ON(!task->atoms);
 367
 368         task->atoms[idx] = event;
 369
 370         return event;
 371 }
 372
 373 static struct sched_atom *last_event(struct task_desc *task)
 374 {
 375         if (!task->nr_events)
 376                 return NULL;
 377
 378         return task->atoms[task->nr_events - 1];
 379 }
 380
 381 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
 382                                 u64 timestamp, u64 duration)
 383 {
 384         struct sched_atom *event, *curr_event = last_event(task);
 385
 386         /*
 387          * optimize an existing RUN event by merging this one
 388          * to it:
 389          */
 390         if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
 391                 sched->nr_run_events_optimized++;
 392                 curr_event->duration += duration;
 393                 return;
 394         }
 395
 396         event = get_new_event(task, timestamp);
 397
 398         event->type = SCHED_EVENT_RUN;
 399         event->duration = duration;
 400
 401         sched->nr_run_events++;
 402 }
 403
 404 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
 405                                    u64 timestamp, struct task_desc *wakee)
 406 {
 407         struct sched_atom *event, *wakee_event;
 408
 409         event = get_new_event(task, timestamp);
 410         event->type = SCHED_EVENT_WAKEUP;
 411         event->wakee = wakee;
 412
 413         wakee_event = last_event(wakee);
 414         if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
 415                 sched->targetless_wakeups++;
 416                 return;
 417         }
 418         if (wakee_event->wait_sem) {
 419                 sched->multitarget_wakeups++;
 420                 return;
 421         }
 422
 423         wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
 424         sem_init(wakee_event->wait_sem, 0, 0);
 425         wakee_event->specific_wait = 1;
 426         event->wait_sem = wakee_event->wait_sem;
 427
 428         sched->nr_wakeup_events++;
 429 }
 430
 431 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
 432                                   u64 timestamp, u64 task_state __maybe_unused)
 433 {
 434         struct sched_atom *event = get_new_event(task, timestamp);
 435
 436         event->type = SCHED_EVENT_SLEEP;
 437
 438         sched->nr_sleep_events++;
 439 }
 440
 441 static struct task_desc *register_pid(struct perf_sched *sched,
 442                                       unsigned long pid, const char *comm)
 443 {
 444         struct task_desc *task;
 445         static int pid_max;
 446
 447         if (sched->pid_to_task == NULL) {
 448                 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
 449                         pid_max = MAX_PID;
 450                 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
 451         }
 452         if (pid >= (unsigned long)pid_max) {
 453                 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
 454                         sizeof(struct task_desc *))) == NULL);
 455                 while (pid >= (unsigned long)pid_max)
 456                         sched->pid_to_task[pid_max++] = NULL;
 457         }
 458
 459         task = sched->pid_to_task[pid];
 460
 461         if (task)
 462                 return task;
 463
 464         task = zalloc(sizeof(*task));
 465         task->pid = pid;
 466         task->nr = sched->nr_tasks;
 467         strcpy(task->comm, comm);
 468         /*
 469          * every task starts in sleeping state - this gets ignored
 470          * if there's no wakeup pointing to this sleep state:
 471          */
 472         add_sched_event_sleep(sched, task, 0, 0);
 473
 474         sched->pid_to_task[pid] = task;
 475         sched->nr_tasks++;
 476         sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
 477         BUG_ON(!sched->tasks);
 478         sched->tasks[task->nr] = task;
 479
 480         if (verbose > 0)
 481                 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
 482
 483         return task;
 484 }
 485
 486
 487 static void print_task_traces(struct perf_sched *sched)
 488 {
 489         struct task_desc *task;
 490         unsigned long i;
 491
 492         for (i = 0; i < sched->nr_tasks; i++) {
 493                 task = sched->tasks[i];
 494                 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
 495                         task->nr, task->comm, task->pid, task->nr_events);
 496         }
 497 }
 498
 499 static void add_cross_task_wakeups(struct perf_sched *sched)
 500 {
 501         struct task_desc *task1, *task2;
 502         unsigned long i, j;
 503
 504         for (i = 0; i < sched->nr_tasks; i++) {
 505                 task1 = sched->tasks[i];
 506                 j = i + 1;
 507                 if (j == sched->nr_tasks)
 508                         j = 0;
 509                 task2 = sched->tasks[j];
 510                 add_sched_event_wakeup(sched, task1, 0, task2);
 511         }
 512 }
 513
 514 static void perf_sched__process_event(struct perf_sched *sched,
 515                                       struct sched_atom *atom)
 516 {
 517         int ret = 0;
 518
 519         switch (atom->type) {
 520                 case SCHED_EVENT_RUN:
 521                         burn_nsecs(sched, atom->duration);
 522                         break;
 523                 case SCHED_EVENT_SLEEP:
 524                         if (atom->wait_sem)
 525                                 ret = sem_wait(atom->wait_sem);
 526                         BUG_ON(ret);
 527                         break;
 528                 case SCHED_EVENT_WAKEUP:
 529                         if (atom->wait_sem)
 530                                 ret = sem_post(atom->wait_sem);
 531                         BUG_ON(ret);
 532                         break;
 533                 case SCHED_EVENT_MIGRATION:
 534                         break;
 535                 default:
 536                         BUG_ON(1);
 537         }
 538 }
 539
 540 static u64 get_cpu_usage_nsec_parent(void)
 541 {
 542         struct rusage ru;
 543         u64 sum;
 544         int err;
 545
 546         err = getrusage(RUSAGE_SELF, &ru);
 547         BUG_ON(err);
 548
 549         sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
 550         sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
 551
 552         return sum;
 553 }
 554
 555 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
 556 {
 557         struct perf_event_attr attr;
 558         char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
 559         int fd;
 560         struct rlimit limit;
 561         bool need_privilege = false;
 562
 563         memset(&attr, 0, sizeof(attr));
 564
 565         attr.type = PERF_TYPE_SOFTWARE;
 566         attr.config = PERF_COUNT_SW_TASK_CLOCK;
 567
 568 force_again:
 569         fd = sys_perf_event_open(&attr, 0, -1, -1,
 570                                  perf_event_open_cloexec_flag());
 571
 572         if (fd < 0) {
 573                 if (errno == EMFILE) {
 574                         if (sched->force) {
 575                                 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
 576                                 limit.rlim_cur += sched->nr_tasks - cur_task;
 577                                 if (limit.rlim_cur > limit.rlim_max) {
 578                                         limit.rlim_max = limit.rlim_cur;
 579                                         need_privilege = true;
 580                                 }
 581                                 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
 582                                         if (need_privilege && errno == EPERM)
 583                                                 strcpy(info, "Need privilege\n");
 584                                 } else
 585                                         goto force_again;
 586                         } else
 587                                 strcpy(info, "Have a try with -f option\n");
 588                 }
 589                 pr_err("Error: sys_perf_event_open() syscall returned "
 590                        "with %d (%s)\n%s", fd,
 591                        str_error_r(errno, sbuf, sizeof(sbuf)), info);
 592                 exit(EXIT_FAILURE);
 593         }
 594         return fd;
 595 }
 596
 597 static u64 get_cpu_usage_nsec_self(int fd)
 598 {
 599         u64 runtime;
 600         int ret;
 601
 602         ret = read(fd, &runtime, sizeof(runtime));
 603         BUG_ON(ret != sizeof(runtime));
 604
 605         return runtime;
 606 }
 607
/*
 * Start-up arguments for one replay thread.  Allocated by create_tasks()
 * and freed by thread_func() once it has copied the fields out.
 */
struct sched_thread_parms {
        struct task_desc  *task;
        struct perf_sched *sched;
        int fd;         /* counter fd returned by self_open_counters() */
};
 613
 614 static void *thread_func(void *ctx)
 615 {
 616         struct sched_thread_parms *parms = ctx;
 617         struct task_desc *this_task = parms->task;
 618         struct perf_sched *sched = parms->sched;
 619         u64 cpu_usage_0, cpu_usage_1;
 620         unsigned long i, ret;
 621         char comm2[22];
 622         int fd = parms->fd;
 623
 624         zfree(&parms);
 625
 626         sprintf(comm2, ":%s", this_task->comm);
 627         prctl(PR_SET_NAME, comm2);
 628         if (fd < 0)
 629                 return NULL;
 630 again:
 631         ret = sem_post(&this_task->ready_for_work);
 632         BUG_ON(ret);
 633         ret = pthread_mutex_lock(&sched->start_work_mutex);
 634         BUG_ON(ret);
 635         ret = pthread_mutex_unlock(&sched->start_work_mutex);
 636         BUG_ON(ret);
 637
 638         cpu_usage_0 = get_cpu_usage_nsec_self(fd);
 639
 640         for (i = 0; i < this_task->nr_events; i++) {
 641                 this_task->curr_event = i;
 642                 perf_sched__process_event(sched, this_task->atoms[i]);
 643         }
 644
 645         cpu_usage_1 = get_cpu_usage_nsec_self(fd);
 646         this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
 647         ret = sem_post(&this_task->work_done_sem);
 648         BUG_ON(ret);
 649
 650         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
 651         BUG_ON(ret);
 652         ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
 653         BUG_ON(ret);
 654
 655         goto again;
 656 }
 657
 658 static void create_tasks(struct perf_sched *sched)
 659 {
 660         struct task_desc *task;
 661         pthread_attr_t attr;
 662         unsigned long i;
 663         int err;
 664
 665         err = pthread_attr_init(&attr);
 666         BUG_ON(err);
 667         err = pthread_attr_setstacksize(&attr,
 668                         (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
 669         BUG_ON(err);
 670         err = pthread_mutex_lock(&sched->start_work_mutex);
 671         BUG_ON(err);
 672         err = pthread_mutex_lock(&sched->work_done_wait_mutex);
 673         BUG_ON(err);
 674         for (i = 0; i < sched->nr_tasks; i++) {
 675                 struct sched_thread_parms *parms = malloc(sizeof(*parms));
 676                 BUG_ON(parms == NULL);
 677                 parms->task = task = sched->tasks[i];
 678                 parms->sched = sched;
 679                 parms->fd = self_open_counters(sched, i);
 680                 sem_init(&task->sleep_sem, 0, 0);
 681                 sem_init(&task->ready_for_work, 0, 0);
 682                 sem_init(&task->work_done_sem, 0, 0);
 683                 task->curr_event = 0;
 684                 err = pthread_create(&task->thread, &attr, thread_func, parms);
 685                 BUG_ON(err);
 686         }
 687 }
 688
 689 static void wait_for_tasks(struct perf_sched *sched)
 690 {
 691         u64 cpu_usage_0, cpu_usage_1;
 692         struct task_desc *task;
 693         unsigned long i, ret;
 694
 695         sched->start_time = get_nsecs();
 696         sched->cpu_usage = 0;
 697         pthread_mutex_unlock(&sched->work_done_wait_mutex);
 698
 699         for (i = 0; i < sched->nr_tasks; i++) {
 700                 task = sched->tasks[i];
 701                 ret = sem_wait(&task->ready_for_work);
 702                 BUG_ON(ret);
 703                 sem_init(&task->ready_for_work, 0, 0);
 704         }
 705         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
 706         BUG_ON(ret);
 707
 708         cpu_usage_0 = get_cpu_usage_nsec_parent();
 709
 710         pthread_mutex_unlock(&sched->start_work_mutex);
 711
 712         for (i = 0; i < sched->nr_tasks; i++) {
 713                 task = sched->tasks[i];
 714                 ret = sem_wait(&task->work_done_sem);
 715                 BUG_ON(ret);
 716                 sem_init(&task->work_done_sem, 0, 0);
 717                 sched->cpu_usage += task->cpu_usage;
 718                 task->cpu_usage = 0;
 719         }
 720
 721         cpu_usage_1 = get_cpu_usage_nsec_parent();
 722         if (!sched->runavg_cpu_usage)
 723                 sched->runavg_cpu_usage = sched->cpu_usage;
 724         sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
 725
 726         sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
 727         if (!sched->runavg_parent_cpu_usage)
 728                 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
 729         sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
 730                                          sched->parent_cpu_usage)/sched->replay_repeat;
 731
 732         ret = pthread_mutex_lock(&sched->start_work_mutex);
 733         BUG_ON(ret);
 734
 735         for (i = 0; i < sched->nr_tasks; i++) {
 736                 task = sched->tasks[i];
 737                 sem_init(&task->sleep_sem, 0, 0);
 738                 task->curr_event = 0;
 739         }
 740 }
 741
 742 static void run_one_test(struct perf_sched *sched)
 743 {
 744         u64 T0, T1, delta, avg_delta, fluct;
 745
 746         T0 = get_nsecs();
 747         wait_for_tasks(sched);
 748         T1 = get_nsecs();
 749
 750         delta = T1 - T0;
 751         sched->sum_runtime += delta;
 752         sched->nr_runs++;
 753
 754         avg_delta = sched->sum_runtime / sched->nr_runs;
 755         if (delta < avg_delta)
 756                 fluct = avg_delta - delta;
 757         else
 758                 fluct = delta - avg_delta;
 759         sched->sum_fluct += fluct;
 760         if (!sched->run_avg)
 761                 sched->run_avg = delta;
 762         sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
 763
 764         printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
 765
 766         printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
 767
 768         printf("cpu: %0.2f / %0.2f",
 769                 (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
 770
 771 #if 0
 772         /*
 773          * rusage statistics done by the parent, these are less
 774          * accurate than the sched->sum_exec_runtime based statistics:
 775          */
 776         printf(" [%0.2f / %0.2f]",
 777                 (double)sched->parent_cpu_usage / NSEC_PER_MSEC,
 778                 (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
 779 #endif
 780
 781         printf("\n");
 782
 783         if (sched->nr_sleep_corrections)
 784                 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
 785         sched->nr_sleep_corrections = 0;
 786 }
 787
 788 static void test_calibrations(struct perf_sched *sched)
 789 {
 790         u64 T0, T1;
 791
 792         T0 = get_nsecs();
 793         burn_nsecs(sched, NSEC_PER_MSEC);
 794         T1 = get_nsecs();
 795
 796         printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
 797
 798         T0 = get_nsecs();
 799         sleep_nsecs(NSEC_PER_MSEC);
 800         T1 = get_nsecs();
 801
 802         printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
 803 }
 804
 805 static int
 806 replay_wakeup_event(struct perf_sched *sched,
 807                     struct evsel *evsel, struct perf_sample *sample,
 808                     struct machine *machine __maybe_unused)
 809 {
 810         const char *comm = perf_evsel__strval(evsel, sample, "comm");
 811         const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
 812         struct task_desc *waker, *wakee;
 813
 814         if (verbose > 0) {
 815                 printf("sched_wakeup event %p\n", evsel);
 816
 817                 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
 818         }
 819
 820         waker = register_pid(sched, sample->tid, "<unknown>");
 821         wakee = register_pid(sched, pid, comm);
 822
 823         add_sched_event_wakeup(sched, waker, sample->time, wakee);
 824         return 0;
 825 }
 826
 827 static int replay_switch_event(struct perf_sched *sched,
 828                                struct evsel *evsel,
 829                                struct perf_sample *sample,
 830                                struct machine *machine __maybe_unused)
 831 {
 832         const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
 833                    *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
 834         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
 835                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
 836         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
 837         struct task_desc *prev, __maybe_unused *next;
 838         u64 timestamp0, timestamp = sample->time;
 839         int cpu = sample->cpu;
 840         s64 delta;
 841
 842         if (verbose > 0)
 843                 printf("sched_switch event %p\n", evsel);
 844
 845         if (cpu >= MAX_CPUS || cpu < 0)
 846                 return 0;
 847
 848         timestamp0 = sched->cpu_last_switched[cpu];
 849         if (timestamp0)
 850                 delta = timestamp - timestamp0;
 851         else
 852                 delta = 0;
 853
 854         if (delta < 0) {
 855                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 856                 return -1;
 857         }
 858
 859         pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
 860                  prev_comm, prev_pid, next_comm, next_pid, delta);
 861
 862         prev = register_pid(sched, prev_pid, prev_comm);
 863         next = register_pid(sched, next_pid, next_comm);
 864
 865         sched->cpu_last_switched[cpu] = timestamp;
 866
 867         add_sched_event_run(sched, prev, timestamp, delta);
 868         add_sched_event_sleep(sched, prev, timestamp, prev_state);
 869
 870         return 0;
 871 }
 872
 873 static int replay_fork_event(struct perf_sched *sched,
 874                              union perf_event *event,
 875                              struct machine *machine)
 876 {
 877         struct thread *child, *parent;
 878
 879         child = machine__findnew_thread(machine, event->fork.pid,
 880                                         event->fork.tid);
 881         parent = machine__findnew_thread(machine, event->fork.ppid,
 882                                          event->fork.ptid);
 883
 884         if (child == NULL || parent == NULL) {
 885                 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
 886                                  child, parent);
 887                 goto out_put;
 888         }
 889
 890         if (verbose > 0) {
 891                 printf("fork event\n");
 892                 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
 893                 printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
 894         }
 895
 896         register_pid(sched, parent->tid, thread__comm_str(parent));
 897         register_pid(sched, child->tid, thread__comm_str(child));
 898 out_put:
 899         thread__put(child);
 900         thread__put(parent);
 901         return 0;
 902 }
 903
/*
 * One sort key usable for ordering per-thread latency data.
 */
struct sort_dimension {
        /* Key name; sort_dimension__add() matches it against the requested token. */
        const char              *name;
        /* Comparison callback; thread_lat_cmp() calls it on two work_atoms. */
        sort_fn_t               cmp;
        /* Linkage into the list of active sort keys. */
        struct list_head        list;
};
 909
 910 /*
 911  * handle runtime stats saved per thread
 912  */
 913 static struct thread_runtime *thread__init_runtime(struct thread *thread)
 914 {
 915         struct thread_runtime *r;
 916
 917         r = zalloc(sizeof(struct thread_runtime));
 918         if (!r)
 919                 return NULL;
 920
 921         init_stats(&r->run_stats);
 922         thread__set_priv(thread, r);
 923
 924         return r;
 925 }
 926
 927 static struct thread_runtime *thread__get_runtime(struct thread *thread)
 928 {
 929         struct thread_runtime *tr;
 930
 931         tr = thread__priv(thread);
 932         if (tr == NULL) {
 933                 tr = thread__init_runtime(thread);
 934                 if (tr == NULL)
 935                         pr_debug("Failed to malloc memory for runtime data.\n");
 936         }
 937
 938         return tr;
 939 }
 940
 941 static int
 942 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
 943 {
 944         struct sort_dimension *sort;
 945         int ret = 0;
 946
 947         BUG_ON(list_empty(list));
 948
 949         list_for_each_entry(sort, list, list) {
 950                 ret = sort->cmp(l, r);
 951                 if (ret)
 952                         return ret;
 953         }
 954
 955         return ret;
 956 }
 957
 958 static struct work_atoms *
 959 thread_atoms_search(struct rb_root_cached *root, struct thread *thread,
 960                          struct list_head *sort_list)
 961 {
 962         struct rb_node *node = root->rb_root.rb_node;
 963         struct work_atoms key = { .thread = thread };
 964
 965         while (node) {
 966                 struct work_atoms *atoms;
 967                 int cmp;
 968
 969                 atoms = container_of(node, struct work_atoms, node);
 970
 971                 cmp = thread_lat_cmp(sort_list, &key, atoms);
 972                 if (cmp > 0)
 973                         node = node->rb_left;
 974                 else if (cmp < 0)
 975                         node = node->rb_right;
 976                 else {
 977                         BUG_ON(thread != atoms->thread);
 978                         return atoms;
 979                 }
 980         }
 981         return NULL;
 982 }
 983
 984 static void
 985 __thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data,
 986                          struct list_head *sort_list)
 987 {
 988         struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
 989         bool leftmost = true;
 990
 991         while (*new) {
 992                 struct work_atoms *this;
 993                 int cmp;
 994
 995                 this = container_of(*new, struct work_atoms, node);
 996                 parent = *new;
 997
 998                 cmp = thread_lat_cmp(sort_list, data, this);
 999
1000                 if (cmp > 0)
1001                         new = &((*new)->rb_left);
1002                 else {
1003                         new = &((*new)->rb_right);
1004                         leftmost = false;
1005                 }
1006         }
1007
1008         rb_link_node(&data->node, parent, new);
1009         rb_insert_color_cached(&data->node, root, leftmost);
1010 }
1011
1012 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
1013 {
1014         struct work_atoms *atoms = zalloc(sizeof(*atoms));
1015         if (!atoms) {
1016                 pr_err("No memory at %s\n", __func__);
1017                 return -1;
1018         }
1019
1020         atoms->thread = thread__get(thread);
1021         INIT_LIST_HEAD(&atoms->work_list);
1022         __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
1023         return 0;
1024 }
1025
1026 static char sched_out_state(u64 prev_state)
1027 {
1028         const char *str = TASK_STATE_TO_CHAR_STR;
1029
1030         return str[prev_state];
1031 }
1032
1033 static int
1034 add_sched_out_event(struct work_atoms *atoms,
1035                     char run_state,
1036                     u64 timestamp)
1037 {
1038         struct work_atom *atom = zalloc(sizeof(*atom));
1039         if (!atom) {
1040                 pr_err("Non memory at %s", __func__);
1041                 return -1;
1042         }
1043
1044         atom->sched_out_time = timestamp;
1045
1046         if (run_state == 'R') {
1047                 atom->state = THREAD_WAIT_CPU;
1048                 atom->wake_up_time = atom->sched_out_time;
1049         }
1050
1051         list_add_tail(&atom->list, &atoms->work_list);
1052         return 0;
1053 }
1054
1055 static void
1056 add_runtime_event(struct work_atoms *atoms, u64 delta,
1057                   u64 timestamp __maybe_unused)
1058 {
1059         struct work_atom *atom;
1060
1061         BUG_ON(list_empty(&atoms->work_list));
1062
1063         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1064
1065         atom->runtime += delta;
1066         atoms->total_runtime += delta;
1067 }
1068
1069 static void
1070 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1071 {
1072         struct work_atom *atom;
1073         u64 delta;
1074
1075         if (list_empty(&atoms->work_list))
1076                 return;
1077
1078         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1079
1080         if (atom->state != THREAD_WAIT_CPU)
1081                 return;
1082
1083         if (timestamp < atom->wake_up_time) {
1084                 atom->state = THREAD_IGNORE;
1085                 return;
1086         }
1087
1088         atom->state = THREAD_SCHED_IN;
1089         atom->sched_in_time = timestamp;
1090
1091         delta = atom->sched_in_time - atom->wake_up_time;
1092         atoms->total_lat += delta;
1093         if (delta > atoms->max_lat) {
1094                 atoms->max_lat = delta;
1095                 atoms->max_lat_at = timestamp;
1096         }
1097         atoms->nb_atoms++;
1098 }
1099
1100 static int latency_switch_event(struct perf_sched *sched,
1101                                 struct evsel *evsel,
1102                                 struct perf_sample *sample,
1103                                 struct machine *machine)
1104 {
1105         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1106                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1107         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
1108         struct work_atoms *out_events, *in_events;
1109         struct thread *sched_out, *sched_in;
1110         u64 timestamp0, timestamp = sample->time;
1111         int cpu = sample->cpu, err = -1;
1112         s64 delta;
1113
1114         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1115
1116         timestamp0 = sched->cpu_last_switched[cpu];
1117         sched->cpu_last_switched[cpu] = timestamp;
1118         if (timestamp0)
1119                 delta = timestamp - timestamp0;
1120         else
1121                 delta = 0;
1122
1123         if (delta < 0) {
1124                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1125                 return -1;
1126         }
1127
1128         sched_out = machine__findnew_thread(machine, -1, prev_pid);
1129         sched_in = machine__findnew_thread(machine, -1, next_pid);
1130         if (sched_out == NULL || sched_in == NULL)
1131                 goto out_put;
1132
1133         out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1134         if (!out_events) {
1135                 if (thread_atoms_insert(sched, sched_out))
1136                         goto out_put;
1137                 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1138                 if (!out_events) {
1139                         pr_err("out-event: Internal tree error");
1140                         goto out_put;
1141                 }
1142         }
1143         if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
1144                 return -1;
1145
1146         in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1147         if (!in_events) {
1148                 if (thread_atoms_insert(sched, sched_in))
1149                         goto out_put;
1150                 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1151                 if (!in_events) {
1152                         pr_err("in-event: Internal tree error");
1153                         goto out_put;
1154                 }
1155                 /*
1156                  * Take came in we have not heard about yet,
1157                  * add in an initial atom in runnable state:
1158                  */
1159                 if (add_sched_out_event(in_events, 'R', timestamp))
1160                         goto out_put;
1161         }
1162         add_sched_in_event(in_events, timestamp);
1163         err = 0;
1164 out_put:
1165         thread__put(sched_out);
1166         thread__put(sched_in);
1167         return err;
1168 }
1169
1170 static int latency_runtime_event(struct perf_sched *sched,
1171                                  struct evsel *evsel,
1172                                  struct perf_sample *sample,
1173                                  struct machine *machine)
1174 {
1175         const u32 pid      = perf_evsel__intval(evsel, sample, "pid");
1176         const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1177         struct thread *thread = machine__findnew_thread(machine, -1, pid);
1178         struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1179         u64 timestamp = sample->time;
1180         int cpu = sample->cpu, err = -1;
1181
1182         if (thread == NULL)
1183                 return -1;
1184
1185         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1186         if (!atoms) {
1187                 if (thread_atoms_insert(sched, thread))
1188                         goto out_put;
1189                 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1190                 if (!atoms) {
1191                         pr_err("in-event: Internal tree error");
1192                         goto out_put;
1193                 }
1194                 if (add_sched_out_event(atoms, 'R', timestamp))
1195                         goto out_put;
1196         }
1197
1198         add_runtime_event(atoms, runtime, timestamp);
1199         err = 0;
1200 out_put:
1201         thread__put(thread);
1202         return err;
1203 }
1204
1205 static int latency_wakeup_event(struct perf_sched *sched,
1206                                 struct evsel *evsel,
1207                                 struct perf_sample *sample,
1208                                 struct machine *machine)
1209 {
1210         const u32 pid     = perf_evsel__intval(evsel, sample, "pid");
1211         struct work_atoms *atoms;
1212         struct work_atom *atom;
1213         struct thread *wakee;
1214         u64 timestamp = sample->time;
1215         int err = -1;
1216
1217         wakee = machine__findnew_thread(machine, -1, pid);
1218         if (wakee == NULL)
1219                 return -1;
1220         atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1221         if (!atoms) {
1222                 if (thread_atoms_insert(sched, wakee))
1223                         goto out_put;
1224                 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1225                 if (!atoms) {
1226                         pr_err("wakeup-event: Internal tree error");
1227                         goto out_put;
1228                 }
1229                 if (add_sched_out_event(atoms, 'S', timestamp))
1230                         goto out_put;
1231         }
1232
1233         BUG_ON(list_empty(&atoms->work_list));
1234
1235         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1236
1237         /*
1238          * As we do not guarantee the wakeup event happens when
1239          * task is out of run queue, also may happen when task is
1240          * on run queue and wakeup only change ->state to TASK_RUNNING,
1241          * then we should not set the ->wake_up_time when wake up a
1242          * task which is on run queue.
1243          *
1244          * You WILL be missing events if you've recorded only
1245          * one CPU, or are only looking at only one, so don't
1246          * skip in this case.
1247          */
1248         if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1249                 goto out_ok;
1250
1251         sched->nr_timestamps++;
1252         if (atom->sched_out_time > timestamp) {
1253                 sched->nr_unordered_timestamps++;
1254                 goto out_ok;
1255         }
1256
1257         atom->state = THREAD_WAIT_CPU;
1258         atom->wake_up_time = timestamp;
1259 out_ok:
1260         err = 0;
1261 out_put:
1262         thread__put(wakee);
1263         return err;
1264 }
1265
1266 static int latency_migrate_task_event(struct perf_sched *sched,
1267                                       struct evsel *evsel,
1268                                       struct perf_sample *sample,
1269                                       struct machine *machine)
1270 {
1271         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1272         u64 timestamp = sample->time;
1273         struct work_atoms *atoms;
1274         struct work_atom *atom;
1275         struct thread *migrant;
1276         int err = -1;
1277
1278         /*
1279          * Only need to worry about migration when profiling one CPU.
1280          */
1281         if (sched->profile_cpu == -1)
1282                 return 0;
1283
1284         migrant = machine__findnew_thread(machine, -1, pid);
1285         if (migrant == NULL)
1286                 return -1;
1287         atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1288         if (!atoms) {
1289                 if (thread_atoms_insert(sched, migrant))
1290                         goto out_put;
1291                 register_pid(sched, migrant->tid, thread__comm_str(migrant));
1292                 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1293                 if (!atoms) {
1294                         pr_err("migration-event: Internal tree error");
1295                         goto out_put;
1296                 }
1297                 if (add_sched_out_event(atoms, 'R', timestamp))
1298                         goto out_put;
1299         }
1300
1301         BUG_ON(list_empty(&atoms->work_list));
1302
1303         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1304         atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1305
1306         sched->nr_timestamps++;
1307
1308         if (atom->sched_out_time > timestamp)
1309                 sched->nr_unordered_timestamps++;
1310         err = 0;
1311 out_put:
1312         thread__put(migrant);
1313         return err;
1314 }
1315
1316 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1317 {
1318         int i;
1319         int ret;
1320         u64 avg;
1321         char max_lat_at[32];
1322
1323         if (!work_list->nb_atoms)
1324                 return;
1325         /*
1326          * Ignore idle threads:
1327          */
1328         if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1329                 return;
1330
1331         sched->all_runtime += work_list->total_runtime;
1332         sched->all_count   += work_list->nb_atoms;
1333
1334         if (work_list->num_merged > 1)
1335                 ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
1336         else
1337                 ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1338
1339         for (i = 0; i < 24 - ret; i++)
1340                 printf(" ");
1341
1342         avg = work_list->total_lat / work_list->nb_atoms;
1343         timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at));
1344
1345         printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n",
1346               (double)work_list->total_runtime / NSEC_PER_MSEC,
1347                  work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
1348                  (double)work_list->max_lat / NSEC_PER_MSEC,
1349                  max_lat_at);
1350 }
1351
1352 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1353 {
1354         if (l->thread == r->thread)
1355                 return 0;
1356         if (l->thread->tid < r->thread->tid)
1357                 return -1;
1358         if (l->thread->tid > r->thread->tid)
1359                 return 1;
1360         return (int)(l->thread - r->thread);
1361 }
1362
1363 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1364 {
1365         u64 avgl, avgr;
1366
1367         if (!l->nb_atoms)
1368                 return -1;
1369
1370         if (!r->nb_atoms)
1371                 return 1;
1372
1373         avgl = l->total_lat / l->nb_atoms;
1374         avgr = r->total_lat / r->nb_atoms;
1375
1376         if (avgl < avgr)
1377                 return -1;
1378         if (avgl > avgr)
1379                 return 1;
1380
1381         return 0;
1382 }
1383
1384 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1385 {
1386         if (l->max_lat < r->max_lat)
1387                 return -1;
1388         if (l->max_lat > r->max_lat)
1389                 return 1;
1390
1391         return 0;
1392 }
1393
1394 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1395 {
1396         if (l->nb_atoms < r->nb_atoms)
1397                 return -1;
1398         if (l->nb_atoms > r->nb_atoms)
1399                 return 1;
1400
1401         return 0;
1402 }
1403
1404 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1405 {
1406         if (l->total_runtime < r->total_runtime)
1407                 return -1;
1408         if (l->total_runtime > r->total_runtime)
1409                 return 1;
1410
1411         return 0;
1412 }
1413
1414 static int sort_dimension__add(const char *tok, struct list_head *list)
1415 {
1416         size_t i;
1417         static struct sort_dimension avg_sort_dimension = {
1418                 .name = "avg",
1419                 .cmp  = avg_cmp,
1420         };
1421         static struct sort_dimension max_sort_dimension = {
1422                 .name = "max",
1423                 .cmp  = max_cmp,
1424         };
1425         static struct sort_dimension pid_sort_dimension = {
1426                 .name = "pid",
1427                 .cmp  = pid_cmp,
1428         };
1429         static struct sort_dimension runtime_sort_dimension = {
1430                 .name = "runtime",
1431                 .cmp  = runtime_cmp,
1432         };
1433         static struct sort_dimension switch_sort_dimension = {
1434                 .name = "switch",
1435                 .cmp  = switch_cmp,
1436         };
1437         struct sort_dimension *available_sorts[] = {
1438                 &pid_sort_dimension,
1439                 &avg_sort_dimension,
1440                 &max_sort_dimension,
1441                 &switch_sort_dimension,
1442                 &runtime_sort_dimension,
1443         };
1444
1445         for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1446                 if (!strcmp(available_sorts[i]->name, tok)) {
1447                         list_add_tail(&available_sorts[i]->list, list);
1448
1449                         return 0;
1450                 }
1451         }
1452
1453         return -1;
1454 }
1455
1456 static void perf_sched__sort_lat(struct perf_sched *sched)
1457 {
1458         struct rb_node *node;
1459         struct rb_root_cached *root = &sched->atom_root;
1460 again:
1461         for (;;) {
1462                 struct work_atoms *data;
1463                 node = rb_first_cached(root);
1464                 if (!node)
1465                         break;
1466
1467                 rb_erase_cached(node, root);
1468                 data = rb_entry(node, struct work_atoms, node);
1469                 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1470         }
1471         if (root == &sched->atom_root) {
1472                 root = &sched->merged_atom_root;
1473                 goto again;
1474         }
1475 }
1476
1477 static int process_sched_wakeup_event(struct perf_tool *tool,
1478                                       struct evsel *evsel,
1479                                       struct perf_sample *sample,
1480                                       struct machine *machine)
1481 {
1482         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1483
1484         if (sched->tp_handler->wakeup_event)
1485                 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1486
1487         return 0;
1488 }
1489
/*
 * Lets a bool be stored in, and read back from, a thread's pointer-sized
 * private-data slot; used by map__findnew_thread() and
 * thread__has_color().
 */
union map_priv {
        /* Raw value as passed to thread__set_priv() / returned by thread__priv(). */
        void    *ptr;
        /* Colour flag carried in that value. */
        bool     color;
};
1494
1495 static bool thread__has_color(struct thread *thread)
1496 {
1497         union map_priv priv = {
1498                 .ptr = thread__priv(thread),
1499         };
1500
1501         return priv.color;
1502 }
1503
1504 static struct thread*
1505 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1506 {
1507         struct thread *thread = machine__findnew_thread(machine, pid, tid);
1508         union map_priv priv = {
1509                 .color = false,
1510         };
1511
1512         if (!sched->map.color_pids || !thread || thread__priv(thread))
1513                 return thread;
1514
1515         if (thread_map__has(sched->map.color_pids, tid))
1516                 priv.color = true;
1517
1518         thread__set_priv(thread, priv.ptr);
1519         return thread;
1520 }
1521
1522 static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
1523                             struct perf_sample *sample, struct machine *machine)
1524 {
1525         const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1526         struct thread *sched_in;
1527         struct thread_runtime *tr;
1528         int new_shortname;
1529         u64 timestamp0, timestamp = sample->time;
1530         s64 delta;
1531         int i, this_cpu = sample->cpu;
1532         int cpus_nr;
1533         bool new_cpu = false;
1534         const char *color = PERF_COLOR_NORMAL;
1535         char stimestamp[32];
1536
1537         BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1538
1539         if (this_cpu > sched->max_cpu)
1540                 sched->max_cpu = this_cpu;
1541
1542         if (sched->map.comp) {
1543                 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1544                 if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
1545                         sched->map.comp_cpus[cpus_nr++] = this_cpu;
1546                         new_cpu = true;
1547                 }
1548         } else
1549                 cpus_nr = sched->max_cpu;
1550
1551         timestamp0 = sched->cpu_last_switched[this_cpu];
1552         sched->cpu_last_switched[this_cpu] = timestamp;
1553         if (timestamp0)
1554                 delta = timestamp - timestamp0;
1555         else
1556                 delta = 0;
1557
1558         if (delta < 0) {
1559                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1560                 return -1;
1561         }
1562
1563         sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1564         if (sched_in == NULL)
1565                 return -1;
1566
1567         tr = thread__get_runtime(sched_in);
1568         if (tr == NULL) {
1569                 thread__put(sched_in);
1570                 return -1;
1571         }
1572
1573         sched->curr_thread[this_cpu] = thread__get(sched_in);
1574
1575         printf("  ");
1576
1577         new_shortname = 0;
1578         if (!tr->shortname[0]) {
1579                 if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1580                         /*
1581                          * Don't allocate a letter-number for swapper:0
1582                          * as a shortname. Instead, we use '.' for it.
1583                          */
1584                         tr->shortname[0] = '.';
1585                         tr->shortname[1] = ' ';
1586                 } else {
1587                         tr->shortname[0] = sched->next_shortname1;
1588                         tr->shortname[1] = sched->next_shortname2;
1589
1590                         if (sched->next_shortname1 < 'Z') {
1591                                 sched->next_shortname1++;
1592                         } else {
1593                                 sched->next_shortname1 = 'A';
1594                                 if (sched->next_shortname2 < '9')
1595                                         sched->next_shortname2++;
1596                                 else
1597                                         sched->next_shortname2 = '0';
1598                         }
1599                 }
1600                 new_shortname = 1;
1601         }
1602
1603         for (i = 0; i < cpus_nr; i++) {
1604                 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
1605                 struct thread *curr_thread = sched->curr_thread[cpu];
1606                 struct thread_runtime *curr_tr;
1607                 const char *pid_color = color;
1608                 const char *cpu_color = color;
1609
1610                 if (curr_thread && thread__has_color(curr_thread))
1611                         pid_color = COLOR_PIDS;
1612
1613                 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
1614                         continue;
1615
1616                 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
1617                         cpu_color = COLOR_CPUS;
1618
1619                 if (cpu != this_cpu)
1620                         color_fprintf(stdout, color, " ");
1621                 else
1622                         color_fprintf(stdout, cpu_color, "*");
1623
1624                 if (sched->curr_thread[cpu]) {
1625                         curr_tr = thread__get_runtime(sched->curr_thread[cpu]);
1626                         if (curr_tr == NULL) {
1627                                 thread__put(sched_in);
1628                                 return -1;
1629                         }
1630                         color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
1631                 } else
1632                         color_fprintf(stdout, color, "   ");
1633         }
1634
1635         if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
1636                 goto out;
1637
1638         timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1639         color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1640         if (new_shortname || tr->comm_changed || (verbose > 0 && sched_in->tid)) {
1641                 const char *pid_color = color;
1642
1643                 if (thread__has_color(sched_in))
1644                         pid_color = COLOR_PIDS;
1645
1646                 color_fprintf(stdout, pid_color, "%s => %s:%d",
1647                        tr->shortname, thread__comm_str(sched_in), sched_in->tid);
1648                 tr->comm_changed = false;
1649         }
1650
1651         if (sched->map.comp && new_cpu)
1652                 color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1653
1654 out:
1655         color_fprintf(stdout, color, "\n");
1656
1657         thread__put(sched_in);
1658
1659         return 0;
1660 }
1661
1662 static int process_sched_switch_event(struct perf_tool *tool,
1663                                       struct evsel *evsel,
1664                                       struct perf_sample *sample,
1665                                       struct machine *machine)
1666 {
1667         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1668         int this_cpu = sample->cpu, err = 0;
1669         u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1670             next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1671
1672         if (sched->curr_pid[this_cpu] != (u32)-1) {
1673                 /*
1674                  * Are we trying to switch away a PID that is
1675                  * not current?
1676                  */
1677                 if (sched->curr_pid[this_cpu] != prev_pid)
1678                         sched->nr_context_switch_bugs++;
1679         }
1680
1681         if (sched->tp_handler->switch_event)
1682                 err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1683
1684         sched->curr_pid[this_cpu] = next_pid;
1685         return err;
1686 }
1687
1688 static int process_sched_runtime_event(struct perf_tool *tool,
1689                                        struct evsel *evsel,
1690                                        struct perf_sample *sample,
1691                                        struct machine *machine)
1692 {
1693         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1694
1695         if (sched->tp_handler->runtime_event)
1696                 return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1697
1698         return 0;
1699 }
1700
1701 static int perf_sched__process_fork_event(struct perf_tool *tool,
1702                                           union perf_event *event,
1703                                           struct perf_sample *sample,
1704                                           struct machine *machine)
1705 {
1706         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1707
1708         /* run the fork event through the perf machineruy */
1709         perf_event__process_fork(tool, event, sample, machine);
1710
1711         /* and then run additional processing needed for this command */
1712         if (sched->tp_handler->fork_event)
1713                 return sched->tp_handler->fork_event(sched, event, machine);
1714
1715         return 0;
1716 }
1717
1718 static int process_sched_migrate_task_event(struct perf_tool *tool,
1719                                             struct evsel *evsel,
1720                                             struct perf_sample *sample,
1721                                             struct machine *machine)
1722 {
1723         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1724
1725         if (sched->tp_handler->migrate_task_event)
1726                 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1727
1728         return 0;
1729 }
1730
/*
 * Signature of the per-tracepoint callbacks that are stored in
 * evsel->handler and invoked by perf_sched__process_tracepoint_sample().
 */
typedef int (*tracepoint_handler)(struct perf_tool *tool,
                                  struct evsel *evsel,
                                  struct perf_sample *sample,
                                  struct machine *machine);
1735
1736 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1737                                                  union perf_event *event __maybe_unused,
1738                                                  struct perf_sample *sample,
1739                                                  struct evsel *evsel,
1740                                                  struct machine *machine)
1741 {
1742         int err = 0;
1743
1744         if (evsel->handler != NULL) {
1745                 tracepoint_handler f = evsel->handler;
1746                 err = f(tool, evsel, sample, machine);
1747         }
1748
1749         return err;
1750 }
1751
1752 static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
1753                                     union perf_event *event,
1754                                     struct perf_sample *sample,
1755                                     struct machine *machine)
1756 {
1757         struct thread *thread;
1758         struct thread_runtime *tr;
1759         int err;
1760
1761         err = perf_event__process_comm(tool, event, sample, machine);
1762         if (err)
1763                 return err;
1764
1765         thread = machine__find_thread(machine, sample->pid, sample->tid);
1766         if (!thread) {
1767                 pr_err("Internal error: can't find thread\n");
1768                 return -1;
1769         }
1770
1771         tr = thread__get_runtime(thread);
1772         if (tr == NULL) {
1773                 thread__put(thread);
1774                 return -1;
1775         }
1776
1777         tr->comm_changed = true;
1778         thread__put(thread);
1779
1780         return 0;
1781 }
1782
1783 static int perf_sched__read_events(struct perf_sched *sched)
1784 {
1785         const struct evsel_str_handler handlers[] = {
1786                 { "sched:sched_switch",       process_sched_switch_event, },
1787                 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1788                 { "sched:sched_wakeup",       process_sched_wakeup_event, },
1789                 { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1790                 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1791         };
1792         struct perf_session *session;
1793         struct perf_data data = {
1794                 .path  = input_name,
1795                 .mode  = PERF_DATA_MODE_READ,
1796                 .force = sched->force,
1797         };
1798         int rc = -1;
1799
1800         session = perf_session__new(&data, false, &sched->tool);
1801         if (IS_ERR(session)) {
1802                 pr_debug("Error creating perf session");
1803                 return PTR_ERR(session);
1804         }
1805
1806         symbol__init(&session->header.env);
1807
1808         if (perf_session__set_tracepoints_handlers(session, handlers))
1809                 goto out_delete;
1810
1811         if (perf_session__has_traces(session, "record -R")) {
1812                 int err = perf_session__process_events(session);
1813                 if (err) {
1814                         pr_err("Failed to process events, error %d", err);
1815                         goto out_delete;
1816                 }
1817
1818                 sched->nr_events      = session->evlist->stats.nr_events[0];
1819                 sched->nr_lost_events = session->evlist->stats.total_lost;
1820                 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1821         }
1822
1823         rc = 0;
1824 out_delete:
1825         perf_session__delete(session);
1826         return rc;
1827 }
1828
1829 /*
1830  * scheduling times are printed as msec.usec
1831  */
1832 static inline void print_sched_time(unsigned long long nsecs, int width)
1833 {
1834         unsigned long msecs;
1835         unsigned long usecs;
1836
1837         msecs  = nsecs / NSEC_PER_MSEC;
1838         nsecs -= msecs * NSEC_PER_MSEC;
1839         usecs  = nsecs / NSEC_PER_USEC;
1840         printf("%*lu.%03lu ", width, msecs, usecs);
1841 }
1842
1843 /*
1844  * returns runtime data for event, allocating memory for it the
1845  * first time it is used.
1846  */
1847 static struct evsel_runtime *perf_evsel__get_runtime(struct evsel *evsel)
1848 {
1849         struct evsel_runtime *r = evsel->priv;
1850
1851         if (r == NULL) {
1852                 r = zalloc(sizeof(struct evsel_runtime));
1853                 evsel->priv = r;
1854         }
1855
1856         return r;
1857 }
1858
1859 /*
1860  * save last time event was seen per cpu
1861  */
1862 static void perf_evsel__save_time(struct evsel *evsel,
1863                                   u64 timestamp, u32 cpu)
1864 {
1865         struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1866
1867         if (r == NULL)
1868                 return;
1869
1870         if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
1871                 int i, n = __roundup_pow_of_two(cpu+1);
1872                 void *p = r->last_time;
1873
1874                 p = realloc(r->last_time, n * sizeof(u64));
1875                 if (!p)
1876                         return;
1877
1878                 r->last_time = p;
1879                 for (i = r->ncpu; i < n; ++i)
1880                         r->last_time[i] = (u64) 0;
1881
1882                 r->ncpu = n;
1883         }
1884
1885         r->last_time[cpu] = timestamp;
1886 }
1887
1888 /* returns last time this event was seen on the given cpu */
1889 static u64 perf_evsel__get_time(struct evsel *evsel, u32 cpu)
1890 {
1891         struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1892
1893         if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
1894                 return 0;
1895
1896         return r->last_time[cpu];
1897 }
1898
1899 static int comm_width = 30;
1900
1901 static char *timehist_get_commstr(struct thread *thread)
1902 {
1903         static char str[32];
1904         const char *comm = thread__comm_str(thread);
1905         pid_t tid = thread->tid;
1906         pid_t pid = thread->pid_;
1907         int n;
1908
1909         if (pid == 0)
1910                 n = scnprintf(str, sizeof(str), "%s", comm);
1911
1912         else if (tid != pid)
1913                 n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
1914
1915         else
1916                 n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
1917
1918         if (n > comm_width)
1919                 comm_width = n;
1920
1921         return str;
1922 }
1923
1924 static void timehist_header(struct perf_sched *sched)
1925 {
1926         u32 ncpus = sched->max_cpu + 1;
1927         u32 i, j;
1928
1929         printf("%15s %6s ", "time", "cpu");
1930
1931         if (sched->show_cpu_visual) {
1932                 printf(" ");
1933                 for (i = 0, j = 0; i < ncpus; ++i) {
1934                         printf("%x", j++);
1935                         if (j > 15)
1936                                 j = 0;
1937                 }
1938                 printf(" ");
1939         }
1940
1941         printf(" %-*s  %9s  %9s  %9s", comm_width,
1942                 "task name", "wait time", "sch delay", "run time");
1943
1944         if (sched->show_state)
1945                 printf("  %s", "state");
1946
1947         printf("\n");
1948
1949         /*
1950          * units row
1951          */
1952         printf("%15s %-6s ", "", "");
1953
1954         if (sched->show_cpu_visual)
1955                 printf(" %*s ", ncpus, "");
1956
1957         printf(" %-*s  %9s  %9s  %9s", comm_width,
1958                "[tid/pid]", "(msec)", "(msec)", "(msec)");
1959
1960         if (sched->show_state)
1961                 printf("  %5s", "");
1962
1963         printf("\n");
1964
1965         /*
1966          * separator
1967          */
1968         printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
1969
1970         if (sched->show_cpu_visual)
1971                 printf(" %.*s ", ncpus, graph_dotted_line);
1972
1973         printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
1974                 graph_dotted_line, graph_dotted_line, graph_dotted_line,
1975                 graph_dotted_line);
1976
1977         if (sched->show_state)
1978                 printf("  %.5s", graph_dotted_line);
1979
1980         printf("\n");
1981 }
1982
1983 static char task_state_char(struct thread *thread, int state)
1984 {
1985         static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1986         unsigned bit = state ? ffs(state) : 0;
1987
1988         /* 'I' for idle */
1989         if (thread->tid == 0)
1990                 return 'I';
1991
1992         return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1993 }
1994
1995 static void timehist_print_sample(struct perf_sched *sched,
1996                                   struct evsel *evsel,
1997                                   struct perf_sample *sample,
1998                                   struct addr_location *al,
1999                                   struct thread *thread,
2000                                   u64 t, int state)
2001 {
2002         struct thread_runtime *tr = thread__priv(thread);
2003         const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
2004         const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
2005         u32 max_cpus = sched->max_cpu + 1;
2006         char tstr[64];
2007         char nstr[30];
2008         u64 wait_time;
2009
2010         timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
2011         printf("%15s [%04d] ", tstr, sample->cpu);
2012
2013         if (sched->show_cpu_visual) {
2014                 u32 i;
2015                 char c;
2016
2017                 printf(" ");
2018                 for (i = 0; i < max_cpus; ++i) {
2019                         /* flag idle times with 'i'; others are sched events */
2020                         if (i == sample->cpu)
2021                                 c = (thread->tid == 0) ? 'i' : 's';
2022                         else
2023                                 c = ' ';
2024                         printf("%c", c);
2025                 }
2026                 printf(" ");
2027         }
2028
2029         printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2030
2031         wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
2032         print_sched_time(wait_time, 6);
2033
2034         print_sched_time(tr->dt_delay, 6);
2035         print_sched_time(tr->dt_run, 6);
2036
2037         if (sched->show_state)
2038                 printf(" %5c ", task_state_char(thread, state));
2039
2040         if (sched->show_next) {
2041                 snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
2042                 printf(" %-*s", comm_width, nstr);
2043         }
2044
2045         if (sched->show_wakeups && !sched->show_next)
2046                 printf("  %-*s", comm_width, "");
2047
2048         if (thread->tid == 0)
2049                 goto out;
2050
2051         if (sched->show_callchain)
2052                 printf("  ");
2053
2054         sample__fprintf_sym(sample, al, 0,
2055                             EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
2056                             EVSEL__PRINT_CALLCHAIN_ARROW |
2057                             EVSEL__PRINT_SKIP_IGNORED,
2058                             &callchain_cursor, stdout);
2059
2060 out:
2061         printf("\n");
2062 }
2063
2064 /*
2065  * Explanation of delta-time stats:
2066  *
2067  *            t = time of current schedule out event
2068  *        tprev = time of previous sched out event
2069  *                also time of schedule-in event for current task
2070  *    last_time = time of last sched change event for current task
2071  *                (i.e, time process was last scheduled out)
2072  * ready_to_run = time of wakeup for current task
2073  *
2074  * -----|------------|------------|------------|------
2075  *    last         ready        tprev          t
2076  *    time         to run
2077  *
2078  *      |-------- dt_wait --------|
2079  *                   |- dt_delay -|-- dt_run --|
2080  *
2081  *   dt_run = run time of current task
2082  *  dt_wait = time between last schedule out event for task and tprev
2083  *            represents time spent off the cpu
2084  * dt_delay = time between wakeup and schedule-in of task
2085  */
2086
2087 static void timehist_update_runtime_stats(struct thread_runtime *r,
2088                                          u64 t, u64 tprev)
2089 {
2090         r->dt_delay   = 0;
2091         r->dt_sleep   = 0;
2092         r->dt_iowait  = 0;
2093         r->dt_preempt = 0;
2094         r->dt_run     = 0;
2095
2096         if (tprev) {
2097                 r->dt_run = t - tprev;
2098                 if (r->ready_to_run) {
2099                         if (r->ready_to_run > tprev)
2100                                 pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
2101                         else
2102                                 r->dt_delay = tprev - r->ready_to_run;
2103                 }
2104
2105                 if (r->last_time > tprev)
2106                         pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
2107                 else if (r->last_time) {
2108                         u64 dt_wait = tprev - r->last_time;
2109
2110                         if (r->last_state == TASK_RUNNING)
2111                                 r->dt_preempt = dt_wait;
2112                         else if (r->last_state == TASK_UNINTERRUPTIBLE)
2113                                 r->dt_iowait = dt_wait;
2114                         else
2115                                 r->dt_sleep = dt_wait;
2116                 }
2117         }
2118
2119         update_stats(&r->run_stats, r->dt_run);
2120
2121         r->total_run_time     += r->dt_run;
2122         r->total_delay_time   += r->dt_delay;
2123         r->total_sleep_time   += r->dt_sleep;
2124         r->total_iowait_time  += r->dt_iowait;
2125         r->total_preempt_time += r->dt_preempt;
2126 }
2127
2128 static bool is_idle_sample(struct perf_sample *sample,
2129                            struct evsel *evsel)
2130 {
2131         /* pid 0 == swapper == idle task */
2132         if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0)
2133                 return perf_evsel__intval(evsel, sample, "prev_pid") == 0;
2134
2135         return sample->pid == 0;
2136 }
2137
2138 static void save_task_callchain(struct perf_sched *sched,
2139                                 struct perf_sample *sample,
2140                                 struct evsel *evsel,
2141                                 struct machine *machine)
2142 {
2143         struct callchain_cursor *cursor = &callchain_cursor;
2144         struct thread *thread;
2145
2146         /* want main thread for process - has maps */
2147         thread = machine__findnew_thread(machine, sample->pid, sample->pid);
2148         if (thread == NULL) {
2149                 pr_debug("Failed to get thread for pid %d.\n", sample->pid);
2150                 return;
2151         }
2152
2153         if (!sched->show_callchain || sample->callchain == NULL)
2154                 return;
2155
2156         if (thread__resolve_callchain(thread, cursor, evsel, sample,
2157                                       NULL, NULL, sched->max_stack + 2) != 0) {
2158                 if (verbose > 0)
2159                         pr_err("Failed to resolve callchain. Skipping\n");
2160
2161                 return;
2162         }
2163
2164         callchain_cursor_commit(cursor);
2165
2166         while (true) {
2167                 struct callchain_cursor_node *node;
2168                 struct symbol *sym;
2169
2170                 node = callchain_cursor_current(cursor);
2171                 if (node == NULL)
2172                         break;
2173
2174                 sym = node->sym;
2175                 if (sym) {
2176                         if (!strcmp(sym->name, "schedule") ||
2177                             !strcmp(sym->name, "__schedule") ||
2178                             !strcmp(sym->name, "preempt_schedule"))
2179                                 sym->ignore = 1;
2180                 }
2181
2182                 callchain_cursor_advance(cursor);
2183         }
2184 }
2185
2186 static int init_idle_thread(struct thread *thread)
2187 {
2188         struct idle_thread_runtime *itr;
2189
2190         thread__set_comm(thread, idle_comm, 0);
2191
2192         itr = zalloc(sizeof(*itr));
2193         if (itr == NULL)
2194                 return -ENOMEM;
2195
2196         init_stats(&itr->tr.run_stats);
2197         callchain_init(&itr->callchain);
2198         callchain_cursor_reset(&itr->cursor);
2199         thread__set_priv(thread, itr);
2200
2201         return 0;
2202 }
2203
2204 /*
2205  * Track idle stats per cpu by maintaining a local thread
2206  * struct for the idle task on each cpu.
2207  */
2208 static int init_idle_threads(int ncpu)
2209 {
2210         int i, ret;
2211
2212         idle_threads = zalloc(ncpu * sizeof(struct thread *));
2213         if (!idle_threads)
2214                 return -ENOMEM;
2215
2216         idle_max_cpu = ncpu;
2217
2218         /* allocate the actual thread struct if needed */
2219         for (i = 0; i < ncpu; ++i) {
2220                 idle_threads[i] = thread__new(0, 0);
2221                 if (idle_threads[i] == NULL)
2222                         return -ENOMEM;
2223
2224                 ret = init_idle_thread(idle_threads[i]);
2225                 if (ret < 0)
2226                         return ret;
2227         }
2228
2229         return 0;
2230 }
2231
2232 static void free_idle_threads(void)
2233 {
2234         int i;
2235
2236         if (idle_threads == NULL)
2237                 return;
2238
2239         for (i = 0; i < idle_max_cpu; ++i) {
2240                 if ((idle_threads[i]))
2241                         thread__delete(idle_threads[i]);
2242         }
2243
2244         free(idle_threads);
2245 }
2246
2247 static struct thread *get_idle_thread(int cpu)
2248 {
2249         /*
2250          * expand/allocate array of pointers to local thread
2251          * structs if needed
2252          */
2253         if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
2254                 int i, j = __roundup_pow_of_two(cpu+1);
2255                 void *p;
2256
2257                 p = realloc(idle_threads, j * sizeof(struct thread *));
2258                 if (!p)
2259                         return NULL;
2260
2261                 idle_threads = (struct thread **) p;
2262                 for (i = idle_max_cpu; i < j; ++i)
2263                         idle_threads[i] = NULL;
2264
2265                 idle_max_cpu = j;
2266         }
2267
2268         /* allocate a new thread struct if needed */
2269         if (idle_threads[cpu] == NULL) {
2270                 idle_threads[cpu] = thread__new(0, 0);
2271
2272                 if (idle_threads[cpu]) {
2273                         if (init_idle_thread(idle_threads[cpu]) < 0)
2274                                 return NULL;
2275                 }
2276         }
2277
2278         return idle_threads[cpu];
2279 }
2280
2281 static void save_idle_callchain(struct perf_sched *sched,
2282                                 struct idle_thread_runtime *itr,
2283                                 struct perf_sample *sample)
2284 {
2285         if (!sched->show_callchain || sample->callchain == NULL)
2286                 return;
2287
2288         callchain_cursor__copy(&itr->cursor, &callchain_cursor);
2289 }
2290
2291 static struct thread *timehist_get_thread(struct perf_sched *sched,
2292                                           struct perf_sample *sample,
2293                                           struct machine *machine,
2294                                           struct evsel *evsel)
2295 {
2296         struct thread *thread;
2297
2298         if (is_idle_sample(sample, evsel)) {
2299                 thread = get_idle_thread(sample->cpu);
2300                 if (thread == NULL)
2301                         pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2302
2303         } else {
2304                 /* there were samples with tid 0 but non-zero pid */
2305                 thread = machine__findnew_thread(machine, sample->pid,
2306                                                  sample->tid ?: sample->pid);
2307                 if (thread == NULL) {
2308                         pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2309                                  sample->tid);
2310                 }
2311
2312                 save_task_callchain(sched, sample, evsel, machine);
2313                 if (sched->idle_hist) {
2314                         struct thread *idle;
2315                         struct idle_thread_runtime *itr;
2316
2317                         idle = get_idle_thread(sample->cpu);
2318                         if (idle == NULL) {
2319                                 pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2320                                 return NULL;
2321                         }
2322
2323                         itr = thread__priv(idle);
2324                         if (itr == NULL)
2325                                 return NULL;
2326
2327                         itr->last_thread = thread;
2328
2329                         /* copy task callchain when entering to idle */
2330                         if (perf_evsel__intval(evsel, sample, "next_pid") == 0)
2331                                 save_idle_callchain(sched, itr, sample);
2332                 }
2333         }
2334
2335         return thread;
2336 }
2337
2338 static bool timehist_skip_sample(struct perf_sched *sched,
2339                                  struct thread *thread,
2340                                  struct evsel *evsel,
2341                                  struct perf_sample *sample)
2342 {
2343         bool rc = false;
2344
2345         if (thread__is_filtered(thread)) {
2346                 rc = true;
2347                 sched->skipped_samples++;
2348         }
2349
2350         if (sched->idle_hist) {
2351                 if (strcmp(perf_evsel__name(evsel), "sched:sched_switch"))
2352                         rc = true;
2353                 else if (perf_evsel__intval(evsel, sample, "prev_pid") != 0 &&
2354                          perf_evsel__intval(evsel, sample, "next_pid") != 0)
2355                         rc = true;
2356         }
2357
2358         return rc;
2359 }
2360
2361 static void timehist_print_wakeup_event(struct perf_sched *sched,
2362                                         struct evsel *evsel,
2363                                         struct perf_sample *sample,
2364                                         struct machine *machine,
2365                                         struct thread *awakened)
2366 {
2367         struct thread *thread;
2368         char tstr[64];
2369
2370         thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2371         if (thread == NULL)
2372                 return;
2373
2374         /* show wakeup unless both awakee and awaker are filtered */
2375         if (timehist_skip_sample(sched, thread, evsel, sample) &&
2376             timehist_skip_sample(sched, awakened, evsel, sample)) {
2377                 return;
2378         }
2379
2380         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2381         printf("%15s [%04d] ", tstr, sample->cpu);
2382         if (sched->show_cpu_visual)
2383                 printf(" %*s ", sched->max_cpu + 1, "");
2384
2385         printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2386
2387         /* dt spacer */
2388         printf("  %9s  %9s  %9s ", "", "", "");
2389
2390         printf("awakened: %s", timehist_get_commstr(awakened));
2391
2392         printf("\n");
2393 }
2394
2395 static int timehist_sched_wakeup_event(struct perf_tool *tool,
2396                                        union perf_event *event __maybe_unused,
2397                                        struct evsel *evsel,
2398                                        struct perf_sample *sample,
2399                                        struct machine *machine)
2400 {
2401         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2402         struct thread *thread;
2403         struct thread_runtime *tr = NULL;
2404         /* want pid of awakened task not pid in sample */
2405         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
2406
2407         thread = machine__findnew_thread(machine, 0, pid);
2408         if (thread == NULL)
2409                 return -1;
2410
2411         tr = thread__get_runtime(thread);
2412         if (tr == NULL)
2413                 return -1;
2414
2415         if (tr->ready_to_run == 0)
2416                 tr->ready_to_run = sample->time;
2417
2418         /* show wakeups if requested */
2419         if (sched->show_wakeups &&
2420             !perf_time__skip_sample(&sched->ptime, sample->time))
2421                 timehist_print_wakeup_event(sched, evsel, sample, machine, thread);
2422
2423         return 0;
2424 }
2425
2426 static void timehist_print_migration_event(struct perf_sched *sched,
2427                                         struct evsel *evsel,
2428                                         struct perf_sample *sample,
2429                                         struct machine *machine,
2430                                         struct thread *migrated)
2431 {
2432         struct thread *thread;
2433         char tstr[64];
2434         u32 max_cpus = sched->max_cpu + 1;
2435         u32 ocpu, dcpu;
2436
2437         if (sched->summary_only)
2438                 return;
2439
2440         max_cpus = sched->max_cpu + 1;
2441         ocpu = perf_evsel__intval(evsel, sample, "orig_cpu");
2442         dcpu = perf_evsel__intval(evsel, sample, "dest_cpu");
2443
2444         thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2445         if (thread == NULL)
2446                 return;
2447
2448         if (timehist_skip_sample(sched, thread, evsel, sample) &&
2449             timehist_skip_sample(sched, migrated, evsel, sample)) {
2450                 return;
2451         }
2452
2453         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2454         printf("%15s [%04d] ", tstr, sample->cpu);
2455
2456         if (sched->show_cpu_visual) {
2457                 u32 i;
2458                 char c;
2459
2460                 printf("  ");
2461                 for (i = 0; i < max_cpus; ++i) {
2462                         c = (i == sample->cpu) ? 'm' : ' ';
2463                         printf("%c", c);
2464                 }
2465                 printf("  ");
2466         }
2467
2468         printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2469
2470         /* dt spacer */
2471         printf("  %9s  %9s  %9s ", "", "", "");
2472
2473         printf("migrated: %s", timehist_get_commstr(migrated));
2474         printf(" cpu %d => %d", ocpu, dcpu);
2475
2476         printf("\n");
2477 }
2478
2479 static int timehist_migrate_task_event(struct perf_tool *tool,
2480                                        union perf_event *event __maybe_unused,
2481                                        struct evsel *evsel,
2482                                        struct perf_sample *sample,
2483                                        struct machine *machine)
2484 {
2485         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2486         struct thread *thread;
2487         struct thread_runtime *tr = NULL;
2488         /* want pid of migrated task not pid in sample */
2489         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
2490
2491         thread = machine__findnew_thread(machine, 0, pid);
2492         if (thread == NULL)
2493                 return -1;
2494
2495         tr = thread__get_runtime(thread);
2496         if (tr == NULL)
2497                 return -1;
2498
2499         tr->migrations++;
2500
2501         /* show migrations if requested */
2502         timehist_print_migration_event(sched, evsel, sample, machine, thread);
2503
2504         return 0;
2505 }
2506
2507 static int timehist_sched_change_event(struct perf_tool *tool,
2508                                        union perf_event *event,
2509                                        struct evsel *evsel,
2510                                        struct perf_sample *sample,
2511                                        struct machine *machine)
2512 {
2513         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2514         struct perf_time_interval *ptime = &sched->ptime;
2515         struct addr_location al;
2516         struct thread *thread;
2517         struct thread_runtime *tr = NULL;
2518         u64 tprev, t = sample->time;
2519         int rc = 0;
2520         int state = perf_evsel__intval(evsel, sample, "prev_state");
2521
2522
2523         if (machine__resolve(machine, &al, sample) < 0) {
2524                 pr_err("problem processing %d event. skipping it\n",
2525                        event->header.type);
2526                 rc = -1;
2527                 goto out;
2528         }
2529
2530         thread = timehist_get_thread(sched, sample, machine, evsel);
2531         if (thread == NULL) {
2532                 rc = -1;
2533                 goto out;
2534         }
2535
2536         if (timehist_skip_sample(sched, thread, evsel, sample))
2537                 goto out;
2538
2539         tr = thread__get_runtime(thread);
2540         if (tr == NULL) {
2541                 rc = -1;
2542                 goto out;
2543         }
2544
2545         tprev = perf_evsel__get_time(evsel, sample->cpu);
2546
2547         /*
2548          * If start time given:
2549          * - sample time is under window user cares about - skip sample
2550          * - tprev is under window user cares about  - reset to start of window
2551          */
2552         if (ptime->start && ptime->start > t)
2553                 goto out;
2554
2555         if (tprev && ptime->start > tprev)
2556                 tprev = ptime->start;
2557
2558         /*
2559          * If end time given:
2560          * - previous sched event is out of window - we are done
2561          * - sample time is beyond window user cares about - reset it
2562          *   to close out stats for time window interest
2563          */
2564         if (ptime->end) {
2565                 if (tprev > ptime->end)
2566                         goto out;
2567
2568                 if (t > ptime->end)
2569                         t = ptime->end;
2570         }
2571
2572         if (!sched->idle_hist || thread->tid == 0) {
2573                 timehist_update_runtime_stats(tr, t, tprev);
2574
2575                 if (sched->idle_hist) {
2576                         struct idle_thread_runtime *itr = (void *)tr;
2577                         struct thread_runtime *last_tr;
2578
2579                         BUG_ON(thread->tid != 0);
2580
2581                         if (itr->last_thread == NULL)
2582                                 goto out;
2583
2584                         /* add current idle time as last thread's runtime */
2585                         last_tr = thread__get_runtime(itr->last_thread);
2586                         if (last_tr == NULL)
2587                                 goto out;
2588
2589                         timehist_update_runtime_stats(last_tr, t, tprev);
2590                         /*
2591                          * remove delta time of last thread as it's not updated
2592                          * and otherwise it will show an invalid value next
2593                          * time.  we only care total run time and run stat.
2594                          */
2595                         last_tr->dt_run = 0;
2596                         last_tr->dt_delay = 0;
2597                         last_tr->dt_sleep = 0;
2598                         last_tr->dt_iowait = 0;
2599                         last_tr->dt_preempt = 0;
2600
2601                         if (itr->cursor.nr)
2602                                 callchain_append(&itr->callchain, &itr->cursor, t - tprev);
2603
2604                         itr->last_thread = NULL;
2605                 }
2606         }
2607
2608         if (!sched->summary_only)
2609                 timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
2610
2611 out:
2612         if (sched->hist_time.start == 0 && t >= ptime->start)
2613                 sched->hist_time.start = t;
2614         if (ptime->end == 0 || t <= ptime->end)
2615                 sched->hist_time.end = t;
2616
2617         if (tr) {
2618                 /* time of this sched_switch event becomes last time task seen */
2619                 tr->last_time = sample->time;
2620
2621                 /* last state is used to determine where to account wait time */
2622                 tr->last_state = state;
2623
2624                 /* sched out event for task so reset ready to run time */
2625                 tr->ready_to_run = 0;
2626         }
2627
2628         perf_evsel__save_time(evsel, sample->time, sample->cpu);
2629
2630         return rc;
2631 }
2632
/*
 * Per-evsel handler installed for "sched:sched_switch" in timehist mode.
 *
 * All arguments are forwarded unchanged to timehist_sched_change_event()
 * and its return value is handed back to the caller.  @machine is used
 * (it is passed on), so it carries no __maybe_unused annotation.
 */
static int timehist_sched_switch_event(struct perf_tool *tool,
				       union perf_event *event,
				       struct evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)
{
	return timehist_sched_change_event(tool, event, evsel, sample, machine);
}
2641
2642 static int process_lost(struct perf_tool *tool __maybe_unused,
2643                         union perf_event *event,
2644                         struct perf_sample *sample,
2645                         struct machine *machine __maybe_unused)
2646 {
2647         char tstr[64];
2648
2649         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2650         printf("%15s ", tstr);
2651         printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
2652
2653         return 0;
2654 }
2655
2656
2657 static void print_thread_runtime(struct thread *t,
2658                                  struct thread_runtime *r)
2659 {
2660         double mean = avg_stats(&r->run_stats);
2661         float stddev;
2662
2663         printf("%*s   %5d  %9" PRIu64 " ",
2664                comm_width, timehist_get_commstr(t), t->ppid,
2665                (u64) r->run_stats.n);
2666
2667         print_sched_time(r->total_run_time, 8);
2668         stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
2669         print_sched_time(r->run_stats.min, 6);
2670         printf(" ");
2671         print_sched_time((u64) mean, 6);
2672         printf(" ");
2673         print_sched_time(r->run_stats.max, 6);
2674         printf("  ");
2675         printf("%5.2f", stddev);
2676         printf("   %5" PRIu64, r->migrations);
2677         printf("\n");
2678 }
2679
2680 static void print_thread_waittime(struct thread *t,
2681                                   struct thread_runtime *r)
2682 {
2683         printf("%*s   %5d  %9" PRIu64 " ",
2684                comm_width, timehist_get_commstr(t), t->ppid,
2685                (u64) r->run_stats.n);
2686
2687         print_sched_time(r->total_run_time, 8);
2688         print_sched_time(r->total_sleep_time, 6);
2689         printf(" ");
2690         print_sched_time(r->total_iowait_time, 6);
2691         printf(" ");
2692         print_sched_time(r->total_preempt_time, 6);
2693         printf(" ");
2694         print_sched_time(r->total_delay_time, 6);
2695         printf("\n");
2696 }
2697
/*
 * Running totals collected while the timehist summary is printed.  A
 * pointer to this struct is passed as the priv argument of the
 * per-thread summary callbacks.
 */
struct total_run_stats {
        /* settings consulted by the callbacks (e.g. show_state) */
        struct perf_sched *sched;
        /* sum of run_stats.n over reported threads (idle threads are added by the summary printer) */
        u64  sched_count;
        /* number of threads that had at least one run sample and were reported */
        u64  task_count;
        /* sum of total_run_time over the reported threads */
        u64  total_run_time;
};
2704
2705 static int __show_thread_runtime(struct thread *t, void *priv)
2706 {
2707         struct total_run_stats *stats = priv;
2708         struct thread_runtime *r;
2709
2710         if (thread__is_filtered(t))
2711                 return 0;
2712
2713         r = thread__priv(t);
2714         if (r && r->run_stats.n) {
2715                 stats->task_count++;
2716                 stats->sched_count += r->run_stats.n;
2717                 stats->total_run_time += r->total_run_time;
2718
2719                 if (stats->sched->show_state)
2720                         print_thread_waittime(t, r);
2721                 else
2722                         print_thread_runtime(t, r);
2723         }
2724
2725         return 0;
2726 }
2727
2728 static int show_thread_runtime(struct thread *t, void *priv)
2729 {
2730         if (t->dead)
2731                 return 0;
2732
2733         return __show_thread_runtime(t, priv);
2734 }
2735
2736 static int show_deadthread_runtime(struct thread *t, void *priv)
2737 {
2738         if (!t->dead)
2739                 return 0;
2740
2741         return __show_thread_runtime(t, priv);
2742 }
2743
2744 static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node)
2745 {
2746         const char *sep = " <- ";
2747         struct callchain_list *chain;
2748         size_t ret = 0;
2749         char bf[1024];
2750         bool first;
2751
2752         if (node == NULL)
2753                 return 0;
2754
2755         ret = callchain__fprintf_folded(fp, node->parent);
2756         first = (ret == 0);
2757
2758         list_for_each_entry(chain, &node->val, list) {
2759                 if (chain->ip >= PERF_CONTEXT_MAX)
2760                         continue;
2761                 if (chain->ms.sym && chain->ms.sym->ignore)
2762                         continue;
2763                 ret += fprintf(fp, "%s%s", first ? "" : sep,
2764                                callchain_list__sym_name(chain, bf, sizeof(bf),
2765                                                         false));
2766                 first = false;
2767         }
2768
2769         return ret;
2770 }
2771
2772 static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root)
2773 {
2774         size_t ret = 0;
2775         FILE *fp = stdout;
2776         struct callchain_node *chain;
2777         struct rb_node *rb_node = rb_first_cached(root);
2778
2779         printf("  %16s  %8s  %s\n", "Idle time (msec)", "Count", "Callchains");
2780         printf("  %.16s  %.8s  %.50s\n", graph_dotted_line, graph_dotted_line,
2781                graph_dotted_line);
2782
2783         while (rb_node) {
2784                 chain = rb_entry(rb_node, struct callchain_node, rb_node);
2785                 rb_node = rb_next(rb_node);
2786
2787                 ret += fprintf(fp, "  ");
2788                 print_sched_time(chain->hit, 12);
2789                 ret += 16;  /* print_sched_time returns 2nd arg + 4 */
2790                 ret += fprintf(fp, " %8d  ", chain->count);
2791                 ret += callchain__fprintf_folded(fp, chain);
2792                 ret += fprintf(fp, "\n");
2793         }
2794
2795         return ret;
2796 }
2797
2798 static void timehist_print_summary(struct perf_sched *sched,
2799                                    struct perf_session *session)
2800 {
2801         struct machine *m = &session->machines.host;
2802         struct total_run_stats totals;
2803         u64 task_count;
2804         struct thread *t;
2805         struct thread_runtime *r;
2806         int i;
2807         u64 hist_time = sched->hist_time.end - sched->hist_time.start;
2808
2809         memset(&totals, 0, sizeof(totals));
2810         totals.sched = sched;
2811
2812         if (sched->idle_hist) {
2813                 printf("\nIdle-time summary\n");
2814                 printf("%*s  parent  sched-out  ", comm_width, "comm");
2815                 printf("  idle-time   min-idle    avg-idle    max-idle  stddev  migrations\n");
2816         } else if (sched->show_state) {
2817                 printf("\nWait-time summary\n");
2818                 printf("%*s  parent   sched-in  ", comm_width, "comm");
2819                 printf("   run-time      sleep      iowait     preempt       delay\n");
2820         } else {
2821                 printf("\nRuntime summary\n");
2822                 printf("%*s  parent   sched-in  ", comm_width, "comm");
2823                 printf("   run-time    min-run     avg-run     max-run  stddev  migrations\n");
2824         }
2825         printf("%*s            (count)  ", comm_width, "");
2826         printf("     (msec)     (msec)      (msec)      (msec)       %s\n",
2827                sched->show_state ? "(msec)" : "%");
2828         printf("%.117s\n", graph_dotted_line);
2829
2830         machine__for_each_thread(m, show_thread_runtime, &totals);
2831         task_count = totals.task_count;
2832         if (!task_count)
2833                 printf("<no still running tasks>\n");
2834
2835         printf("\nTerminated tasks:\n");
2836         machine__for_each_thread(m, show_deadthread_runtime, &totals);
2837         if (task_count == totals.task_count)
2838                 printf("<no terminated tasks>\n");
2839
2840         /* CPU idle stats not tracked when samples were skipped */
2841         if (sched->skipped_samples && !sched->idle_hist)
2842                 return;
2843
2844         printf("\nIdle stats:\n");
2845         for (i = 0; i < idle_max_cpu; ++i) {
2846                 t = idle_threads[i];
2847                 if (!t)
2848                         continue;
2849
2850                 r = thread__priv(t);
2851                 if (r && r->run_stats.n) {
2852                         totals.sched_count += r->run_stats.n;
2853                         printf("    CPU %2d idle for ", i);
2854                         print_sched_time(r->total_run_time, 6);
2855                         printf(" msec  (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
2856                 } else
2857                         printf("    CPU %2d idle entire time window\n", i);
2858         }
2859
2860         if (sched->idle_hist && sched->show_callchain) {
2861                 callchain_param.mode  = CHAIN_FOLDED;
2862                 callchain_param.value = CCVAL_PERIOD;
2863
2864                 callchain_register_param(&callchain_param);
2865
2866                 printf("\nIdle stats by callchain:\n");
2867                 for (i = 0; i < idle_max_cpu; ++i) {
2868                         struct idle_thread_runtime *itr;
2869
2870                         t = idle_threads[i];
2871                         if (!t)
2872                                 continue;
2873
2874                         itr = thread__priv(t);
2875                         if (itr == NULL)
2876                                 continue;
2877
2878                         callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain,
2879                                              0, &callchain_param);
2880
2881                         printf("  CPU %2d:", i);
2882                         print_sched_time(itr->tr.total_run_time, 6);
2883                         printf(" msec\n");
2884                         timehist_print_idlehist_callchain(&itr->sorted_root);
2885                         printf("\n");
2886                 }
2887         }
2888
2889         printf("\n"
2890                "    Total number of unique tasks: %" PRIu64 "\n"
2891                "Total number of context switches: %" PRIu64 "\n",
2892                totals.task_count, totals.sched_count);
2893
2894         printf("           Total run time (msec): ");
2895         print_sched_time(totals.total_run_time, 2);
2896         printf("\n");
2897
2898         printf("    Total scheduling time (msec): ");
2899         print_sched_time(hist_time, 2);
2900         printf(" (x %d)\n", sched->max_cpu);
2901 }
2902
2903 typedef int (*sched_handler)(struct perf_tool *tool,
2904                           union perf_event *event,
2905                           struct evsel *evsel,
2906                           struct perf_sample *sample,
2907                           struct machine *machine);
2908
2909 static int perf_timehist__process_sample(struct perf_tool *tool,
2910                                          union perf_event *event,
2911                                          struct perf_sample *sample,
2912                                          struct evsel *evsel,
2913                                          struct machine *machine)
2914 {
2915         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2916         int err = 0;
2917         int this_cpu = sample->cpu;
2918
2919         if (this_cpu > sched->max_cpu)
2920                 sched->max_cpu = this_cpu;
2921
2922         if (evsel->handler != NULL) {
2923                 sched_handler f = evsel->handler;
2924
2925                 err = f(tool, event, evsel, sample, machine);
2926         }
2927
2928         return err;
2929 }
2930
2931 static int timehist_check_attr(struct perf_sched *sched,
2932                                struct evlist *evlist)
2933 {
2934         struct evsel *evsel;
2935         struct evsel_runtime *er;
2936
2937         list_for_each_entry(evsel, &evlist->core.entries, core.node) {
2938                 er = perf_evsel__get_runtime(evsel);
2939                 if (er == NULL) {
2940                         pr_err("Failed to allocate memory for evsel runtime data\n");
2941                         return -1;
2942                 }
2943
2944                 if (sched->show_callchain && !evsel__has_callchain(evsel)) {
2945                         pr_info("Samples do not have callchains.\n");
2946                         sched->show_callchain = 0;
2947                         symbol_conf.use_callchain = 0;
2948                 }
2949         }
2950
2951         return 0;
2952 }
2953
2954 static int perf_sched__timehist(struct perf_sched *sched)
2955 {
2956         const struct evsel_str_handler handlers[] = {
2957                 { "sched:sched_switch",       timehist_sched_switch_event, },
2958                 { "sched:sched_wakeup",       timehist_sched_wakeup_event, },
2959                 { "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
2960         };
2961         const struct evsel_str_handler migrate_handlers[] = {
2962                 { "sched:sched_migrate_task", timehist_migrate_task_event, },
2963         };
2964         struct perf_data data = {
2965                 .path  = input_name,
2966                 .mode  = PERF_DATA_MODE_READ,
2967                 .force = sched->force,
2968         };
2969
2970         struct perf_session *session;
2971         struct evlist *evlist;
2972         int err = -1;
2973
2974         /*
2975          * event handlers for timehist option
2976          */
2977         sched->tool.sample       = perf_timehist__process_sample;
2978         sched->tool.mmap         = perf_event__process_mmap;
2979         sched->tool.comm         = perf_event__process_comm;
2980         sched->tool.exit         = perf_event__process_exit;
2981         sched->tool.fork         = perf_event__process_fork;
2982         sched->tool.lost         = process_lost;
2983         sched->tool.attr         = perf_event__process_attr;
2984         sched->tool.tracing_data = perf_event__process_tracing_data;
2985         sched->tool.build_id     = perf_event__process_build_id;
2986
2987         sched->tool.ordered_events = true;
2988         sched->tool.ordering_requires_timestamps = true;
2989
2990         symbol_conf.use_callchain = sched->show_callchain;
2991
2992         session = perf_session__new(&data, false, &sched->tool);
2993         if (IS_ERR(session))
2994                 return PTR_ERR(session);
2995
2996         evlist = session->evlist;
2997
2998         symbol__init(&session->header.env);
2999
3000         if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
3001                 pr_err("Invalid time string\n");
3002                 return -EINVAL;
3003         }
3004
3005         if (timehist_check_attr(sched, evlist) != 0)
3006                 goto out;
3007
3008         setup_pager();
3009
3010         /* setup per-evsel handlers */
3011         if (perf_session__set_tracepoints_handlers(session, handlers))
3012                 goto out;
3013
3014         /* sched_switch event at a minimum needs to exist */
3015         if (!perf_evlist__find_tracepoint_by_name(session->evlist,
3016                                                   "sched:sched_switch")) {
3017                 pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
3018                 goto out;
3019         }
3020
3021         if (sched->show_migrations &&
3022             perf_session__set_tracepoints_handlers(session, migrate_handlers))
3023                 goto out;
3024
3025         /* pre-allocate struct for per-CPU idle stats */
3026         sched->max_cpu = session->header.env.nr_cpus_online;
3027         if (sched->max_cpu == 0)
3028                 sched->max_cpu = 4;
3029         if (init_idle_threads(sched->max_cpu))
3030                 goto out;
3031
3032         /* summary_only implies summary option, but don't overwrite summary if set */
3033         if (sched->summary_only)
3034                 sched->summary = sched->summary_only;
3035
3036         if (!sched->summary_only)
3037                 timehist_header(sched);
3038
3039         err = perf_session__process_events(session);
3040         if (err) {
3041                 pr_err("Failed to process events, error %d", err);
3042                 goto out;
3043         }
3044
3045         sched->nr_events      = evlist->stats.nr_events[0];
3046         sched->nr_lost_events = evlist->stats.total_lost;
3047         sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
3048
3049         if (sched->summary)
3050                 timehist_print_summary(sched, session);
3051
3052 out:
3053         free_idle_threads();
3054         perf_session__delete(session);
3055
3056         return err;
3057 }
3058
3059
3060 static void print_bad_events(struct perf_sched *sched)
3061 {
3062         if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
3063                 printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
3064                         (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
3065                         sched->nr_unordered_timestamps, sched->nr_timestamps);
3066         }
3067         if (sched->nr_lost_events && sched->nr_events) {
3068                 printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
3069                         (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
3070                         sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
3071         }
3072         if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
3073                 printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
3074                         (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
3075                         sched->nr_context_switch_bugs, sched->nr_timestamps);
3076                 if (sched->nr_lost_events)
3077                         printf(" (due to lost events?)");
3078                 printf("\n");
3079         }
3080 }
3081
3082 static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data)
3083 {
3084         struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
3085         struct work_atoms *this;
3086         const char *comm = thread__comm_str(data->thread), *this_comm;
3087         bool leftmost = true;
3088
3089         while (*new) {
3090                 int cmp;
3091
3092                 this = container_of(*new, struct work_atoms, node);
3093                 parent = *new;
3094
3095                 this_comm = thread__comm_str(this->thread);
3096                 cmp = strcmp(comm, this_comm);
3097                 if (cmp > 0) {
3098                         new = &((*new)->rb_left);
3099                 } else if (cmp < 0) {
3100                         new = &((*new)->rb_right);
3101                         leftmost = false;
3102                 } else {
3103                         this->num_merged++;
3104                         this->total_runtime += data->total_runtime;
3105                         this->nb_atoms += data->nb_atoms;
3106                         this->total_lat += data->total_lat;
3107                         list_splice(&data->work_list, &this->work_list);
3108                         if (this->max_lat < data->max_lat) {
3109                                 this->max_lat = data->max_lat;
3110                                 this->max_lat_at = data->max_lat_at;
3111                         }
3112                         zfree(&data);
3113                         return;
3114                 }
3115         }
3116
3117         data->num_merged++;
3118         rb_link_node(&data->node, parent, new);
3119         rb_insert_color_cached(&data->node, root, leftmost);
3120 }
3121
3122 static void perf_sched__merge_lat(struct perf_sched *sched)
3123 {
3124         struct work_atoms *data;
3125         struct rb_node *node;
3126
3127         if (sched->skip_merge)
3128                 return;
3129
3130         while ((node = rb_first_cached(&sched->atom_root))) {
3131                 rb_erase_cached(node, &sched->atom_root);
3132                 data = rb_entry(node, struct work_atoms, node);
3133                 __merge_work_atoms(&sched->merged_atom_root, data);
3134         }
3135 }
3136
3137 static int perf_sched__lat(struct perf_sched *sched)
3138 {
3139         struct rb_node *next;
3140
3141         setup_pager();
3142
3143         if (perf_sched__read_events(sched))
3144                 return -1;
3145
3146         perf_sched__merge_lat(sched);
3147         perf_sched__sort_lat(sched);
3148
3149         printf("\n -----------------------------------------------------------------------------------------------------------------\n");
3150         printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
3151         printf(" -----------------------------------------------------------------------------------------------------------------\n");
3152
3153         next = rb_first_cached(&sched->sorted_atom_root);
3154
3155         while (next) {
3156                 struct work_atoms *work_list;
3157
3158                 work_list = rb_entry(next, struct work_atoms, node);
3159                 output_lat_thread(sched, work_list);
3160                 next = rb_next(next);
3161                 thread__zput(work_list->thread);
3162         }
3163
3164         printf(" -----------------------------------------------------------------------------------------------------------------\n");
3165         printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
3166                 (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
3167
3168         printf(" ---------------------------------------------------\n");
3169
3170         print_bad_events(sched);
3171         printf("\n");
3172
3173         return 0;
3174 }
3175
3176 static int setup_map_cpus(struct perf_sched *sched)
3177 {
3178         struct perf_cpu_map *map;
3179
3180         sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
3181
3182         if (sched->map.comp) {
3183                 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
3184                 if (!sched->map.comp_cpus)
3185                         return -1;
3186         }
3187
3188         if (!sched->map.cpus_str)
3189                 return 0;
3190
3191         map = perf_cpu_map__new(sched->map.cpus_str);
3192         if (!map) {
3193                 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
3194                 return -1;
3195         }
3196
3197         sched->map.cpus = map;
3198         return 0;
3199 }
3200
3201 static int setup_color_pids(struct perf_sched *sched)
3202 {
3203         struct perf_thread_map *map;
3204
3205         if (!sched->map.color_pids_str)
3206                 return 0;
3207
3208         map = thread_map__new_by_tid_str(sched->map.color_pids_str);
3209         if (!map) {
3210                 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
3211                 return -1;
3212         }
3213
3214         sched->map.color_pids = map;
3215         return 0;
3216 }
3217
3218 static int setup_color_cpus(struct perf_sched *sched)
3219 {
3220         struct perf_cpu_map *map;
3221
3222         if (!sched->map.color_cpus_str)
3223                 return 0;
3224
3225         map = perf_cpu_map__new(sched->map.color_cpus_str);
3226         if (!map) {
3227                 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
3228                 return -1;
3229         }
3230
3231         sched->map.color_cpus = map;
3232         return 0;
3233 }
3234
/*
 * Entry point of 'perf sched map'.
 *
 * Runs the three setup steps (CPU map, coloured pids, coloured CPUs) in
 * that order, stopping at the first failure, then reads the events and
 * prints the bad-event notes.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int perf_sched__map(struct perf_sched *sched)
{
	if (setup_map_cpus(sched) ||
	    setup_color_pids(sched) ||
	    setup_color_cpus(sched))
		return -1;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	print_bad_events(sched);

	return 0;
}
3252
3253 static int perf_sched__replay(struct perf_sched *sched)
3254 {
3255         unsigned long i;
3256
3257         calibrate_run_measurement_overhead(sched);
3258         calibrate_sleep_measurement_overhead(sched);
3259
3260         test_calibrations(sched);
3261
3262         if (perf_sched__read_events(sched))
3263                 return -1;
3264
3265         printf("nr_run_events:        %ld\n", sched->nr_run_events);
3266         printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
3267         printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
3268
3269         if (sched->targetless_wakeups)
3270                 printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
3271         if (sched->multitarget_wakeups)
3272                 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
3273         if (sched->nr_run_events_optimized)
3274                 printf("run atoms optimized: %ld\n",
3275                         sched->nr_run_events_optimized);
3276
3277         print_task_traces(sched);
3278         add_cross_task_wakeups(sched);
3279
3280         create_tasks(sched);
3281         printf("------------------------------------------------------------\n");
3282         for (i = 0; i < sched->replay_repeat; i++)
3283                 run_one_test(sched);
3284
3285         return 0;
3286 }
3287
3288 static void setup_sorting(struct perf_sched *sched, const struct option *options,
3289                           const char * const usage_msg[])
3290 {
3291         char *tmp, *tok, *str = strdup(sched->sort_order);
3292
3293         for (tok = strtok_r(str, ", ", &tmp);
3294                         tok; tok = strtok_r(NULL, ", ", &tmp)) {
3295                 if (sort_dimension__add(tok, &sched->sort_list) < 0) {
3296                         usage_with_options_msg(usage_msg, options,
3297                                         "Unknown --sort key: `%s'", tok);
3298                 }
3299         }
3300
3301         free(str);
3302
3303         sort_dimension__add("pid", &sched->cmp_pid);
3304 }
3305
/*
 * Implementation of 'perf sched record'.
 *
 * Builds an argument vector made of the fixed record options below
 * followed by the caller's arguments (argv[1..argc-1]) and hands it to
 * cmd_record().
 *
 * The fixed options are duplicated with strdup().  The duplicates are
 * tracked in a separate array so they can be released afterwards even
 * if cmd_record() rearranges the vector it is given.
 *
 * Returns the result of cmd_record(), or -ENOMEM when an allocation
 * fails.
 */
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};
	/* duplicates owned by this function, one per fixed option */
	char *owned[sizeof(record_args) / sizeof(record_args[0])];
	int ret;

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++) {
		owned[i] = strdup(record_args[i]);
		if (owned[i] == NULL) {
			/* owned[0..i-1] are released below */
			ret = -ENOMEM;
			goto out_free;
		}
		rec_argv[i] = owned[i];
	}

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	ret = cmd_record(rec_argc, rec_argv);

	i = ARRAY_SIZE(record_args);
out_free:
	while (i-- > 0)
		free(owned[i]);
	free(rec_argv);

	return ret;
}
3343
3344 int cmd_sched(int argc, const char **argv)
3345 {
3346         static const char default_sort_order[] = "avg, max, switch, runtime";
3347         struct perf_sched sched = {
3348                 .tool = {
3349                         .sample          = perf_sched__process_tracepoint_sample,
3350                         .comm            = perf_sched__process_comm,
3351                         .namespaces      = perf_event__process_namespaces,
3352                         .lost            = perf_event__process_lost,
3353                         .fork            = perf_sched__process_fork_event,
3354                         .ordered_events = true,
3355                 },
3356                 .cmp_pid              = LIST_HEAD_INIT(sched.cmp_pid),
3357                 .sort_list            = LIST_HEAD_INIT(sched.sort_list),
3358                 .start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
3359                 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
3360                 .sort_order           = default_sort_order,
3361                 .replay_repeat        = 10,
3362                 .profile_cpu          = -1,
3363                 .next_shortname1      = 'A',
3364                 .next_shortname2      = '0',
3365                 .skip_merge           = 0,
3366                 .show_callchain       = 1,
3367                 .max_stack            = 5,
3368         };
3369         const struct option sched_options[] = {
3370         OPT_STRING('i', "input", &input_name, "file",
3371                     "input file name"),
3372         OPT_INCR('v', "verbose", &verbose,
3373                     "be more verbose (show symbol address, etc)"),
3374         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
3375                     "dump raw trace in ASCII"),
3376         OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
3377         OPT_END()
3378         };
3379         const struct option latency_options[] = {
3380         OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
3381                    "sort by key(s): runtime, switch, avg, max"),
3382         OPT_INTEGER('C', "CPU", &sched.profile_cpu,
3383                     "CPU to profile on"),
3384         OPT_BOOLEAN('p', "pids", &sched.skip_merge,
3385                     "latency stats per pid instead of per comm"),
3386         OPT_PARENT(sched_options)
3387         };
3388         const struct option replay_options[] = {
3389         OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
3390                      "repeat the workload replay N times (-1: infinite)"),
3391         OPT_PARENT(sched_options)
3392         };
3393         const struct option map_options[] = {
3394         OPT_BOOLEAN(0, "compact", &sched.map.comp,
3395                     "map output in compact mode"),
3396         OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
3397                    "highlight given pids in map"),
3398         OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
3399                     "highlight given CPUs in map"),
3400         OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
3401                     "display given CPUs in map"),
3402         OPT_PARENT(sched_options)
3403         };
3404         const struct option timehist_options[] = {
3405         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
3406                    "file", "vmlinux pathname"),
3407         OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
3408                    "file", "kallsyms pathname"),
3409         OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
3410                     "Display call chains if present (default on)"),
3411         OPT_UINTEGER(0, "max-stack", &sched.max_stack,
3412                    "Maximum number of functions to display backtrace."),
3413         OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
3414                     "Look for files with symbols relative to this directory"),
3415         OPT_BOOLEAN('s', "summary", &sched.summary_only,
3416                     "Show only syscall summary with statistics"),
3417         OPT_BOOLEAN('S', "with-summary", &sched.summary,
3418                     "Show all syscalls and summary with statistics"),
3419         OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
3420         OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
3421         OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
3422         OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
3423         OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
3424         OPT_STRING(0, "time", &sched.time_str, "str",
3425                    "Time span for analysis (start,stop)"),
3426         OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
3427         OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
3428                    "analyze events only for given process id(s)"),
3429         OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
3430                    "analyze events only for given thread id(s)"),
3431         OPT_PARENT(sched_options)
3432         };
3433
3434         const char * const latency_usage[] = {
3435                 "perf sched latency [<options>]",
3436                 NULL
3437         };
3438         const char * const replay_usage[] = {
3439                 "perf sched replay [<options>]",
3440                 NULL
3441         };
3442         const char * const map_usage[] = {
3443                 "perf sched map [<options>]",
3444                 NULL
3445         };
3446         const char * const timehist_usage[] = {
3447                 "perf sched timehist [<options>]",
3448                 NULL
3449         };
3450         const char *const sched_subcommands[] = { "record", "latency", "map",
3451                                                   "replay", "script",
3452                                                   "timehist", NULL };
3453         const char *sched_usage[] = {
3454                 NULL,
3455                 NULL
3456         };
3457         struct trace_sched_handler lat_ops  = {
3458                 .wakeup_event       = latency_wakeup_event,
3459                 .switch_event       = latency_switch_event,
3460                 .runtime_event      = latency_runtime_event,
3461                 .migrate_task_event = latency_migrate_task_event,
3462         };
3463         struct trace_sched_handler map_ops  = {
3464                 .switch_event       = map_switch_event,
3465         };
3466         struct trace_sched_handler replay_ops  = {
3467                 .wakeup_event       = replay_wakeup_event,
3468                 .switch_event       = replay_switch_event,
3469                 .fork_event         = replay_fork_event,
3470         };
3471         unsigned int i;
3472
3473         for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
3474                 sched.curr_pid[i] = -1;
3475
3476         argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
3477                                         sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3478         if (!argc)
3479                 usage_with_options(sched_usage, sched_options);
3480
3481         /*
3482          * Aliased to 'perf script' for now:
3483          */
3484         if (!strcmp(argv[0], "script"))
3485                 return cmd_script(argc, argv);
3486
3487         if (!strncmp(argv[0], "rec", 3)) {
3488                 return __cmd_record(argc, argv);
3489         } else if (!strncmp(argv[0], "lat", 3)) {
3490                 sched.tp_handler = &lat_ops;
3491                 if (argc > 1) {
3492                         argc = parse_options(argc, argv, latency_options, latency_usage, 0);
3493                         if (argc)
3494                                 usage_with_options(latency_usage, latency_options);
3495                 }
3496                 setup_sorting(&sched, latency_options, latency_usage);
3497                 return perf_sched__lat(&sched);
3498         } else if (!strcmp(argv[0], "map")) {
3499                 if (argc) {
3500                         argc = parse_options(argc, argv, map_options, map_usage, 0);
3501                         if (argc)
3502                                 usage_with_options(map_usage, map_options);
3503                 }
3504                 sched.tp_handler = &map_ops;
3505                 setup_sorting(&sched, latency_options, latency_usage);
3506                 return perf_sched__map(&sched);
3507         } else if (!strncmp(argv[0], "rep", 3)) {
3508                 sched.tp_handler = &replay_ops;
3509                 if (argc) {
3510                         argc = parse_options(argc, argv, replay_options, replay_usage, 0);
3511                         if (argc)
3512                                 usage_with_options(replay_usage, replay_options);
3513                 }
3514                 return perf_sched__replay(&sched);
3515         } else if (!strcmp(argv[0], "timehist")) {
3516                 if (argc) {
3517                         argc = parse_options(argc, argv, timehist_options,
3518                                              timehist_usage, 0);
3519                         if (argc)
3520                                 usage_with_options(timehist_usage, timehist_options);
3521                 }
3522                 if ((sched.show_wakeups || sched.show_next) &&
3523                     sched.summary_only) {
3524                         pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
3525                         parse_options_usage(timehist_usage, timehist_options, "s", true);
3526                         if (sched.show_wakeups)
3527                                 parse_options_usage(NULL, timehist_options, "w", true);
3528                         if (sched.show_next)
3529                                 parse_options_usage(NULL, timehist_options, "n", true);
3530                         return -EINVAL;
3531                 }
3532
3533                 return perf_sched__timehist(&sched);
3534         } else {
3535                 usage_with_options(sched_usage, sched_options);
3536         }
3537
3538         return 0;
3539 }