tools/perf/builtin-sched.c

   1 #include "builtin.h"
   2 #include "perf.h"
   3
   4 #include "util/util.h"
   5 #include "util/evlist.h"
   6 #include "util/cache.h"
   7 #include "util/evsel.h"
   8 #include "util/symbol.h"
   9 #include "util/thread.h"
  10 #include "util/header.h"
  11 #include "util/session.h"
  12 #include "util/tool.h"
  13 #include "util/cloexec.h"
  14 #include "util/thread_map.h"
  15 #include "util/color.h"
  16 #include "util/stat.h"
  17 #include "util/callchain.h"
  18
  19 #include <subcmd/parse-options.h>
  20 #include "util/trace-event.h"
  21
  22 #include "util/debug.h"
  23
  24 #include <linux/log2.h>
  25 #include <sys/prctl.h>
  26 #include <sys/resource.h>
  27
  28 #include <semaphore.h>
  29 #include <pthread.h>
  30 #include <math.h>
  31 #include <api/fs/fs.h>
  32 #include <linux/time64.h>
  33
  34 #define PR_SET_NAME             15               /* Set process name */
  35 #define MAX_CPUS                4096
  36 #define COMM_LEN                20
  37 #define SYM_LEN                 129
  38 #define MAX_PID                 1024000
  39
  40 struct sched_atom;
  41
  42 struct task_desc {
  43         unsigned long           nr;
  44         unsigned long           pid;
  45         char                    comm[COMM_LEN];
  46
  47         unsigned long           nr_events;
  48         unsigned long           curr_event;
  49         struct sched_atom       **atoms;
  50
  51         pthread_t               thread;
  52         sem_t                   sleep_sem;
  53
  54         sem_t                   ready_for_work;
  55         sem_t                   work_done_sem;
  56
  57         u64                     cpu_usage;
  58 };
  59
  60 enum sched_event_type {
  61         SCHED_EVENT_RUN,
  62         SCHED_EVENT_SLEEP,
  63         SCHED_EVENT_WAKEUP,
  64         SCHED_EVENT_MIGRATION,
  65 };
  66
  67 struct sched_atom {
  68         enum sched_event_type   type;
  69         int                     specific_wait;
  70         u64                     timestamp;
  71         u64                     duration;
  72         unsigned long           nr;
  73         sem_t                   *wait_sem;
  74         struct task_desc        *wakee;
  75 };
  76
  77 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
  78
  79 enum thread_state {
  80         THREAD_SLEEPING = 0,
  81         THREAD_WAIT_CPU,
  82         THREAD_SCHED_IN,
  83         THREAD_IGNORE
  84 };
  85
  86 struct work_atom {
  87         struct list_head        list;
  88         enum thread_state       state;
  89         u64                     sched_out_time;
  90         u64                     wake_up_time;
  91         u64                     sched_in_time;
  92         u64                     runtime;
  93 };
  94
  95 struct work_atoms {
  96         struct list_head        work_list;
  97         struct thread           *thread;
  98         struct rb_node          node;
  99         u64                     max_lat;
 100         u64                     max_lat_at;
 101         u64                     total_lat;
 102         u64                     nb_atoms;
 103         u64                     total_runtime;
 104         int                     num_merged;
 105 };
 106
 107 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
 108
 109 struct perf_sched;
 110
 111 struct trace_sched_handler {
 112         int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
 113                             struct perf_sample *sample, struct machine *machine);
 114
 115         int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
 116                              struct perf_sample *sample, struct machine *machine);
 117
 118         int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
 119                             struct perf_sample *sample, struct machine *machine);
 120
 121         /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
 122         int (*fork_event)(struct perf_sched *sched, union perf_event *event,
 123                           struct machine *machine);
 124
 125         int (*migrate_task_event)(struct perf_sched *sched,
 126                                   struct perf_evsel *evsel,
 127                                   struct perf_sample *sample,
 128                                   struct machine *machine);
 129 };
 130
 131 #define COLOR_PIDS PERF_COLOR_BLUE
 132 #define COLOR_CPUS PERF_COLOR_BG_RED
 133
 134 struct perf_sched_map {
 135         DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
 136         int                     *comp_cpus;
 137         bool                     comp;
 138         struct thread_map       *color_pids;
 139         const char              *color_pids_str;
 140         struct cpu_map          *color_cpus;
 141         const char              *color_cpus_str;
 142         struct cpu_map          *cpus;
 143         const char              *cpus_str;
 144 };
 145
 146 struct perf_sched {
 147         struct perf_tool tool;
 148         const char       *sort_order;
 149         unsigned long    nr_tasks;
 150         struct task_desc **pid_to_task;
 151         struct task_desc **tasks;
 152         const struct trace_sched_handler *tp_handler;
 153         pthread_mutex_t  start_work_mutex;
 154         pthread_mutex_t  work_done_wait_mutex;
 155         int              profile_cpu;
 156 /*
 157  * Track the current task - that way we can know whether there's any
 158  * weird events, such as a task being switched away that is not current.
 159  */
 160         int              max_cpu;
 161         u32              curr_pid[MAX_CPUS];
 162         struct thread    *curr_thread[MAX_CPUS];
 163         char             next_shortname1;
 164         char             next_shortname2;
 165         unsigned int     replay_repeat;
 166         unsigned long    nr_run_events;
 167         unsigned long    nr_sleep_events;
 168         unsigned long    nr_wakeup_events;
 169         unsigned long    nr_sleep_corrections;
 170         unsigned long    nr_run_events_optimized;
 171         unsigned long    targetless_wakeups;
 172         unsigned long    multitarget_wakeups;
 173         unsigned long    nr_runs;
 174         unsigned long    nr_timestamps;
 175         unsigned long    nr_unordered_timestamps;
 176         unsigned long    nr_context_switch_bugs;
 177         unsigned long    nr_events;
 178         unsigned long    nr_lost_chunks;
 179         unsigned long    nr_lost_events;
 180         u64              run_measurement_overhead;
 181         u64              sleep_measurement_overhead;
 182         u64              start_time;
 183         u64              cpu_usage;
 184         u64              runavg_cpu_usage;
 185         u64              parent_cpu_usage;
 186         u64              runavg_parent_cpu_usage;
 187         u64              sum_runtime;
 188         u64              sum_fluct;
 189         u64              run_avg;
 190         u64              all_runtime;
 191         u64              all_count;
 192         u64              cpu_last_switched[MAX_CPUS];
 193         struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
 194         struct list_head sort_list, cmp_pid;
 195         bool force;
 196         bool skip_merge;
 197         struct perf_sched_map map;
 198
 199         /* options for timehist command */
 200         bool            summary;
 201         bool            summary_only;
 202         bool            show_callchain;
 203         unsigned int    max_stack;
 204         bool            show_cpu_visual;
 205         bool            show_wakeups;
 206         u64             skipped_samples;
 207 };
 208
 209 /* per thread run time data */
 210 struct thread_runtime {
 211         u64 last_time;      /* time of previous sched in/out event */
 212         u64 dt_run;         /* run time */
 213         u64 dt_wait;        /* time between CPU access (off cpu) */
 214         u64 dt_delay;       /* time between wakeup and sched-in */
 215         u64 ready_to_run;   /* time of wakeup */
 216
 217         struct stats run_stats;
 218         u64 total_run_time;
 219 };
 220
 221 /* per event run time data */
 222 struct evsel_runtime {
 223         u64 *last_time; /* time this event was last seen per cpu */
 224         u32 ncpu;       /* highest cpu slot allocated */
 225 };
 226
 227 /* track idle times per cpu */
 228 static struct thread **idle_threads;
 229 static int idle_max_cpu;
 230 static char idle_comm[] = "<idle>";
 231
 232 static u64 get_nsecs(void)
 233 {
 234         struct timespec ts;
 235
 236         clock_gettime(CLOCK_MONOTONIC, &ts);
 237
 238         return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
 239 }
 240
 241 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
 242 {
 243         u64 T0 = get_nsecs(), T1;
 244
 245         do {
 246                 T1 = get_nsecs();
 247         } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
 248 }
 249
 250 static void sleep_nsecs(u64 nsecs)
 251 {
 252         struct timespec ts;
 253
 254         ts.tv_nsec = nsecs % 999999999;
 255         ts.tv_sec = nsecs / 999999999;
 256
 257         nanosleep(&ts, NULL);
 258 }
 259
 260 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
 261 {
 262         u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
 263         int i;
 264
 265         for (i = 0; i < 10; i++) {
 266                 T0 = get_nsecs();
 267                 burn_nsecs(sched, 0);
 268                 T1 = get_nsecs();
 269                 delta = T1-T0;
 270                 min_delta = min(min_delta, delta);
 271         }
 272         sched->run_measurement_overhead = min_delta;
 273
 274         printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
 275 }
 276
 277 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
 278 {
 279         u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
 280         int i;
 281
 282         for (i = 0; i < 10; i++) {
 283                 T0 = get_nsecs();
 284                 sleep_nsecs(10000);
 285                 T1 = get_nsecs();
 286                 delta = T1-T0;
 287                 min_delta = min(min_delta, delta);
 288         }
 289         min_delta -= 10000;
 290         sched->sleep_measurement_overhead = min_delta;
 291
 292         printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
 293 }
 294
 295 static struct sched_atom *
 296 get_new_event(struct task_desc *task, u64 timestamp)
 297 {
 298         struct sched_atom *event = zalloc(sizeof(*event));
 299         unsigned long idx = task->nr_events;
 300         size_t size;
 301
 302         event->timestamp = timestamp;
 303         event->nr = idx;
 304
 305         task->nr_events++;
 306         size = sizeof(struct sched_atom *) * task->nr_events;
 307         task->atoms = realloc(task->atoms, size);
 308         BUG_ON(!task->atoms);
 309
 310         task->atoms[idx] = event;
 311
 312         return event;
 313 }
 314
 315 static struct sched_atom *last_event(struct task_desc *task)
 316 {
 317         if (!task->nr_events)
 318                 return NULL;
 319
 320         return task->atoms[task->nr_events - 1];
 321 }
 322
 323 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
 324                                 u64 timestamp, u64 duration)
 325 {
 326         struct sched_atom *event, *curr_event = last_event(task);
 327
 328         /*
 329          * optimize an existing RUN event by merging this one
 330          * to it:
 331          */
 332         if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
 333                 sched->nr_run_events_optimized++;
 334                 curr_event->duration += duration;
 335                 return;
 336         }
 337
 338         event = get_new_event(task, timestamp);
 339
 340         event->type = SCHED_EVENT_RUN;
 341         event->duration = duration;
 342
 343         sched->nr_run_events++;
 344 }
 345
 346 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
 347                                    u64 timestamp, struct task_desc *wakee)
 348 {
 349         struct sched_atom *event, *wakee_event;
 350
 351         event = get_new_event(task, timestamp);
 352         event->type = SCHED_EVENT_WAKEUP;
 353         event->wakee = wakee;
 354
 355         wakee_event = last_event(wakee);
 356         if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
 357                 sched->targetless_wakeups++;
 358                 return;
 359         }
 360         if (wakee_event->wait_sem) {
 361                 sched->multitarget_wakeups++;
 362                 return;
 363         }
 364
 365         wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
 366         sem_init(wakee_event->wait_sem, 0, 0);
 367         wakee_event->specific_wait = 1;
 368         event->wait_sem = wakee_event->wait_sem;
 369
 370         sched->nr_wakeup_events++;
 371 }
 372
 373 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
 374                                   u64 timestamp, u64 task_state __maybe_unused)
 375 {
 376         struct sched_atom *event = get_new_event(task, timestamp);
 377
 378         event->type = SCHED_EVENT_SLEEP;
 379
 380         sched->nr_sleep_events++;
 381 }
 382
 383 static struct task_desc *register_pid(struct perf_sched *sched,
 384                                       unsigned long pid, const char *comm)
 385 {
 386         struct task_desc *task;
 387         static int pid_max;
 388
 389         if (sched->pid_to_task == NULL) {
 390                 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
 391                         pid_max = MAX_PID;
 392                 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
 393         }
 394         if (pid >= (unsigned long)pid_max) {
 395                 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
 396                         sizeof(struct task_desc *))) == NULL);
 397                 while (pid >= (unsigned long)pid_max)
 398                         sched->pid_to_task[pid_max++] = NULL;
 399         }
 400
 401         task = sched->pid_to_task[pid];
 402
 403         if (task)
 404                 return task;
 405
 406         task = zalloc(sizeof(*task));
 407         task->pid = pid;
 408         task->nr = sched->nr_tasks;
 409         strcpy(task->comm, comm);
 410         /*
 411          * every task starts in sleeping state - this gets ignored
 412          * if there's no wakeup pointing to this sleep state:
 413          */
 414         add_sched_event_sleep(sched, task, 0, 0);
 415
 416         sched->pid_to_task[pid] = task;
 417         sched->nr_tasks++;
 418         sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
 419         BUG_ON(!sched->tasks);
 420         sched->tasks[task->nr] = task;
 421
 422         if (verbose)
 423                 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
 424
 425         return task;
 426 }
 427
 428
 429 static void print_task_traces(struct perf_sched *sched)
 430 {
 431         struct task_desc *task;
 432         unsigned long i;
 433
 434         for (i = 0; i < sched->nr_tasks; i++) {
 435                 task = sched->tasks[i];
 436                 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
 437                         task->nr, task->comm, task->pid, task->nr_events);
 438         }
 439 }
 440
 441 static void add_cross_task_wakeups(struct perf_sched *sched)
 442 {
 443         struct task_desc *task1, *task2;
 444         unsigned long i, j;
 445
 446         for (i = 0; i < sched->nr_tasks; i++) {
 447                 task1 = sched->tasks[i];
 448                 j = i + 1;
 449                 if (j == sched->nr_tasks)
 450                         j = 0;
 451                 task2 = sched->tasks[j];
 452                 add_sched_event_wakeup(sched, task1, 0, task2);
 453         }
 454 }
 455
 456 static void perf_sched__process_event(struct perf_sched *sched,
 457                                       struct sched_atom *atom)
 458 {
 459         int ret = 0;
 460
 461         switch (atom->type) {
 462                 case SCHED_EVENT_RUN:
 463                         burn_nsecs(sched, atom->duration);
 464                         break;
 465                 case SCHED_EVENT_SLEEP:
 466                         if (atom->wait_sem)
 467                                 ret = sem_wait(atom->wait_sem);
 468                         BUG_ON(ret);
 469                         break;
 470                 case SCHED_EVENT_WAKEUP:
 471                         if (atom->wait_sem)
 472                                 ret = sem_post(atom->wait_sem);
 473                         BUG_ON(ret);
 474                         break;
 475                 case SCHED_EVENT_MIGRATION:
 476                         break;
 477                 default:
 478                         BUG_ON(1);
 479         }
 480 }
 481
 482 static u64 get_cpu_usage_nsec_parent(void)
 483 {
 484         struct rusage ru;
 485         u64 sum;
 486         int err;
 487
 488         err = getrusage(RUSAGE_SELF, &ru);
 489         BUG_ON(err);
 490
 491         sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
 492         sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
 493
 494         return sum;
 495 }
 496
 497 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
 498 {
 499         struct perf_event_attr attr;
 500         char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
 501         int fd;
 502         struct rlimit limit;
 503         bool need_privilege = false;
 504
 505         memset(&attr, 0, sizeof(attr));
 506
 507         attr.type = PERF_TYPE_SOFTWARE;
 508         attr.config = PERF_COUNT_SW_TASK_CLOCK;
 509
 510 force_again:
 511         fd = sys_perf_event_open(&attr, 0, -1, -1,
 512                                  perf_event_open_cloexec_flag());
 513
 514         if (fd < 0) {
 515                 if (errno == EMFILE) {
 516                         if (sched->force) {
 517                                 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
 518                                 limit.rlim_cur += sched->nr_tasks - cur_task;
 519                                 if (limit.rlim_cur > limit.rlim_max) {
 520                                         limit.rlim_max = limit.rlim_cur;
 521                                         need_privilege = true;
 522                                 }
 523                                 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
 524                                         if (need_privilege && errno == EPERM)
 525                                                 strcpy(info, "Need privilege\n");
 526                                 } else
 527                                         goto force_again;
 528                         } else
 529                                 strcpy(info, "Have a try with -f option\n");
 530                 }
 531                 pr_err("Error: sys_perf_event_open() syscall returned "
 532                        "with %d (%s)\n%s", fd,
 533                        str_error_r(errno, sbuf, sizeof(sbuf)), info);
 534                 exit(EXIT_FAILURE);
 535         }
 536         return fd;
 537 }
 538
 539 static u64 get_cpu_usage_nsec_self(int fd)
 540 {
 541         u64 runtime;
 542         int ret;
 543
 544         ret = read(fd, &runtime, sizeof(runtime));
 545         BUG_ON(ret != sizeof(runtime));
 546
 547         return runtime;
 548 }
 549
 550 struct sched_thread_parms {
 551         struct task_desc  *task;
 552         struct perf_sched *sched;
 553         int fd;
 554 };
 555
 556 static void *thread_func(void *ctx)
 557 {
 558         struct sched_thread_parms *parms = ctx;
 559         struct task_desc *this_task = parms->task;
 560         struct perf_sched *sched = parms->sched;
 561         u64 cpu_usage_0, cpu_usage_1;
 562         unsigned long i, ret;
 563         char comm2[22];
 564         int fd = parms->fd;
 565
 566         zfree(&parms);
 567
 568         sprintf(comm2, ":%s", this_task->comm);
 569         prctl(PR_SET_NAME, comm2);
 570         if (fd < 0)
 571                 return NULL;
 572 again:
 573         ret = sem_post(&this_task->ready_for_work);
 574         BUG_ON(ret);
 575         ret = pthread_mutex_lock(&sched->start_work_mutex);
 576         BUG_ON(ret);
 577         ret = pthread_mutex_unlock(&sched->start_work_mutex);
 578         BUG_ON(ret);
 579
 580         cpu_usage_0 = get_cpu_usage_nsec_self(fd);
 581
 582         for (i = 0; i < this_task->nr_events; i++) {
 583                 this_task->curr_event = i;
 584                 perf_sched__process_event(sched, this_task->atoms[i]);
 585         }
 586
 587         cpu_usage_1 = get_cpu_usage_nsec_self(fd);
 588         this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
 589         ret = sem_post(&this_task->work_done_sem);
 590         BUG_ON(ret);
 591
 592         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
 593         BUG_ON(ret);
 594         ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
 595         BUG_ON(ret);
 596
 597         goto again;
 598 }
 599
 600 static void create_tasks(struct perf_sched *sched)
 601 {
 602         struct task_desc *task;
 603         pthread_attr_t attr;
 604         unsigned long i;
 605         int err;
 606
 607         err = pthread_attr_init(&attr);
 608         BUG_ON(err);
 609         err = pthread_attr_setstacksize(&attr,
 610                         (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
 611         BUG_ON(err);
 612         err = pthread_mutex_lock(&sched->start_work_mutex);
 613         BUG_ON(err);
 614         err = pthread_mutex_lock(&sched->work_done_wait_mutex);
 615         BUG_ON(err);
 616         for (i = 0; i < sched->nr_tasks; i++) {
 617                 struct sched_thread_parms *parms = malloc(sizeof(*parms));
 618                 BUG_ON(parms == NULL);
 619                 parms->task = task = sched->tasks[i];
 620                 parms->sched = sched;
 621                 parms->fd = self_open_counters(sched, i);
 622                 sem_init(&task->sleep_sem, 0, 0);
 623                 sem_init(&task->ready_for_work, 0, 0);
 624                 sem_init(&task->work_done_sem, 0, 0);
 625                 task->curr_event = 0;
 626                 err = pthread_create(&task->thread, &attr, thread_func, parms);
 627                 BUG_ON(err);
 628         }
 629 }
 630
 631 static void wait_for_tasks(struct perf_sched *sched)
 632 {
 633         u64 cpu_usage_0, cpu_usage_1;
 634         struct task_desc *task;
 635         unsigned long i, ret;
 636
 637         sched->start_time = get_nsecs();
 638         sched->cpu_usage = 0;
 639         pthread_mutex_unlock(&sched->work_done_wait_mutex);
 640
 641         for (i = 0; i < sched->nr_tasks; i++) {
 642                 task = sched->tasks[i];
 643                 ret = sem_wait(&task->ready_for_work);
 644                 BUG_ON(ret);
 645                 sem_init(&task->ready_for_work, 0, 0);
 646         }
 647         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
 648         BUG_ON(ret);
 649
 650         cpu_usage_0 = get_cpu_usage_nsec_parent();
 651
 652         pthread_mutex_unlock(&sched->start_work_mutex);
 653
 654         for (i = 0; i < sched->nr_tasks; i++) {
 655                 task = sched->tasks[i];
 656                 ret = sem_wait(&task->work_done_sem);
 657                 BUG_ON(ret);
 658                 sem_init(&task->work_done_sem, 0, 0);
 659                 sched->cpu_usage += task->cpu_usage;
 660                 task->cpu_usage = 0;
 661         }
 662
 663         cpu_usage_1 = get_cpu_usage_nsec_parent();
 664         if (!sched->runavg_cpu_usage)
 665                 sched->runavg_cpu_usage = sched->cpu_usage;
 666         sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
 667
 668         sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
 669         if (!sched->runavg_parent_cpu_usage)
 670                 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
 671         sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
 672                                          sched->parent_cpu_usage)/sched->replay_repeat;
 673
 674         ret = pthread_mutex_lock(&sched->start_work_mutex);
 675         BUG_ON(ret);
 676
 677         for (i = 0; i < sched->nr_tasks; i++) {
 678                 task = sched->tasks[i];
 679                 sem_init(&task->sleep_sem, 0, 0);
 680                 task->curr_event = 0;
 681         }
 682 }
 683
 684 static void run_one_test(struct perf_sched *sched)
 685 {
 686         u64 T0, T1, delta, avg_delta, fluct;
 687
 688         T0 = get_nsecs();
 689         wait_for_tasks(sched);
 690         T1 = get_nsecs();
 691
 692         delta = T1 - T0;
 693         sched->sum_runtime += delta;
 694         sched->nr_runs++;
 695
 696         avg_delta = sched->sum_runtime / sched->nr_runs;
 697         if (delta < avg_delta)
 698                 fluct = avg_delta - delta;
 699         else
 700                 fluct = delta - avg_delta;
 701         sched->sum_fluct += fluct;
 702         if (!sched->run_avg)
 703                 sched->run_avg = delta;
 704         sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
 705
 706         printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
 707
 708         printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
 709
 710         printf("cpu: %0.2f / %0.2f",
 711                 (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
 712
 713 #if 0
 714         /*
 715          * rusage statistics done by the parent, these are less
 716          * accurate than the sched->sum_exec_runtime based statistics:
 717          */
 718         printf(" [%0.2f / %0.2f]",
 719                 (double)sched->parent_cpu_usage / NSEC_PER_MSEC,
 720                 (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
 721 #endif
 722
 723         printf("\n");
 724
 725         if (sched->nr_sleep_corrections)
 726                 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
 727         sched->nr_sleep_corrections = 0;
 728 }
 729
 730 static void test_calibrations(struct perf_sched *sched)
 731 {
 732         u64 T0, T1;
 733
 734         T0 = get_nsecs();
 735         burn_nsecs(sched, NSEC_PER_MSEC);
 736         T1 = get_nsecs();
 737
 738         printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
 739
 740         T0 = get_nsecs();
 741         sleep_nsecs(NSEC_PER_MSEC);
 742         T1 = get_nsecs();
 743
 744         printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
 745 }
 746
 747 static int
 748 replay_wakeup_event(struct perf_sched *sched,
 749                     struct perf_evsel *evsel, struct perf_sample *sample,
 750                     struct machine *machine __maybe_unused)
 751 {
 752         const char *comm = perf_evsel__strval(evsel, sample, "comm");
 753         const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
 754         struct task_desc *waker, *wakee;
 755
 756         if (verbose) {
 757                 printf("sched_wakeup event %p\n", evsel);
 758
 759                 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
 760         }
 761
 762         waker = register_pid(sched, sample->tid, "<unknown>");
 763         wakee = register_pid(sched, pid, comm);
 764
 765         add_sched_event_wakeup(sched, waker, sample->time, wakee);
 766         return 0;
 767 }
 768
 769 static int replay_switch_event(struct perf_sched *sched,
 770                                struct perf_evsel *evsel,
 771                                struct perf_sample *sample,
 772                                struct machine *machine __maybe_unused)
 773 {
 774         const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
 775                    *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
 776         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
 777                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
 778         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
 779         struct task_desc *prev, __maybe_unused *next;
 780         u64 timestamp0, timestamp = sample->time;
 781         int cpu = sample->cpu;
 782         s64 delta;
 783
 784         if (verbose)
 785                 printf("sched_switch event %p\n", evsel);
 786
 787         if (cpu >= MAX_CPUS || cpu < 0)
 788                 return 0;
 789
 790         timestamp0 = sched->cpu_last_switched[cpu];
 791         if (timestamp0)
 792                 delta = timestamp - timestamp0;
 793         else
 794                 delta = 0;
 795
 796         if (delta < 0) {
 797                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 798                 return -1;
 799         }
 800
 801         pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
 802                  prev_comm, prev_pid, next_comm, next_pid, delta);
 803
 804         prev = register_pid(sched, prev_pid, prev_comm);
 805         next = register_pid(sched, next_pid, next_comm);
 806
 807         sched->cpu_last_switched[cpu] = timestamp;
 808
 809         add_sched_event_run(sched, prev, timestamp, delta);
 810         add_sched_event_sleep(sched, prev, timestamp, prev_state);
 811
 812         return 0;
 813 }
 814
 815 static int replay_fork_event(struct perf_sched *sched,
 816                              union perf_event *event,
 817                              struct machine *machine)
 818 {
 819         struct thread *child, *parent;
 820
 821         child = machine__findnew_thread(machine, event->fork.pid,
 822                                         event->fork.tid);
 823         parent = machine__findnew_thread(machine, event->fork.ppid,
 824                                          event->fork.ptid);
 825
 826         if (child == NULL || parent == NULL) {
 827                 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
 828                                  child, parent);
 829                 goto out_put;
 830         }
 831
 832         if (verbose) {
 833                 printf("fork event\n");
 834                 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
 835                 printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
 836         }
 837
 838         register_pid(sched, parent->tid, thread__comm_str(parent));
 839         register_pid(sched, child->tid, thread__comm_str(child));
 840 out_put:
 841         thread__put(child);
 842         thread__put(parent);
 843         return 0;
 844 }
 845
 846 struct sort_dimension {
 847         const char              *name;
 848         sort_fn_t               cmp;
 849         struct list_head        list;
 850 };
 851
 852 static int
 853 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
 854 {
 855         struct sort_dimension *sort;
 856         int ret = 0;
 857
 858         BUG_ON(list_empty(list));
 859
 860         list_for_each_entry(sort, list, list) {
 861                 ret = sort->cmp(l, r);
 862                 if (ret)
 863                         return ret;
 864         }
 865
 866         return ret;
 867 }
 868
 869 static struct work_atoms *
 870 thread_atoms_search(struct rb_root *root, struct thread *thread,
 871                          struct list_head *sort_list)
 872 {
 873         struct rb_node *node = root->rb_node;
 874         struct work_atoms key = { .thread = thread };
 875
 876         while (node) {
 877                 struct work_atoms *atoms;
 878                 int cmp;
 879
 880                 atoms = container_of(node, struct work_atoms, node);
 881
 882                 cmp = thread_lat_cmp(sort_list, &key, atoms);
 883                 if (cmp > 0)
 884                         node = node->rb_left;
 885                 else if (cmp < 0)
 886                         node = node->rb_right;
 887                 else {
 888                         BUG_ON(thread != atoms->thread);
 889                         return atoms;
 890                 }
 891         }
 892         return NULL;
 893 }
 894
 895 static void
 896 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
 897                          struct list_head *sort_list)
 898 {
 899         struct rb_node **new = &(root->rb_node), *parent = NULL;
 900
 901         while (*new) {
 902                 struct work_atoms *this;
 903                 int cmp;
 904
 905                 this = container_of(*new, struct work_atoms, node);
 906                 parent = *new;
 907
 908                 cmp = thread_lat_cmp(sort_list, data, this);
 909
 910                 if (cmp > 0)
 911                         new = &((*new)->rb_left);
 912                 else
 913                         new = &((*new)->rb_right);
 914         }
 915
 916         rb_link_node(&data->node, parent, new);
 917         rb_insert_color(&data->node, root);
 918 }
 919
 920 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 921 {
 922         struct work_atoms *atoms = zalloc(sizeof(*atoms));
 923         if (!atoms) {
 924                 pr_err("No memory at %s\n", __func__);
 925                 return -1;
 926         }
 927
 928         atoms->thread = thread__get(thread);
 929         INIT_LIST_HEAD(&atoms->work_list);
 930         __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
 931         return 0;
 932 }
 933
 934 static char sched_out_state(u64 prev_state)
 935 {
 936         const char *str = TASK_STATE_TO_CHAR_STR;
 937
 938         return str[prev_state];
 939 }
 940
 941 static int
 942 add_sched_out_event(struct work_atoms *atoms,
 943                     char run_state,
 944                     u64 timestamp)
 945 {
 946         struct work_atom *atom = zalloc(sizeof(*atom));
 947         if (!atom) {
 948                 pr_err("Non memory at %s", __func__);
 949                 return -1;
 950         }
 951
 952         atom->sched_out_time = timestamp;
 953
 954         if (run_state == 'R') {
 955                 atom->state = THREAD_WAIT_CPU;
 956                 atom->wake_up_time = atom->sched_out_time;
 957         }
 958
 959         list_add_tail(&atom->list, &atoms->work_list);
 960         return 0;
 961 }
 962
 963 static void
 964 add_runtime_event(struct work_atoms *atoms, u64 delta,
 965                   u64 timestamp __maybe_unused)
 966 {
 967         struct work_atom *atom;
 968
 969         BUG_ON(list_empty(&atoms->work_list));
 970
 971         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 972
 973         atom->runtime += delta;
 974         atoms->total_runtime += delta;
 975 }
 976
 977 static void
 978 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
 979 {
 980         struct work_atom *atom;
 981         u64 delta;
 982
 983         if (list_empty(&atoms->work_list))
 984                 return;
 985
 986         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 987
 988         if (atom->state != THREAD_WAIT_CPU)
 989                 return;
 990
 991         if (timestamp < atom->wake_up_time) {
 992                 atom->state = THREAD_IGNORE;
 993                 return;
 994         }
 995
 996         atom->state = THREAD_SCHED_IN;
 997         atom->sched_in_time = timestamp;
 998
 999         delta = atom->sched_in_time - atom->wake_up_time;
1000         atoms->total_lat += delta;
1001         if (delta > atoms->max_lat) {
1002                 atoms->max_lat = delta;
1003                 atoms->max_lat_at = timestamp;
1004         }
1005         atoms->nb_atoms++;
1006 }
1007
1008 static int latency_switch_event(struct perf_sched *sched,
1009                                 struct perf_evsel *evsel,
1010                                 struct perf_sample *sample,
1011                                 struct machine *machine)
1012 {
1013         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1014                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1015         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
1016         struct work_atoms *out_events, *in_events;
1017         struct thread *sched_out, *sched_in;
1018         u64 timestamp0, timestamp = sample->time;
1019         int cpu = sample->cpu, err = -1;
1020         s64 delta;
1021
1022         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1023
1024         timestamp0 = sched->cpu_last_switched[cpu];
1025         sched->cpu_last_switched[cpu] = timestamp;
1026         if (timestamp0)
1027                 delta = timestamp - timestamp0;
1028         else
1029                 delta = 0;
1030
1031         if (delta < 0) {
1032                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1033                 return -1;
1034         }
1035
1036         sched_out = machine__findnew_thread(machine, -1, prev_pid);
1037         sched_in = machine__findnew_thread(machine, -1, next_pid);
1038         if (sched_out == NULL || sched_in == NULL)
1039                 goto out_put;
1040
1041         out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1042         if (!out_events) {
1043                 if (thread_atoms_insert(sched, sched_out))
1044                         goto out_put;
1045                 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1046                 if (!out_events) {
1047                         pr_err("out-event: Internal tree error");
1048                         goto out_put;
1049                 }
1050         }
1051         if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
1052                 return -1;
1053
1054         in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1055         if (!in_events) {
1056                 if (thread_atoms_insert(sched, sched_in))
1057                         goto out_put;
1058                 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1059                 if (!in_events) {
1060                         pr_err("in-event: Internal tree error");
1061                         goto out_put;
1062                 }
1063                 /*
1064                  * Take came in we have not heard about yet,
1065                  * add in an initial atom in runnable state:
1066                  */
1067                 if (add_sched_out_event(in_events, 'R', timestamp))
1068                         goto out_put;
1069         }
1070         add_sched_in_event(in_events, timestamp);
1071         err = 0;
1072 out_put:
1073         thread__put(sched_out);
1074         thread__put(sched_in);
1075         return err;
1076 }
1077
1078 static int latency_runtime_event(struct perf_sched *sched,
1079                                  struct perf_evsel *evsel,
1080                                  struct perf_sample *sample,
1081                                  struct machine *machine)
1082 {
1083         const u32 pid      = perf_evsel__intval(evsel, sample, "pid");
1084         const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1085         struct thread *thread = machine__findnew_thread(machine, -1, pid);
1086         struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1087         u64 timestamp = sample->time;
1088         int cpu = sample->cpu, err = -1;
1089
1090         if (thread == NULL)
1091                 return -1;
1092
1093         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1094         if (!atoms) {
1095                 if (thread_atoms_insert(sched, thread))
1096                         goto out_put;
1097                 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1098                 if (!atoms) {
1099                         pr_err("in-event: Internal tree error");
1100                         goto out_put;
1101                 }
1102                 if (add_sched_out_event(atoms, 'R', timestamp))
1103                         goto out_put;
1104         }
1105
1106         add_runtime_event(atoms, runtime, timestamp);
1107         err = 0;
1108 out_put:
1109         thread__put(thread);
1110         return err;
1111 }
1112
1113 static int latency_wakeup_event(struct perf_sched *sched,
1114                                 struct perf_evsel *evsel,
1115                                 struct perf_sample *sample,
1116                                 struct machine *machine)
1117 {
1118         const u32 pid     = perf_evsel__intval(evsel, sample, "pid");
1119         struct work_atoms *atoms;
1120         struct work_atom *atom;
1121         struct thread *wakee;
1122         u64 timestamp = sample->time;
1123         int err = -1;
1124
1125         wakee = machine__findnew_thread(machine, -1, pid);
1126         if (wakee == NULL)
1127                 return -1;
1128         atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1129         if (!atoms) {
1130                 if (thread_atoms_insert(sched, wakee))
1131                         goto out_put;
1132                 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1133                 if (!atoms) {
1134                         pr_err("wakeup-event: Internal tree error");
1135                         goto out_put;
1136                 }
1137                 if (add_sched_out_event(atoms, 'S', timestamp))
1138                         goto out_put;
1139         }
1140
1141         BUG_ON(list_empty(&atoms->work_list));
1142
1143         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1144
1145         /*
1146          * As we do not guarantee the wakeup event happens when
1147          * task is out of run queue, also may happen when task is
1148          * on run queue and wakeup only change ->state to TASK_RUNNING,
1149          * then we should not set the ->wake_up_time when wake up a
1150          * task which is on run queue.
1151          *
1152          * You WILL be missing events if you've recorded only
1153          * one CPU, or are only looking at only one, so don't
1154          * skip in this case.
1155          */
1156         if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1157                 goto out_ok;
1158
1159         sched->nr_timestamps++;
1160         if (atom->sched_out_time > timestamp) {
1161                 sched->nr_unordered_timestamps++;
1162                 goto out_ok;
1163         }
1164
1165         atom->state = THREAD_WAIT_CPU;
1166         atom->wake_up_time = timestamp;
1167 out_ok:
1168         err = 0;
1169 out_put:
1170         thread__put(wakee);
1171         return err;
1172 }
1173
1174 static int latency_migrate_task_event(struct perf_sched *sched,
1175                                       struct perf_evsel *evsel,
1176                                       struct perf_sample *sample,
1177                                       struct machine *machine)
1178 {
1179         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1180         u64 timestamp = sample->time;
1181         struct work_atoms *atoms;
1182         struct work_atom *atom;
1183         struct thread *migrant;
1184         int err = -1;
1185
1186         /*
1187          * Only need to worry about migration when profiling one CPU.
1188          */
1189         if (sched->profile_cpu == -1)
1190                 return 0;
1191
1192         migrant = machine__findnew_thread(machine, -1, pid);
1193         if (migrant == NULL)
1194                 return -1;
1195         atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1196         if (!atoms) {
1197                 if (thread_atoms_insert(sched, migrant))
1198                         goto out_put;
1199                 register_pid(sched, migrant->tid, thread__comm_str(migrant));
1200                 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1201                 if (!atoms) {
1202                         pr_err("migration-event: Internal tree error");
1203                         goto out_put;
1204                 }
1205                 if (add_sched_out_event(atoms, 'R', timestamp))
1206                         goto out_put;
1207         }
1208
1209         BUG_ON(list_empty(&atoms->work_list));
1210
1211         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1212         atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1213
1214         sched->nr_timestamps++;
1215
1216         if (atom->sched_out_time > timestamp)
1217                 sched->nr_unordered_timestamps++;
1218         err = 0;
1219 out_put:
1220         thread__put(migrant);
1221         return err;
1222 }
1223
1224 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1225 {
1226         int i;
1227         int ret;
1228         u64 avg;
1229         char max_lat_at[32];
1230
1231         if (!work_list->nb_atoms)
1232                 return;
1233         /*
1234          * Ignore idle threads:
1235          */
1236         if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1237                 return;
1238
1239         sched->all_runtime += work_list->total_runtime;
1240         sched->all_count   += work_list->nb_atoms;
1241
1242         if (work_list->num_merged > 1)
1243                 ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
1244         else
1245                 ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1246
1247         for (i = 0; i < 24 - ret; i++)
1248                 printf(" ");
1249
1250         avg = work_list->total_lat / work_list->nb_atoms;
1251         timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at));
1252
1253         printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13s s\n",
1254               (double)work_list->total_runtime / NSEC_PER_MSEC,
1255                  work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
1256                  (double)work_list->max_lat / NSEC_PER_MSEC,
1257                  max_lat_at);
1258 }
1259
1260 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1261 {
1262         if (l->thread == r->thread)
1263                 return 0;
1264         if (l->thread->tid < r->thread->tid)
1265                 return -1;
1266         if (l->thread->tid > r->thread->tid)
1267                 return 1;
1268         return (int)(l->thread - r->thread);
1269 }
1270
1271 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1272 {
1273         u64 avgl, avgr;
1274
1275         if (!l->nb_atoms)
1276                 return -1;
1277
1278         if (!r->nb_atoms)
1279                 return 1;
1280
1281         avgl = l->total_lat / l->nb_atoms;
1282         avgr = r->total_lat / r->nb_atoms;
1283
1284         if (avgl < avgr)
1285                 return -1;
1286         if (avgl > avgr)
1287                 return 1;
1288
1289         return 0;
1290 }
1291
1292 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1293 {
1294         if (l->max_lat < r->max_lat)
1295                 return -1;
1296         if (l->max_lat > r->max_lat)
1297                 return 1;
1298
1299         return 0;
1300 }
1301
1302 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1303 {
1304         if (l->nb_atoms < r->nb_atoms)
1305                 return -1;
1306         if (l->nb_atoms > r->nb_atoms)
1307                 return 1;
1308
1309         return 0;
1310 }
1311
1312 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1313 {
1314         if (l->total_runtime < r->total_runtime)
1315                 return -1;
1316         if (l->total_runtime > r->total_runtime)
1317                 return 1;
1318
1319         return 0;
1320 }
1321
1322 static int sort_dimension__add(const char *tok, struct list_head *list)
1323 {
1324         size_t i;
1325         static struct sort_dimension avg_sort_dimension = {
1326                 .name = "avg",
1327                 .cmp  = avg_cmp,
1328         };
1329         static struct sort_dimension max_sort_dimension = {
1330                 .name = "max",
1331                 .cmp  = max_cmp,
1332         };
1333         static struct sort_dimension pid_sort_dimension = {
1334                 .name = "pid",
1335                 .cmp  = pid_cmp,
1336         };
1337         static struct sort_dimension runtime_sort_dimension = {
1338                 .name = "runtime",
1339                 .cmp  = runtime_cmp,
1340         };
1341         static struct sort_dimension switch_sort_dimension = {
1342                 .name = "switch",
1343                 .cmp  = switch_cmp,
1344         };
1345         struct sort_dimension *available_sorts[] = {
1346                 &pid_sort_dimension,
1347                 &avg_sort_dimension,
1348                 &max_sort_dimension,
1349                 &switch_sort_dimension,
1350                 &runtime_sort_dimension,
1351         };
1352
1353         for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1354                 if (!strcmp(available_sorts[i]->name, tok)) {
1355                         list_add_tail(&available_sorts[i]->list, list);
1356
1357                         return 0;
1358                 }
1359         }
1360
1361         return -1;
1362 }
1363
1364 static void perf_sched__sort_lat(struct perf_sched *sched)
1365 {
1366         struct rb_node *node;
1367         struct rb_root *root = &sched->atom_root;
1368 again:
1369         for (;;) {
1370                 struct work_atoms *data;
1371                 node = rb_first(root);
1372                 if (!node)
1373                         break;
1374
1375                 rb_erase(node, root);
1376                 data = rb_entry(node, struct work_atoms, node);
1377                 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1378         }
1379         if (root == &sched->atom_root) {
1380                 root = &sched->merged_atom_root;
1381                 goto again;
1382         }
1383 }
1384
1385 static int process_sched_wakeup_event(struct perf_tool *tool,
1386                                       struct perf_evsel *evsel,
1387                                       struct perf_sample *sample,
1388                                       struct machine *machine)
1389 {
1390         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1391
1392         if (sched->tp_handler->wakeup_event)
1393                 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1394
1395         return 0;
1396 }
1397
/*
 * Overlay used to keep a boolean "color" flag in a thread's private
 * pointer: written through thread__set_priv() in map__findnew_thread()
 * and read back through thread__priv() in thread__has_color().
 */
union map_priv {
	void	*ptr;	/* the value as stored in / fetched from the thread */
	bool	 color;	/* the same storage viewed as the highlight flag */
};
1402
1403 static bool thread__has_color(struct thread *thread)
1404 {
1405         union map_priv priv = {
1406                 .ptr = thread__priv(thread),
1407         };
1408
1409         return priv.color;
1410 }
1411
1412 static struct thread*
1413 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1414 {
1415         struct thread *thread = machine__findnew_thread(machine, pid, tid);
1416         union map_priv priv = {
1417                 .color = false,
1418         };
1419
1420         if (!sched->map.color_pids || !thread || thread__priv(thread))
1421                 return thread;
1422
1423         if (thread_map__has(sched->map.color_pids, tid))
1424                 priv.color = true;
1425
1426         thread__set_priv(thread, priv.ptr);
1427         return thread;
1428 }
1429
1430 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1431                             struct perf_sample *sample, struct machine *machine)
1432 {
1433         const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1434         struct thread *sched_in;
1435         int new_shortname;
1436         u64 timestamp0, timestamp = sample->time;
1437         s64 delta;
1438         int i, this_cpu = sample->cpu;
1439         int cpus_nr;
1440         bool new_cpu = false;
1441         const char *color = PERF_COLOR_NORMAL;
1442         char stimestamp[32];
1443
1444         BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1445
1446         if (this_cpu > sched->max_cpu)
1447                 sched->max_cpu = this_cpu;
1448
1449         if (sched->map.comp) {
1450                 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1451                 if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
1452                         sched->map.comp_cpus[cpus_nr++] = this_cpu;
1453                         new_cpu = true;
1454                 }
1455         } else
1456                 cpus_nr = sched->max_cpu;
1457
1458         timestamp0 = sched->cpu_last_switched[this_cpu];
1459         sched->cpu_last_switched[this_cpu] = timestamp;
1460         if (timestamp0)
1461                 delta = timestamp - timestamp0;
1462         else
1463                 delta = 0;
1464
1465         if (delta < 0) {
1466                 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1467                 return -1;
1468         }
1469
1470         sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1471         if (sched_in == NULL)
1472                 return -1;
1473
1474         sched->curr_thread[this_cpu] = thread__get(sched_in);
1475
1476         printf("  ");
1477
1478         new_shortname = 0;
1479         if (!sched_in->shortname[0]) {
1480                 if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1481                         /*
1482                          * Don't allocate a letter-number for swapper:0
1483                          * as a shortname. Instead, we use '.' for it.
1484                          */
1485                         sched_in->shortname[0] = '.';
1486                         sched_in->shortname[1] = ' ';
1487                 } else {
1488                         sched_in->shortname[0] = sched->next_shortname1;
1489                         sched_in->shortname[1] = sched->next_shortname2;
1490
1491                         if (sched->next_shortname1 < 'Z') {
1492                                 sched->next_shortname1++;
1493                         } else {
1494                                 sched->next_shortname1 = 'A';
1495                                 if (sched->next_shortname2 < '9')
1496                                         sched->next_shortname2++;
1497                                 else
1498                                         sched->next_shortname2 = '0';
1499                         }
1500                 }
1501                 new_shortname = 1;
1502         }
1503
1504         for (i = 0; i < cpus_nr; i++) {
1505                 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
1506                 struct thread *curr_thread = sched->curr_thread[cpu];
1507                 const char *pid_color = color;
1508                 const char *cpu_color = color;
1509
1510                 if (curr_thread && thread__has_color(curr_thread))
1511                         pid_color = COLOR_PIDS;
1512
1513                 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
1514                         continue;
1515
1516                 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
1517                         cpu_color = COLOR_CPUS;
1518
1519                 if (cpu != this_cpu)
1520                         color_fprintf(stdout, color, " ");
1521                 else
1522                         color_fprintf(stdout, cpu_color, "*");
1523
1524                 if (sched->curr_thread[cpu])
1525                         color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
1526                 else
1527                         color_fprintf(stdout, color, "   ");
1528         }
1529
1530         if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
1531                 goto out;
1532
1533         timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
1534         color_fprintf(stdout, color, "  %12s secs ", stimestamp);
1535         if (new_shortname || (verbose && sched_in->tid)) {
1536                 const char *pid_color = color;
1537
1538                 if (thread__has_color(sched_in))
1539                         pid_color = COLOR_PIDS;
1540
1541                 color_fprintf(stdout, pid_color, "%s => %s:%d",
1542                        sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1543         }
1544
1545         if (sched->map.comp && new_cpu)
1546                 color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1547
1548 out:
1549         color_fprintf(stdout, color, "\n");
1550
1551         thread__put(sched_in);
1552
1553         return 0;
1554 }
1555
1556 static int process_sched_switch_event(struct perf_tool *tool,
1557                                       struct perf_evsel *evsel,
1558                                       struct perf_sample *sample,
1559                                       struct machine *machine)
1560 {
1561         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1562         int this_cpu = sample->cpu, err = 0;
1563         u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1564             next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1565
1566         if (sched->curr_pid[this_cpu] != (u32)-1) {
1567                 /*
1568                  * Are we trying to switch away a PID that is
1569                  * not current?
1570                  */
1571                 if (sched->curr_pid[this_cpu] != prev_pid)
1572                         sched->nr_context_switch_bugs++;
1573         }
1574
1575         if (sched->tp_handler->switch_event)
1576                 err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1577
1578         sched->curr_pid[this_cpu] = next_pid;
1579         return err;
1580 }
1581
1582 static int process_sched_runtime_event(struct perf_tool *tool,
1583                                        struct perf_evsel *evsel,
1584                                        struct perf_sample *sample,
1585                                        struct machine *machine)
1586 {
1587         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1588
1589         if (sched->tp_handler->runtime_event)
1590                 return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1591
1592         return 0;
1593 }
1594
1595 static int perf_sched__process_fork_event(struct perf_tool *tool,
1596                                           union perf_event *event,
1597                                           struct perf_sample *sample,
1598                                           struct machine *machine)
1599 {
1600         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1601
1602         /* run the fork event through the perf machineruy */
1603         perf_event__process_fork(tool, event, sample, machine);
1604
1605         /* and then run additional processing needed for this command */
1606         if (sched->tp_handler->fork_event)
1607                 return sched->tp_handler->fork_event(sched, event, machine);
1608
1609         return 0;
1610 }
1611
1612 static int process_sched_migrate_task_event(struct perf_tool *tool,
1613                                             struct perf_evsel *evsel,
1614                                             struct perf_sample *sample,
1615                                             struct machine *machine)
1616 {
1617         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1618
1619         if (sched->tp_handler->migrate_task_event)
1620                 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1621
1622         return 0;
1623 }
1624
/*
 * Signature of a per-tracepoint sample handler.  Handlers of this type
 * are stored in evsel->handler and invoked by
 * perf_sched__process_tracepoint_sample().
 */
typedef int (*tracepoint_handler)(struct perf_tool *tool,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
1629
1630 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1631                                                  union perf_event *event __maybe_unused,
1632                                                  struct perf_sample *sample,
1633                                                  struct perf_evsel *evsel,
1634                                                  struct machine *machine)
1635 {
1636         int err = 0;
1637
1638         if (evsel->handler != NULL) {
1639                 tracepoint_handler f = evsel->handler;
1640                 err = f(tool, evsel, sample, machine);
1641         }
1642
1643         return err;
1644 }
1645
1646 static int perf_sched__read_events(struct perf_sched *sched)
1647 {
1648         const struct perf_evsel_str_handler handlers[] = {
1649                 { "sched:sched_switch",       process_sched_switch_event, },
1650                 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1651                 { "sched:sched_wakeup",       process_sched_wakeup_event, },
1652                 { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1653                 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1654         };
1655         struct perf_session *session;
1656         struct perf_data_file file = {
1657                 .path = input_name,
1658                 .mode = PERF_DATA_MODE_READ,
1659                 .force = sched->force,
1660         };
1661         int rc = -1;
1662
1663         session = perf_session__new(&file, false, &sched->tool);
1664         if (session == NULL) {
1665                 pr_debug("No Memory for session\n");
1666                 return -1;
1667         }
1668
1669         symbol__init(&session->header.env);
1670
1671         if (perf_session__set_tracepoints_handlers(session, handlers))
1672                 goto out_delete;
1673
1674         if (perf_session__has_traces(session, "record -R")) {
1675                 int err = perf_session__process_events(session);
1676                 if (err) {
1677                         pr_err("Failed to process events, error %d", err);
1678                         goto out_delete;
1679                 }
1680
1681                 sched->nr_events      = session->evlist->stats.nr_events[0];
1682                 sched->nr_lost_events = session->evlist->stats.total_lost;
1683                 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1684         }
1685
1686         rc = 0;
1687 out_delete:
1688         perf_session__delete(session);
1689         return rc;
1690 }
1691
1692 /*
1693  * scheduling times are printed as msec.usec
1694  */
1695 static inline void print_sched_time(unsigned long long nsecs, int width)
1696 {
1697         unsigned long msecs;
1698         unsigned long usecs;
1699
1700         msecs  = nsecs / NSEC_PER_MSEC;
1701         nsecs -= msecs * NSEC_PER_MSEC;
1702         usecs  = nsecs / NSEC_PER_USEC;
1703         printf("%*lu.%03lu ", width, msecs, usecs);
1704 }
1705
1706 /*
1707  * returns runtime data for event, allocating memory for it the
1708  * first time it is used.
1709  */
1710 static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel)
1711 {
1712         struct evsel_runtime *r = evsel->priv;
1713
1714         if (r == NULL) {
1715                 r = zalloc(sizeof(struct evsel_runtime));
1716                 evsel->priv = r;
1717         }
1718
1719         return r;
1720 }
1721
1722 /*
1723  * save last time event was seen per cpu
1724  */
1725 static void perf_evsel__save_time(struct perf_evsel *evsel,
1726                                   u64 timestamp, u32 cpu)
1727 {
1728         struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1729
1730         if (r == NULL)
1731                 return;
1732
1733         if ((cpu >= r->ncpu) || (r->last_time == NULL)) {
1734                 int i, n = __roundup_pow_of_two(cpu+1);
1735                 void *p = r->last_time;
1736
1737                 p = realloc(r->last_time, n * sizeof(u64));
1738                 if (!p)
1739                         return;
1740
1741                 r->last_time = p;
1742                 for (i = r->ncpu; i < n; ++i)
1743                         r->last_time[i] = (u64) 0;
1744
1745                 r->ncpu = n;
1746         }
1747
1748         r->last_time[cpu] = timestamp;
1749 }
1750
1751 /* returns last time this event was seen on the given cpu */
1752 static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
1753 {
1754         struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
1755
1756         if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu))
1757                 return 0;
1758
1759         return r->last_time[cpu];
1760 }
1761
1762 static int comm_width = 20;
1763
1764 static char *timehist_get_commstr(struct thread *thread)
1765 {
1766         static char str[32];
1767         const char *comm = thread__comm_str(thread);
1768         pid_t tid = thread->tid;
1769         pid_t pid = thread->pid_;
1770         int n;
1771
1772         if (pid == 0)
1773                 n = scnprintf(str, sizeof(str), "%s", comm);
1774
1775         else if (tid != pid)
1776                 n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
1777
1778         else
1779                 n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
1780
1781         if (n > comm_width)
1782                 comm_width = n;
1783
1784         return str;
1785 }
1786
1787 static void timehist_header(struct perf_sched *sched)
1788 {
1789         u32 ncpus = sched->max_cpu + 1;
1790         u32 i, j;
1791
1792         printf("%15s %6s ", "time", "cpu");
1793
1794         if (sched->show_cpu_visual) {
1795                 printf(" ");
1796                 for (i = 0, j = 0; i < ncpus; ++i) {
1797                         printf("%x", j++);
1798                         if (j > 15)
1799                                 j = 0;
1800                 }
1801                 printf(" ");
1802         }
1803
1804         printf(" %-20s  %9s  %9s  %9s",
1805                 "task name", "wait time", "sch delay", "run time");
1806
1807         printf("\n");
1808
1809         /*
1810          * units row
1811          */
1812         printf("%15s %-6s ", "", "");
1813
1814         if (sched->show_cpu_visual)
1815                 printf(" %*s ", ncpus, "");
1816
1817         printf(" %-20s  %9s  %9s  %9s\n", "[tid/pid]", "(msec)", "(msec)", "(msec)");
1818
1819         /*
1820          * separator
1821          */
1822         printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
1823
1824         if (sched->show_cpu_visual)
1825                 printf(" %.*s ", ncpus, graph_dotted_line);
1826
1827         printf(" %.20s  %.9s  %.9s  %.9s",
1828                 graph_dotted_line, graph_dotted_line, graph_dotted_line,
1829                 graph_dotted_line);
1830
1831         printf("\n");
1832 }
1833
1834 static void timehist_print_sample(struct perf_sched *sched,
1835                                   struct perf_sample *sample,
1836                                   struct addr_location *al,
1837                                   struct thread *thread)
1838 {
1839         struct thread_runtime *tr = thread__priv(thread);
1840         u32 max_cpus = sched->max_cpu + 1;
1841         char tstr[64];
1842
1843         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
1844         printf("%15s [%04d] ", tstr, sample->cpu);
1845
1846         if (sched->show_cpu_visual) {
1847                 u32 i;
1848                 char c;
1849
1850                 printf(" ");
1851                 for (i = 0; i < max_cpus; ++i) {
1852                         /* flag idle times with 'i'; others are sched events */
1853                         if (i == sample->cpu)
1854                                 c = (thread->tid == 0) ? 'i' : 's';
1855                         else
1856                                 c = ' ';
1857                         printf("%c", c);
1858                 }
1859                 printf(" ");
1860         }
1861
1862         printf(" %-*s ", comm_width, timehist_get_commstr(thread));
1863
1864         print_sched_time(tr->dt_wait, 6);
1865         print_sched_time(tr->dt_delay, 6);
1866         print_sched_time(tr->dt_run, 6);
1867
1868         if (sched->show_wakeups)
1869                 printf("  %-*s", comm_width, "");
1870
1871         if (thread->tid == 0)
1872                 goto out;
1873
1874         if (sched->show_callchain)
1875                 printf("  ");
1876
1877         sample__fprintf_sym(sample, al, 0,
1878                             EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
1879                             EVSEL__PRINT_CALLCHAIN_ARROW |
1880                             EVSEL__PRINT_SKIP_IGNORED,
1881                             &callchain_cursor, stdout);
1882
1883 out:
1884         printf("\n");
1885 }
1886
1887 /*
1888  * Explanation of delta-time stats:
1889  *
1890  *            t = time of current schedule out event
1891  *        tprev = time of previous sched out event
1892  *                also time of schedule-in event for current task
1893  *    last_time = time of last sched change event for current task
1894  *                (i.e, time process was last scheduled out)
1895  * ready_to_run = time of wakeup for current task
1896  *
1897  * -----|------------|------------|------------|------
1898  *    last         ready        tprev          t
1899  *    time         to run
1900  *
1901  *      |-------- dt_wait --------|
1902  *                   |- dt_delay -|-- dt_run --|
1903  *
1904  *   dt_run = run time of current task
1905  *  dt_wait = time between last schedule out event for task and tprev
1906  *            represents time spent off the cpu
1907  * dt_delay = time between wakeup and schedule-in of task
1908  */
1909
1910 static void timehist_update_runtime_stats(struct thread_runtime *r,
1911                                          u64 t, u64 tprev)
1912 {
1913         r->dt_delay   = 0;
1914         r->dt_wait    = 0;
1915         r->dt_run     = 0;
1916         if (tprev) {
1917                 r->dt_run = t - tprev;
1918                 if (r->ready_to_run) {
1919                         if (r->ready_to_run > tprev)
1920                                 pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
1921                         else
1922                                 r->dt_delay = tprev - r->ready_to_run;
1923                 }
1924
1925                 if (r->last_time > tprev)
1926                         pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
1927                 else if (r->last_time)
1928                         r->dt_wait = tprev - r->last_time;
1929         }
1930
1931         update_stats(&r->run_stats, r->dt_run);
1932         r->total_run_time += r->dt_run;
1933 }
1934
1935 static bool is_idle_sample(struct perf_sched *sched,
1936                            struct perf_sample *sample,
1937                            struct perf_evsel *evsel,
1938                            struct machine *machine)
1939 {
1940         struct thread *thread;
1941         struct callchain_cursor *cursor = &callchain_cursor;
1942
1943         /* pid 0 == swapper == idle task */
1944         if (sample->pid == 0)
1945                 return true;
1946
1947         if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) {
1948                 if (perf_evsel__intval(evsel, sample, "prev_pid") == 0)
1949                         return true;
1950         }
1951
1952         /* want main thread for process - has maps */
1953         thread = machine__findnew_thread(machine, sample->pid, sample->pid);
1954         if (thread == NULL) {
1955                 pr_debug("Failed to get thread for pid %d.\n", sample->pid);
1956                 return false;
1957         }
1958
1959         if (!symbol_conf.use_callchain || sample->callchain == NULL)
1960                 return false;
1961
1962         if (thread__resolve_callchain(thread, cursor, evsel, sample,
1963                                       NULL, NULL, sched->max_stack + 2) != 0) {
1964                 if (verbose)
1965                         error("Failed to resolve callchain. Skipping\n");
1966
1967                 return false;
1968         }
1969
1970         callchain_cursor_commit(cursor);
1971
1972         while (true) {
1973                 struct callchain_cursor_node *node;
1974                 struct symbol *sym;
1975
1976                 node = callchain_cursor_current(cursor);
1977                 if (node == NULL)
1978                         break;
1979
1980                 sym = node->sym;
1981                 if (sym && sym->name) {
1982                         if (!strcmp(sym->name, "schedule") ||
1983                             !strcmp(sym->name, "__schedule") ||
1984                             !strcmp(sym->name, "preempt_schedule"))
1985                                 sym->ignore = 1;
1986                 }
1987
1988                 callchain_cursor_advance(cursor);
1989         }
1990
1991         return false;
1992 }
1993
1994 /*
1995  * Track idle stats per cpu by maintaining a local thread
1996  * struct for the idle task on each cpu.
1997  */
1998 static int init_idle_threads(int ncpu)
1999 {
2000         int i;
2001
2002         idle_threads = zalloc(ncpu * sizeof(struct thread *));
2003         if (!idle_threads)
2004                 return -ENOMEM;
2005
2006         idle_max_cpu = ncpu - 1;
2007
2008         /* allocate the actual thread struct if needed */
2009         for (i = 0; i < ncpu; ++i) {
2010                 idle_threads[i] = thread__new(0, 0);
2011                 if (idle_threads[i] == NULL)
2012                         return -ENOMEM;
2013
2014                 thread__set_comm(idle_threads[i], idle_comm, 0);
2015         }
2016
2017         return 0;
2018 }
2019
2020 static void free_idle_threads(void)
2021 {
2022         int i;
2023
2024         if (idle_threads == NULL)
2025                 return;
2026
2027         for (i = 0; i <= idle_max_cpu; ++i) {
2028                 if ((idle_threads[i]))
2029                         thread__delete(idle_threads[i]);
2030         }
2031
2032         free(idle_threads);
2033 }
2034
2035 static struct thread *get_idle_thread(int cpu)
2036 {
2037         /*
2038          * expand/allocate array of pointers to local thread
2039          * structs if needed
2040          */
2041         if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) {
2042                 int i, j = __roundup_pow_of_two(cpu+1);
2043                 void *p;
2044
2045                 p = realloc(idle_threads, j * sizeof(struct thread *));
2046                 if (!p)
2047                         return NULL;
2048
2049                 idle_threads = (struct thread **) p;
2050                 i = idle_max_cpu ? idle_max_cpu + 1 : 0;
2051                 for (; i < j; ++i)
2052                         idle_threads[i] = NULL;
2053
2054                 idle_max_cpu = j;
2055         }
2056
2057         /* allocate a new thread struct if needed */
2058         if (idle_threads[cpu] == NULL) {
2059                 idle_threads[cpu] = thread__new(0, 0);
2060
2061                 if (idle_threads[cpu]) {
2062                         idle_threads[cpu]->tid = 0;
2063                         thread__set_comm(idle_threads[cpu], idle_comm, 0);
2064                 }
2065         }
2066
2067         return idle_threads[cpu];
2068 }
2069
2070 /*
2071  * handle runtime stats saved per thread
2072  */
2073 static struct thread_runtime *thread__init_runtime(struct thread *thread)
2074 {
2075         struct thread_runtime *r;
2076
2077         r = zalloc(sizeof(struct thread_runtime));
2078         if (!r)
2079                 return NULL;
2080
2081         init_stats(&r->run_stats);
2082         thread__set_priv(thread, r);
2083
2084         return r;
2085 }
2086
/*
 * Return the runtime data attached to @thread, creating it on first
 * use. Logs a debug message and returns NULL if it cannot be created.
 */
static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
        struct thread_runtime *rt = thread__priv(thread);

        if (rt != NULL)
                return rt;

        rt = thread__init_runtime(thread);
        if (rt == NULL)
                pr_debug("Failed to malloc memory for runtime data.\n");

        return rt;
}
2100
2101 static struct thread *timehist_get_thread(struct perf_sched *sched,
2102                                           struct perf_sample *sample,
2103                                           struct machine *machine,
2104                                           struct perf_evsel *evsel)
2105 {
2106         struct thread *thread;
2107
2108         if (is_idle_sample(sched, sample, evsel, machine)) {
2109                 thread = get_idle_thread(sample->cpu);
2110                 if (thread == NULL)
2111                         pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2112
2113         } else {
2114                 thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2115                 if (thread == NULL) {
2116                         pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2117                                  sample->tid);
2118                 }
2119         }
2120
2121         return thread;
2122 }
2123
2124 static bool timehist_skip_sample(struct perf_sched *sched,
2125                                  struct thread *thread)
2126 {
2127         bool rc = false;
2128
2129         if (thread__is_filtered(thread)) {
2130                 rc = true;
2131                 sched->skipped_samples++;
2132         }
2133
2134         return rc;
2135 }
2136
2137 static void timehist_print_wakeup_event(struct perf_sched *sched,
2138                                         struct perf_sample *sample,
2139                                         struct machine *machine,
2140                                         struct thread *awakened)
2141 {
2142         struct thread *thread;
2143         char tstr[64];
2144
2145         thread = machine__findnew_thread(machine, sample->pid, sample->tid);
2146         if (thread == NULL)
2147                 return;
2148
2149         /* show wakeup unless both awakee and awaker are filtered */
2150         if (timehist_skip_sample(sched, thread) &&
2151             timehist_skip_sample(sched, awakened)) {
2152                 return;
2153         }
2154
2155         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2156         printf("%15s [%04d] ", tstr, sample->cpu);
2157         if (sched->show_cpu_visual)
2158                 printf(" %*s ", sched->max_cpu + 1, "");
2159
2160         printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2161
2162         /* dt spacer */
2163         printf("  %9s  %9s  %9s ", "", "", "");
2164
2165         printf("awakened: %s", timehist_get_commstr(awakened));
2166
2167         printf("\n");
2168 }
2169
2170 static int timehist_sched_wakeup_event(struct perf_tool *tool,
2171                                        union perf_event *event __maybe_unused,
2172                                        struct perf_evsel *evsel,
2173                                        struct perf_sample *sample,
2174                                        struct machine *machine)
2175 {
2176         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2177         struct thread *thread;
2178         struct thread_runtime *tr = NULL;
2179         /* want pid of awakened task not pid in sample */
2180         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
2181
2182         thread = machine__findnew_thread(machine, 0, pid);
2183         if (thread == NULL)
2184                 return -1;
2185
2186         tr = thread__get_runtime(thread);
2187         if (tr == NULL)
2188                 return -1;
2189
2190         if (tr->ready_to_run == 0)
2191                 tr->ready_to_run = sample->time;
2192
2193         /* show wakeups if requested */
2194         if (sched->show_wakeups)
2195                 timehist_print_wakeup_event(sched, sample, machine, thread);
2196
2197         return 0;
2198 }
2199
2200 static int timehist_sched_change_event(struct perf_tool *tool,
2201                                        union perf_event *event,
2202                                        struct perf_evsel *evsel,
2203                                        struct perf_sample *sample,
2204                                        struct machine *machine)
2205 {
2206         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2207         struct addr_location al;
2208         struct thread *thread;
2209         struct thread_runtime *tr = NULL;
2210         u64 tprev;
2211         int rc = 0;
2212
2213         if (machine__resolve(machine, &al, sample) < 0) {
2214                 pr_err("problem processing %d event. skipping it\n",
2215                        event->header.type);
2216                 rc = -1;
2217                 goto out;
2218         }
2219
2220         thread = timehist_get_thread(sched, sample, machine, evsel);
2221         if (thread == NULL) {
2222                 rc = -1;
2223                 goto out;
2224         }
2225
2226         if (timehist_skip_sample(sched, thread))
2227                 goto out;
2228
2229         tr = thread__get_runtime(thread);
2230         if (tr == NULL) {
2231                 rc = -1;
2232                 goto out;
2233         }
2234
2235         tprev = perf_evsel__get_time(evsel, sample->cpu);
2236
2237         timehist_update_runtime_stats(tr, sample->time, tprev);
2238         if (!sched->summary_only)
2239                 timehist_print_sample(sched, sample, &al, thread);
2240
2241 out:
2242         if (tr) {
2243                 /* time of this sched_switch event becomes last time task seen */
2244                 tr->last_time = sample->time;
2245
2246                 /* sched out event for task so reset ready to run time */
2247                 tr->ready_to_run = 0;
2248         }
2249
2250         perf_evsel__save_time(evsel, sample->time, sample->cpu);
2251
2252         return rc;
2253 }
2254
2255 static int timehist_sched_switch_event(struct perf_tool *tool,
2256                              union perf_event *event,
2257                              struct perf_evsel *evsel,
2258                              struct perf_sample *sample,
2259                              struct machine *machine __maybe_unused)
2260 {
2261         return timehist_sched_change_event(tool, event, evsel, sample, machine);
2262 }
2263
2264 static int process_lost(struct perf_tool *tool __maybe_unused,
2265                         union perf_event *event,
2266                         struct perf_sample *sample,
2267                         struct machine *machine __maybe_unused)
2268 {
2269         char tstr[64];
2270
2271         timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
2272         printf("%15s ", tstr);
2273         printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
2274
2275         return 0;
2276 }
2277
2278
2279 static void print_thread_runtime(struct thread *t,
2280                                  struct thread_runtime *r)
2281 {
2282         double mean = avg_stats(&r->run_stats);
2283         float stddev;
2284
2285         printf("%*s   %5d  %9" PRIu64 " ",
2286                comm_width, timehist_get_commstr(t), t->ppid,
2287                (u64) r->run_stats.n);
2288
2289         print_sched_time(r->total_run_time, 8);
2290         stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
2291         print_sched_time(r->run_stats.min, 6);
2292         printf(" ");
2293         print_sched_time((u64) mean, 6);
2294         printf(" ");
2295         print_sched_time(r->run_stats.max, 6);
2296         printf("  ");
2297         printf("%5.2f", stddev);
2298         printf("\n");
2299 }
2300
2301 struct total_run_stats {
2302         u64  sched_count;
2303         u64  task_count;
2304         u64  total_run_time;
2305 };
2306
2307 static int __show_thread_runtime(struct thread *t, void *priv)
2308 {
2309         struct total_run_stats *stats = priv;
2310         struct thread_runtime *r;
2311
2312         if (thread__is_filtered(t))
2313                 return 0;
2314
2315         r = thread__priv(t);
2316         if (r && r->run_stats.n) {
2317                 stats->task_count++;
2318                 stats->sched_count += r->run_stats.n;
2319                 stats->total_run_time += r->total_run_time;
2320                 print_thread_runtime(t, r);
2321         }
2322
2323         return 0;
2324 }
2325
2326 static int show_thread_runtime(struct thread *t, void *priv)
2327 {
2328         if (t->dead)
2329                 return 0;
2330
2331         return __show_thread_runtime(t, priv);
2332 }
2333
2334 static int show_deadthread_runtime(struct thread *t, void *priv)
2335 {
2336         if (!t->dead)
2337                 return 0;
2338
2339         return __show_thread_runtime(t, priv);
2340 }
2341
2342 static void timehist_print_summary(struct perf_sched *sched,
2343                                    struct perf_session *session)
2344 {
2345         struct machine *m = &session->machines.host;
2346         struct total_run_stats totals;
2347         u64 task_count;
2348         struct thread *t;
2349         struct thread_runtime *r;
2350         int i;
2351
2352         memset(&totals, 0, sizeof(totals));
2353
2354         if (comm_width < 30)
2355                 comm_width = 30;
2356
2357         printf("\nRuntime summary\n");
2358         printf("%*s  parent   sched-in  ", comm_width, "comm");
2359         printf("   run-time    min-run     avg-run     max-run  stddev\n");
2360         printf("%*s            (count)  ", comm_width, "");
2361         printf("     (msec)     (msec)      (msec)      (msec)       %%\n");
2362         printf("%.105s\n", graph_dotted_line);
2363
2364         machine__for_each_thread(m, show_thread_runtime, &totals);
2365         task_count = totals.task_count;
2366         if (!task_count)
2367                 printf("<no still running tasks>\n");
2368
2369         printf("\nTerminated tasks:\n");
2370         machine__for_each_thread(m, show_deadthread_runtime, &totals);
2371         if (task_count == totals.task_count)
2372                 printf("<no terminated tasks>\n");
2373
2374         /* CPU idle stats not tracked when samples were skipped */
2375         if (sched->skipped_samples)
2376                 return;
2377
2378         printf("\nIdle stats:\n");
2379         for (i = 0; i <= idle_max_cpu; ++i) {
2380                 t = idle_threads[i];
2381                 if (!t)
2382                         continue;
2383
2384                 r = thread__priv(t);
2385                 if (r && r->run_stats.n) {
2386                         totals.sched_count += r->run_stats.n;
2387                         printf("    CPU %2d idle for ", i);
2388                         print_sched_time(r->total_run_time, 6);
2389                         printf(" msec\n");
2390                 } else
2391                         printf("    CPU %2d idle entire time window\n", i);
2392         }
2393
2394         printf("\n"
2395                "    Total number of unique tasks: %" PRIu64 "\n"
2396                "Total number of context switches: %" PRIu64 "\n"
2397                "           Total run time (msec): ",
2398                totals.task_count, totals.sched_count);
2399
2400         print_sched_time(totals.total_run_time, 2);
2401         printf("\n");
2402 }
2403
/*
 * Signature of the per-event handlers that
 * perf_timehist__process_sample() invokes through evsel->handler.
 */
typedef int (*sched_handler)(struct perf_tool *tool, union perf_event *event,
                             struct perf_evsel *evsel,
                             struct perf_sample *sample,
                             struct machine *machine);
2409
2410 static int perf_timehist__process_sample(struct perf_tool *tool,
2411                                          union perf_event *event,
2412                                          struct perf_sample *sample,
2413                                          struct perf_evsel *evsel,
2414                                          struct machine *machine)
2415 {
2416         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
2417         int err = 0;
2418         int this_cpu = sample->cpu;
2419
2420         if (this_cpu > sched->max_cpu)
2421                 sched->max_cpu = this_cpu;
2422
2423         if (evsel->handler != NULL) {
2424                 sched_handler f = evsel->handler;
2425
2426                 err = f(tool, event, evsel, sample, machine);
2427         }
2428
2429         return err;
2430 }
2431
2432 static int timehist_check_attr(struct perf_sched *sched,
2433                                struct perf_evlist *evlist)
2434 {
2435         struct perf_evsel *evsel;
2436         struct evsel_runtime *er;
2437
2438         list_for_each_entry(evsel, &evlist->entries, node) {
2439                 er = perf_evsel__get_runtime(evsel);
2440                 if (er == NULL) {
2441                         pr_err("Failed to allocate memory for evsel runtime data\n");
2442                         return -1;
2443                 }
2444
2445                 if (sched->show_callchain &&
2446                     !(evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) {
2447                         pr_info("Samples do not have callchains.\n");
2448                         sched->show_callchain = 0;
2449                         symbol_conf.use_callchain = 0;
2450                 }
2451         }
2452
2453         return 0;
2454 }
2455
2456 static int perf_sched__timehist(struct perf_sched *sched)
2457 {
2458         const struct perf_evsel_str_handler handlers[] = {
2459                 { "sched:sched_switch",       timehist_sched_switch_event, },
2460                 { "sched:sched_wakeup",       timehist_sched_wakeup_event, },
2461                 { "sched:sched_wakeup_new",   timehist_sched_wakeup_event, },
2462         };
2463         struct perf_data_file file = {
2464                 .path = input_name,
2465                 .mode = PERF_DATA_MODE_READ,
2466         };
2467
2468         struct perf_session *session;
2469         struct perf_evlist *evlist;
2470         int err = -1;
2471
2472         /*
2473          * event handlers for timehist option
2474          */
2475         sched->tool.sample       = perf_timehist__process_sample;
2476         sched->tool.mmap         = perf_event__process_mmap;
2477         sched->tool.comm         = perf_event__process_comm;
2478         sched->tool.exit         = perf_event__process_exit;
2479         sched->tool.fork         = perf_event__process_fork;
2480         sched->tool.lost         = process_lost;
2481         sched->tool.attr         = perf_event__process_attr;
2482         sched->tool.tracing_data = perf_event__process_tracing_data;
2483         sched->tool.build_id     = perf_event__process_build_id;
2484
2485         sched->tool.ordered_events = true;
2486         sched->tool.ordering_requires_timestamps = true;
2487
2488         symbol_conf.use_callchain = sched->show_callchain;
2489
2490         session = perf_session__new(&file, false, &sched->tool);
2491         if (session == NULL)
2492                 return -ENOMEM;
2493
2494         evlist = session->evlist;
2495
2496         symbol__init(&session->header.env);
2497
2498         if (timehist_check_attr(sched, evlist) != 0)
2499                 goto out;
2500
2501         setup_pager();
2502
2503         /* setup per-evsel handlers */
2504         if (perf_session__set_tracepoints_handlers(session, handlers))
2505                 goto out;
2506
2507         if (!perf_session__has_traces(session, "record -R"))
2508                 goto out;
2509
2510         /* pre-allocate struct for per-CPU idle stats */
2511         sched->max_cpu = session->header.env.nr_cpus_online;
2512         if (sched->max_cpu == 0)
2513                 sched->max_cpu = 4;
2514         if (init_idle_threads(sched->max_cpu))
2515                 goto out;
2516
2517         /* summary_only implies summary option, but don't overwrite summary if set */
2518         if (sched->summary_only)
2519                 sched->summary = sched->summary_only;
2520
2521         if (!sched->summary_only)
2522                 timehist_header(sched);
2523
2524         err = perf_session__process_events(session);
2525         if (err) {
2526                 pr_err("Failed to process events, error %d", err);
2527                 goto out;
2528         }
2529
2530         sched->nr_events      = evlist->stats.nr_events[0];
2531         sched->nr_lost_events = evlist->stats.total_lost;
2532         sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
2533
2534         if (sched->summary)
2535                 timehist_print_summary(sched, session);
2536
2537 out:
2538         free_idle_threads();
2539         perf_session__delete(session);
2540
2541         return err;
2542 }
2543
2544
2545 static void print_bad_events(struct perf_sched *sched)
2546 {
2547         if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
2548                 printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
2549                         (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
2550                         sched->nr_unordered_timestamps, sched->nr_timestamps);
2551         }
2552         if (sched->nr_lost_events && sched->nr_events) {
2553                 printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
2554                         (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
2555                         sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
2556         }
2557         if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
2558                 printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
2559                         (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
2560                         sched->nr_context_switch_bugs, sched->nr_timestamps);
2561                 if (sched->nr_lost_events)
2562                         printf(" (due to lost events?)");
2563                 printf("\n");
2564         }
2565 }
2566
2567 static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
2568 {
2569         struct rb_node **new = &(root->rb_node), *parent = NULL;
2570         struct work_atoms *this;
2571         const char *comm = thread__comm_str(data->thread), *this_comm;
2572
2573         while (*new) {
2574                 int cmp;
2575
2576                 this = container_of(*new, struct work_atoms, node);
2577                 parent = *new;
2578
2579                 this_comm = thread__comm_str(this->thread);
2580                 cmp = strcmp(comm, this_comm);
2581                 if (cmp > 0) {
2582                         new = &((*new)->rb_left);
2583                 } else if (cmp < 0) {
2584                         new = &((*new)->rb_right);
2585                 } else {
2586                         this->num_merged++;
2587                         this->total_runtime += data->total_runtime;
2588                         this->nb_atoms += data->nb_atoms;
2589                         this->total_lat += data->total_lat;
2590                         list_splice(&data->work_list, &this->work_list);
2591                         if (this->max_lat < data->max_lat) {
2592                                 this->max_lat = data->max_lat;
2593                                 this->max_lat_at = data->max_lat_at;
2594                         }
2595                         zfree(&data);
2596                         return;
2597                 }
2598         }
2599
2600         data->num_merged++;
2601         rb_link_node(&data->node, parent, new);
2602         rb_insert_color(&data->node, root);
2603 }
2604
2605 static void perf_sched__merge_lat(struct perf_sched *sched)
2606 {
2607         struct work_atoms *data;
2608         struct rb_node *node;
2609
2610         if (sched->skip_merge)
2611                 return;
2612
2613         while ((node = rb_first(&sched->atom_root))) {
2614                 rb_erase(node, &sched->atom_root);
2615                 data = rb_entry(node, struct work_atoms, node);
2616                 __merge_work_atoms(&sched->merged_atom_root, data);
2617         }
2618 }
2619
2620 static int perf_sched__lat(struct perf_sched *sched)
2621 {
2622         struct rb_node *next;
2623
2624         setup_pager();
2625
2626         if (perf_sched__read_events(sched))
2627                 return -1;
2628
2629         perf_sched__merge_lat(sched);
2630         perf_sched__sort_lat(sched);
2631
2632         printf("\n -----------------------------------------------------------------------------------------------------------------\n");
2633         printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
2634         printf(" -----------------------------------------------------------------------------------------------------------------\n");
2635
2636         next = rb_first(&sched->sorted_atom_root);
2637
2638         while (next) {
2639                 struct work_atoms *work_list;
2640
2641                 work_list = rb_entry(next, struct work_atoms, node);
2642                 output_lat_thread(sched, work_list);
2643                 next = rb_next(next);
2644                 thread__zput(work_list->thread);
2645         }
2646
2647         printf(" -----------------------------------------------------------------------------------------------------------------\n");
2648         printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
2649                 (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
2650
2651         printf(" ---------------------------------------------------\n");
2652
2653         print_bad_events(sched);
2654         printf("\n");
2655
2656         return 0;
2657 }
2658
2659 static int setup_map_cpus(struct perf_sched *sched)
2660 {
2661         struct cpu_map *map;
2662
2663         sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
2664
2665         if (sched->map.comp) {
2666                 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
2667                 if (!sched->map.comp_cpus)
2668                         return -1;
2669         }
2670
2671         if (!sched->map.cpus_str)
2672                 return 0;
2673
2674         map = cpu_map__new(sched->map.cpus_str);
2675         if (!map) {
2676                 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
2677                 return -1;
2678         }
2679
2680         sched->map.cpus = map;
2681         return 0;
2682 }
2683
2684 static int setup_color_pids(struct perf_sched *sched)
2685 {
2686         struct thread_map *map;
2687
2688         if (!sched->map.color_pids_str)
2689                 return 0;
2690
2691         map = thread_map__new_by_tid_str(sched->map.color_pids_str);
2692         if (!map) {
2693                 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
2694                 return -1;
2695         }
2696
2697         sched->map.color_pids = map;
2698         return 0;
2699 }
2700
2701 static int setup_color_cpus(struct perf_sched *sched)
2702 {
2703         struct cpu_map *map;
2704
2705         if (!sched->map.color_cpus_str)
2706                 return 0;
2707
2708         map = cpu_map__new(sched->map.color_cpus_str);
2709         if (!map) {
2710                 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
2711                 return -1;
2712         }
2713
2714         sched->map.color_cpus = map;
2715         return 0;
2716 }
2717
/*
 * 'perf sched map': set up the cpu map, the highlighted pids and the
 * highlighted cpus, then read the events and print the data-quality notes.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int perf_sched__map(struct perf_sched *sched)
{
	/* Short-circuit keeps the original order and stops at the first failure. */
	if (setup_map_cpus(sched) ||
	    setup_color_pids(sched) ||
	    setup_color_cpus(sched))
		return -1;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	print_bad_events(sched);

	return 0;
}
2735
2736 static int perf_sched__replay(struct perf_sched *sched)
2737 {
2738         unsigned long i;
2739
2740         calibrate_run_measurement_overhead(sched);
2741         calibrate_sleep_measurement_overhead(sched);
2742
2743         test_calibrations(sched);
2744
2745         if (perf_sched__read_events(sched))
2746                 return -1;
2747
2748         printf("nr_run_events:        %ld\n", sched->nr_run_events);
2749         printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
2750         printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
2751
2752         if (sched->targetless_wakeups)
2753                 printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
2754         if (sched->multitarget_wakeups)
2755                 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
2756         if (sched->nr_run_events_optimized)
2757                 printf("run atoms optimized: %ld\n",
2758                         sched->nr_run_events_optimized);
2759
2760         print_task_traces(sched);
2761         add_cross_task_wakeups(sched);
2762
2763         create_tasks(sched);
2764         printf("------------------------------------------------------------\n");
2765         for (i = 0; i < sched->replay_repeat; i++)
2766                 run_one_test(sched);
2767
2768         return 0;
2769 }
2770
2771 static void setup_sorting(struct perf_sched *sched, const struct option *options,
2772                           const char * const usage_msg[])
2773 {
2774         char *tmp, *tok, *str = strdup(sched->sort_order);
2775
2776         for (tok = strtok_r(str, ", ", &tmp);
2777                         tok; tok = strtok_r(NULL, ", ", &tmp)) {
2778                 if (sort_dimension__add(tok, &sched->sort_list) < 0) {
2779                         usage_with_options_msg(usage_msg, options,
2780                                         "Unknown --sort key: `%s'", tok);
2781                 }
2782         }
2783
2784         free(str);
2785
2786         sort_dimension__add("pid", &sched->cmp_pid);
2787 }
2788
/*
 * 'perf sched record': run cmd_record() with a fixed set of scheduler
 * tracepoint arguments followed by the caller's own arguments.
 *
 * @argv[0] (the subcommand name) is dropped; @argv[1..argc-1] are appended
 * after the built-in arguments.  The built-in arguments are duplicated,
 * while the caller's arguments are passed through by pointer.
 *
 * Returns the result of cmd_record(), or -ENOMEM if the argument vector
 * cannot be built.
 */
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	/* One extra zeroed slot keeps the vector NULL-terminated. */
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++) {
		rec_argv[i] = strdup(record_args[i]);
		/*
		 * A failed strdup() would leave a NULL in the middle of the
		 * vector while the full count is still passed on; bail out.
		 */
		if (rec_argv[i] == NULL)
			goto out_free;
	}

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	/* The vector and its duplicated strings are handed to cmd_record(). */
	return cmd_record(i, rec_argv, NULL);

out_free:
	/* Only the entries below i were duplicated successfully. */
	while (i--)
		free((void *)rec_argv[i]);
	free(rec_argv);
	return -ENOMEM;
}
2826
2827 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
2828 {
2829         const char default_sort_order[] = "avg, max, switch, runtime";
2830         struct perf_sched sched = {
2831                 .tool = {
2832                         .sample          = perf_sched__process_tracepoint_sample,
2833                         .comm            = perf_event__process_comm,
2834                         .lost            = perf_event__process_lost,
2835                         .fork            = perf_sched__process_fork_event,
2836                         .ordered_events = true,
2837                 },
2838                 .cmp_pid              = LIST_HEAD_INIT(sched.cmp_pid),
2839                 .sort_list            = LIST_HEAD_INIT(sched.sort_list),
2840                 .start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
2841                 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
2842                 .sort_order           = default_sort_order,
2843                 .replay_repeat        = 10,
2844                 .profile_cpu          = -1,
2845                 .next_shortname1      = 'A',
2846                 .next_shortname2      = '0',
2847                 .skip_merge           = 0,
2848                 .show_callchain       = 1,
2849                 .max_stack            = 5,
2850         };
2851         const struct option sched_options[] = {
2852         OPT_STRING('i', "input", &input_name, "file",
2853                     "input file name"),
2854         OPT_INCR('v', "verbose", &verbose,
2855                     "be more verbose (show symbol address, etc)"),
2856         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
2857                     "dump raw trace in ASCII"),
2858         OPT_END()
2859         };
2860         const struct option latency_options[] = {
2861         OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
2862                    "sort by key(s): runtime, switch, avg, max"),
2863         OPT_INTEGER('C', "CPU", &sched.profile_cpu,
2864                     "CPU to profile on"),
2865         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
2866                     "dump raw trace in ASCII"),
2867         OPT_BOOLEAN('p', "pids", &sched.skip_merge,
2868                     "latency stats per pid instead of per comm"),
2869         OPT_PARENT(sched_options)
2870         };
2871         const struct option replay_options[] = {
2872         OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
2873                      "repeat the workload replay N times (-1: infinite)"),
2874         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
2875                     "dump raw trace in ASCII"),
2876         OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
2877         OPT_PARENT(sched_options)
2878         };
2879         const struct option map_options[] = {
2880         OPT_BOOLEAN(0, "compact", &sched.map.comp,
2881                     "map output in compact mode"),
2882         OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
2883                    "highlight given pids in map"),
2884         OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
2885                     "highlight given CPUs in map"),
2886         OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
2887                     "display given CPUs in map"),
2888         OPT_PARENT(sched_options)
2889         };
2890         const struct option timehist_options[] = {
2891         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
2892                    "file", "vmlinux pathname"),
2893         OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
2894                    "file", "kallsyms pathname"),
2895         OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
2896                     "Display call chains if present (default on)"),
2897         OPT_UINTEGER(0, "max-stack", &sched.max_stack,
2898                    "Maximum number of functions to display backtrace."),
2899         OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
2900                     "Look for files with symbols relative to this directory"),
2901         OPT_BOOLEAN('s', "summary", &sched.summary_only,
2902                     "Show only syscall summary with statistics"),
2903         OPT_BOOLEAN('S', "with-summary", &sched.summary,
2904                     "Show all syscalls and summary with statistics"),
2905         OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
2906         OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
2907         OPT_PARENT(sched_options)
2908         };
2909
2910         const char * const latency_usage[] = {
2911                 "perf sched latency [<options>]",
2912                 NULL
2913         };
2914         const char * const replay_usage[] = {
2915                 "perf sched replay [<options>]",
2916                 NULL
2917         };
2918         const char * const map_usage[] = {
2919                 "perf sched map [<options>]",
2920                 NULL
2921         };
2922         const char * const timehist_usage[] = {
2923                 "perf sched timehist [<options>]",
2924                 NULL
2925         };
2926         const char *const sched_subcommands[] = { "record", "latency", "map",
2927                                                   "replay", "script",
2928                                                   "timehist", NULL };
2929         const char *sched_usage[] = {
2930                 NULL,
2931                 NULL
2932         };
2933         struct trace_sched_handler lat_ops  = {
2934                 .wakeup_event       = latency_wakeup_event,
2935                 .switch_event       = latency_switch_event,
2936                 .runtime_event      = latency_runtime_event,
2937                 .migrate_task_event = latency_migrate_task_event,
2938         };
2939         struct trace_sched_handler map_ops  = {
2940                 .switch_event       = map_switch_event,
2941         };
2942         struct trace_sched_handler replay_ops  = {
2943                 .wakeup_event       = replay_wakeup_event,
2944                 .switch_event       = replay_switch_event,
2945                 .fork_event         = replay_fork_event,
2946         };
2947         unsigned int i;
2948
2949         for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
2950                 sched.curr_pid[i] = -1;
2951
2952         argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
2953                                         sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2954         if (!argc)
2955                 usage_with_options(sched_usage, sched_options);
2956
2957         /*
2958          * Aliased to 'perf script' for now:
2959          */
2960         if (!strcmp(argv[0], "script"))
2961                 return cmd_script(argc, argv, prefix);
2962
2963         if (!strncmp(argv[0], "rec", 3)) {
2964                 return __cmd_record(argc, argv);
2965         } else if (!strncmp(argv[0], "lat", 3)) {
2966                 sched.tp_handler = &lat_ops;
2967                 if (argc > 1) {
2968                         argc = parse_options(argc, argv, latency_options, latency_usage, 0);
2969                         if (argc)
2970                                 usage_with_options(latency_usage, latency_options);
2971                 }
2972                 setup_sorting(&sched, latency_options, latency_usage);
2973                 return perf_sched__lat(&sched);
2974         } else if (!strcmp(argv[0], "map")) {
2975                 if (argc) {
2976                         argc = parse_options(argc, argv, map_options, map_usage, 0);
2977                         if (argc)
2978                                 usage_with_options(map_usage, map_options);
2979                 }
2980                 sched.tp_handler = &map_ops;
2981                 setup_sorting(&sched, latency_options, latency_usage);
2982                 return perf_sched__map(&sched);
2983         } else if (!strncmp(argv[0], "rep", 3)) {
2984                 sched.tp_handler = &replay_ops;
2985                 if (argc) {
2986                         argc = parse_options(argc, argv, replay_options, replay_usage, 0);
2987                         if (argc)
2988                                 usage_with_options(replay_usage, replay_options);
2989                 }
2990                 return perf_sched__replay(&sched);
2991         } else if (!strcmp(argv[0], "timehist")) {
2992                 if (argc) {
2993                         argc = parse_options(argc, argv, timehist_options,
2994                                              timehist_usage, 0);
2995                         if (argc)
2996                                 usage_with_options(timehist_usage, timehist_options);
2997                 }
2998                 if (sched.show_wakeups && sched.summary_only) {
2999                         pr_err(" Error: -s and -w are mutually exclusive.\n");
3000                         parse_options_usage(timehist_usage, timehist_options, "s", true);
3001                         parse_options_usage(NULL, timehist_options, "w", true);
3002                         return -EINVAL;
3003                 }
3004
3005                 return perf_sched__timehist(&sched);
3006         } else {
3007                 usage_with_options(sched_usage, sched_options);
3008         }
3009
3010         return 0;
3011 }