kernel/events/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Performance events core code:
   4  *
   5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   9  */
  10
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/cpu.h>
  14 #include <linux/smp.h>
  15 #include <linux/idr.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/slab.h>
  19 #include <linux/hash.h>
  20 #include <linux/tick.h>
  21 #include <linux/sysfs.h>
  22 #include <linux/dcache.h>
  23 #include <linux/percpu.h>
  24 #include <linux/ptrace.h>
  25 #include <linux/reboot.h>
  26 #include <linux/vmstat.h>
  27 #include <linux/device.h>
  28 #include <linux/export.h>
  29 #include <linux/vmalloc.h>
  30 #include <linux/hardirq.h>
  31 #include <linux/rculist.h>
  32 #include <linux/uaccess.h>
  33 #include <linux/syscalls.h>
  34 #include <linux/anon_inodes.h>
  35 #include <linux/kernel_stat.h>
  36 #include <linux/cgroup.h>
  37 #include <linux/perf_event.h>
  38 #include <linux/trace_events.h>
  39 #include <linux/hw_breakpoint.h>
  40 #include <linux/mm_types.h>
  41 #include <linux/module.h>
  42 #include <linux/mman.h>
  43 #include <linux/compat.h>
  44 #include <linux/bpf.h>
  45 #include <linux/filter.h>
  46 #include <linux/namei.h>
  47 #include <linux/parser.h>
  48 #include <linux/sched/clock.h>
  49 #include <linux/sched/mm.h>
  50 #include <linux/proc_ns.h>
  51 #include <linux/mount.h>
  52
  53 #include "internal.h"
  54
  55 #include <asm/irq_regs.h>
  56
  57 typedef int (*remote_function_f)(void *);
  58
  59 struct remote_function_call {
  60         struct task_struct      *p;
  61         remote_function_f       func;
  62         void                    *info;
  63         int                     ret;
  64 };
  65
  66 static void remote_function(void *data)
  67 {
  68         struct remote_function_call *tfc = data;
  69         struct task_struct *p = tfc->p;
  70
  71         if (p) {
  72                 /* -EAGAIN */
  73                 if (task_cpu(p) != smp_processor_id())
  74                         return;
  75
  76                 /*
  77                  * Now that we're on right CPU with IRQs disabled, we can test
  78                  * if we hit the right task without races.
  79                  */
  80
  81                 tfc->ret = -ESRCH; /* No such (running) process */
  82                 if (p != current)
  83                         return;
  84         }
  85
  86         tfc->ret = tfc->func(tfc->info);
  87 }
  88
  89 /**
  90  * task_function_call - call a function on the cpu on which a task runs
  91  * @p:          the task to evaluate
  92  * @func:       the function to be called
  93  * @info:       the function call argument
  94  *
  95  * Calls the function @func when the task is currently running. This might
  96  * be on the current CPU, which just calls the function directly
  97  *
  98  * returns: @func return value, or
  99  *          -ESRCH  - when the process isn't running
 100  *          -EAGAIN - when the process moved away
 101  */
 102 static int
 103 task_function_call(struct task_struct *p, remote_function_f func, void *info)
 104 {
 105         struct remote_function_call data = {
 106                 .p      = p,
 107                 .func   = func,
 108                 .info   = info,
 109                 .ret    = -EAGAIN,
 110         };
 111         int ret;
 112
 113         do {
 114                 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 115                 if (!ret)
 116                         ret = data.ret;
 117         } while (ret == -EAGAIN);
 118
 119         return ret;
 120 }
 121
 122 /**
 123  * cpu_function_call - call a function on the cpu
 124  * @func:       the function to be called
 125  * @info:       the function call argument
 126  *
 127  * Calls the function @func on the remote cpu.
 128  *
 129  * returns: @func return value or -ENXIO when the cpu is offline
 130  */
 131 static int cpu_function_call(int cpu, remote_function_f func, void *info)
 132 {
 133         struct remote_function_call data = {
 134                 .p      = NULL,
 135                 .func   = func,
 136                 .info   = info,
 137                 .ret    = -ENXIO, /* No such CPU */
 138         };
 139
 140         smp_call_function_single(cpu, remote_function, &data, 1);
 141
 142         return data.ret;
 143 }
 144
 145 static inline struct perf_cpu_context *
 146 __get_cpu_context(struct perf_event_context *ctx)
 147 {
 148         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 149 }
 150
 151 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 152                           struct perf_event_context *ctx)
 153 {
 154         raw_spin_lock(&cpuctx->ctx.lock);
 155         if (ctx)
 156                 raw_spin_lock(&ctx->lock);
 157 }
 158
 159 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 160                             struct perf_event_context *ctx)
 161 {
 162         if (ctx)
 163                 raw_spin_unlock(&ctx->lock);
 164         raw_spin_unlock(&cpuctx->ctx.lock);
 165 }
 166
 167 #define TASK_TOMBSTONE ((void *)-1L)
 168
 169 static bool is_kernel_event(struct perf_event *event)
 170 {
 171         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 172 }
 173
 174 /*
 175  * On task ctx scheduling...
 176  *
 177  * When !ctx->nr_events a task context will not be scheduled. This means
 178  * we can disable the scheduler hooks (for performance) without leaving
 179  * pending task ctx state.
 180  *
 181  * This however results in two special cases:
 182  *
 183  *  - removing the last event from a task ctx; this is relatively straight
 184  *    forward and is done in __perf_remove_from_context.
 185  *
 186  *  - adding the first event to a task ctx; this is tricky because we cannot
 187  *    rely on ctx->is_active and therefore cannot use event_function_call().
 188  *    See perf_install_in_context().
 189  *
 190  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 191  */
 192
 193 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
 194                         struct perf_event_context *, void *);
 195
 196 struct event_function_struct {
 197         struct perf_event *event;
 198         event_f func;
 199         void *data;
 200 };
 201
 202 static int event_function(void *info)
 203 {
 204         struct event_function_struct *efs = info;
 205         struct perf_event *event = efs->event;
 206         struct perf_event_context *ctx = event->ctx;
 207         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 208         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 209         int ret = 0;
 210
 211         lockdep_assert_irqs_disabled();
 212
 213         perf_ctx_lock(cpuctx, task_ctx);
 214         /*
 215          * Since we do the IPI call without holding ctx->lock things can have
 216          * changed, double check we hit the task we set out to hit.
 217          */
 218         if (ctx->task) {
 219                 if (ctx->task != current) {
 220                         ret = -ESRCH;
 221                         goto unlock;
 222                 }
 223
 224                 /*
 225                  * We only use event_function_call() on established contexts,
 226                  * and event_function() is only ever called when active (or
 227                  * rather, we'll have bailed in task_function_call() or the
 228                  * above ctx->task != current test), therefore we must have
 229                  * ctx->is_active here.
 230                  */
 231                 WARN_ON_ONCE(!ctx->is_active);
 232                 /*
 233                  * And since we have ctx->is_active, cpuctx->task_ctx must
 234                  * match.
 235                  */
 236                 WARN_ON_ONCE(task_ctx != ctx);
 237         } else {
 238                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 239         }
 240
 241         efs->func(event, cpuctx, ctx, efs->data);
 242 unlock:
 243         perf_ctx_unlock(cpuctx, task_ctx);
 244
 245         return ret;
 246 }
 247
 248 static void event_function_call(struct perf_event *event, event_f func, void *data)
 249 {
 250         struct perf_event_context *ctx = event->ctx;
 251         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 252         struct event_function_struct efs = {
 253                 .event = event,
 254                 .func = func,
 255                 .data = data,
 256         };
 257
 258         if (!event->parent) {
 259                 /*
 260                  * If this is a !child event, we must hold ctx::mutex to
 261                  * stabilize the the event->ctx relation. See
 262                  * perf_event_ctx_lock().
 263                  */
 264                 lockdep_assert_held(&ctx->mutex);
 265         }
 266
 267         if (!task) {
 268                 cpu_function_call(event->cpu, event_function, &efs);
 269                 return;
 270         }
 271
 272         if (task == TASK_TOMBSTONE)
 273                 return;
 274
 275 again:
 276         if (!task_function_call(task, event_function, &efs))
 277                 return;
 278
 279         raw_spin_lock_irq(&ctx->lock);
 280         /*
 281          * Reload the task pointer, it might have been changed by
 282          * a concurrent perf_event_context_sched_out().
 283          */
 284         task = ctx->task;
 285         if (task == TASK_TOMBSTONE) {
 286                 raw_spin_unlock_irq(&ctx->lock);
 287                 return;
 288         }
 289         if (ctx->is_active) {
 290                 raw_spin_unlock_irq(&ctx->lock);
 291                 goto again;
 292         }
 293         func(event, NULL, ctx, data);
 294         raw_spin_unlock_irq(&ctx->lock);
 295 }
 296
 297 /*
 298  * Similar to event_function_call() + event_function(), but hard assumes IRQs
 299  * are already disabled and we're on the right CPU.
 300  */
 301 static void event_function_local(struct perf_event *event, event_f func, void *data)
 302 {
 303         struct perf_event_context *ctx = event->ctx;
 304         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 305         struct task_struct *task = READ_ONCE(ctx->task);
 306         struct perf_event_context *task_ctx = NULL;
 307
 308         lockdep_assert_irqs_disabled();
 309
 310         if (task) {
 311                 if (task == TASK_TOMBSTONE)
 312                         return;
 313
 314                 task_ctx = ctx;
 315         }
 316
 317         perf_ctx_lock(cpuctx, task_ctx);
 318
 319         task = ctx->task;
 320         if (task == TASK_TOMBSTONE)
 321                 goto unlock;
 322
 323         if (task) {
 324                 /*
 325                  * We must be either inactive or active and the right task,
 326                  * otherwise we're screwed, since we cannot IPI to somewhere
 327                  * else.
 328                  */
 329                 if (ctx->is_active) {
 330                         if (WARN_ON_ONCE(task != current))
 331                                 goto unlock;
 332
 333                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
 334                                 goto unlock;
 335                 }
 336         } else {
 337                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 338         }
 339
 340         func(event, cpuctx, ctx, data);
 341 unlock:
 342         perf_ctx_unlock(cpuctx, task_ctx);
 343 }
 344
 345 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 346                        PERF_FLAG_FD_OUTPUT  |\
 347                        PERF_FLAG_PID_CGROUP |\
 348                        PERF_FLAG_FD_CLOEXEC)
 349
 350 /*
 351  * branch priv levels that need permission checks
 352  */
 353 #define PERF_SAMPLE_BRANCH_PERM_PLM \
 354         (PERF_SAMPLE_BRANCH_KERNEL |\
 355          PERF_SAMPLE_BRANCH_HV)
 356
 357 enum event_type_t {
 358         EVENT_FLEXIBLE = 0x1,
 359         EVENT_PINNED = 0x2,
 360         EVENT_TIME = 0x4,
 361         /* see ctx_resched() for details */
 362         EVENT_CPU = 0x8,
 363         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 364 };
 365
 366 /*
 367  * perf_sched_events : >0 events exist
 368  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 369  */
 370
 371 static void perf_sched_delayed(struct work_struct *work);
 372 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
 373 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 374 static DEFINE_MUTEX(perf_sched_mutex);
 375 static atomic_t perf_sched_count;
 376
 377 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 378 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 379 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 380
 381 static atomic_t nr_mmap_events __read_mostly;
 382 static atomic_t nr_comm_events __read_mostly;
 383 static atomic_t nr_namespaces_events __read_mostly;
 384 static atomic_t nr_task_events __read_mostly;
 385 static atomic_t nr_freq_events __read_mostly;
 386 static atomic_t nr_switch_events __read_mostly;
 387 static atomic_t nr_ksymbol_events __read_mostly;
 388 static atomic_t nr_bpf_events __read_mostly;
 389
 390 static LIST_HEAD(pmus);
 391 static DEFINE_MUTEX(pmus_lock);
 392 static struct srcu_struct pmus_srcu;
 393 static cpumask_var_t perf_online_mask;
 394
 395 /*
 396  * perf event paranoia level:
 397  *  -1 - not paranoid at all
 398  *   0 - disallow raw tracepoint access for unpriv
 399  *   1 - disallow cpu events for unpriv
 400  *   2 - disallow kernel profiling for unpriv
 401  */
 402 int sysctl_perf_event_paranoid __read_mostly = 2;
 403
 404 /* Minimum for 512 kiB + 1 user control page */
 405 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 406
 407 /*
 408  * max perf event sample rate
 409  */
 410 #define DEFAULT_MAX_SAMPLE_RATE         100000
 411 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 412 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
 413
 414 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 415
 416 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 417 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 418
 419 static int perf_sample_allowed_ns __read_mostly =
 420         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 421
 422 static void update_perf_cpu_limits(void)
 423 {
 424         u64 tmp = perf_sample_period_ns;
 425
 426         tmp *= sysctl_perf_cpu_time_max_percent;
 427         tmp = div_u64(tmp, 100);
 428         if (!tmp)
 429                 tmp = 1;
 430
 431         WRITE_ONCE(perf_sample_allowed_ns, tmp);
 432 }
 433
 434 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 435
 436 int perf_proc_update_handler(struct ctl_table *table, int write,
 437                 void __user *buffer, size_t *lenp,
 438                 loff_t *ppos)
 439 {
 440         int ret;
 441         int perf_cpu = sysctl_perf_cpu_time_max_percent;
 442         /*
 443          * If throttling is disabled don't allow the write:
 444          */
 445         if (write && (perf_cpu == 100 || perf_cpu == 0))
 446                 return -EINVAL;
 447
 448         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 449         if (ret || !write)
 450                 return ret;
 451
 452         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 453         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 454         update_perf_cpu_limits();
 455
 456         return 0;
 457 }
 458
 459 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 460
 461 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 462                                 void __user *buffer, size_t *lenp,
 463                                 loff_t *ppos)
 464 {
 465         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 466
 467         if (ret || !write)
 468                 return ret;
 469
 470         if (sysctl_perf_cpu_time_max_percent == 100 ||
 471             sysctl_perf_cpu_time_max_percent == 0) {
 472                 printk(KERN_WARNING
 473                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 474                 WRITE_ONCE(perf_sample_allowed_ns, 0);
 475         } else {
 476                 update_perf_cpu_limits();
 477         }
 478
 479         return 0;
 480 }
 481
 482 /*
 483  * perf samples are done in some very critical code paths (NMIs).
 484  * If they take too much CPU time, the system can lock up and not
 485  * get any real work done.  This will drop the sample rate when
 486  * we detect that events are taking too long.
 487  */
 488 #define NR_ACCUMULATED_SAMPLES 128
 489 static DEFINE_PER_CPU(u64, running_sample_length);
 490
 491 static u64 __report_avg;
 492 static u64 __report_allowed;
 493
 494 static void perf_duration_warn(struct irq_work *w)
 495 {
 496         printk_ratelimited(KERN_INFO
 497                 "perf: interrupt took too long (%lld > %lld), lowering "
 498                 "kernel.perf_event_max_sample_rate to %d\n",
 499                 __report_avg, __report_allowed,
 500                 sysctl_perf_event_sample_rate);
 501 }
 502
 503 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 504
 505 void perf_sample_event_took(u64 sample_len_ns)
 506 {
 507         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
 508         u64 running_len;
 509         u64 avg_len;
 510         u32 max;
 511
 512         if (max_len == 0)
 513                 return;
 514
 515         /* Decay the counter by 1 average sample. */
 516         running_len = __this_cpu_read(running_sample_length);
 517         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
 518         running_len += sample_len_ns;
 519         __this_cpu_write(running_sample_length, running_len);
 520
 521         /*
 522          * Note: this will be biased artifically low until we have
 523          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 524          * from having to maintain a count.
 525          */
 526         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
 527         if (avg_len <= max_len)
 528                 return;
 529
 530         __report_avg = avg_len;
 531         __report_allowed = max_len;
 532
 533         /*
 534          * Compute a throttle threshold 25% below the current duration.
 535          */
 536         avg_len += avg_len / 4;
 537         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
 538         if (avg_len < max)
 539                 max /= (u32)avg_len;
 540         else
 541                 max = 1;
 542
 543         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
 544         WRITE_ONCE(max_samples_per_tick, max);
 545
 546         sysctl_perf_event_sample_rate = max * HZ;
 547         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 548
 549         if (!irq_work_queue(&perf_duration_work)) {
 550                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 551                              "kernel.perf_event_max_sample_rate to %d\n",
 552                              __report_avg, __report_allowed,
 553                              sysctl_perf_event_sample_rate);
 554         }
 555 }
 556
 557 static atomic64_t perf_event_id;
 558
 559 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 560                               enum event_type_t event_type);
 561
 562 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 563                              enum event_type_t event_type,
 564                              struct task_struct *task);
 565
 566 static void update_context_time(struct perf_event_context *ctx);
 567 static u64 perf_event_time(struct perf_event *event);
 568
 569 void __weak perf_event_print_debug(void)        { }
 570
 571 extern __weak const char *perf_pmu_name(void)
 572 {
 573         return "pmu";
 574 }
 575
 576 static inline u64 perf_clock(void)
 577 {
 578         return local_clock();
 579 }
 580
 581 static inline u64 perf_event_clock(struct perf_event *event)
 582 {
 583         return event->clock();
 584 }
 585
 586 /*
 587  * State based event timekeeping...
 588  *
 589  * The basic idea is to use event->state to determine which (if any) time
 590  * fields to increment with the current delta. This means we only need to
 591  * update timestamps when we change state or when they are explicitly requested
 592  * (read).
 593  *
 594  * Event groups make things a little more complicated, but not terribly so. The
 595  * rules for a group are that if the group leader is OFF the entire group is
  596  * OFF, irrespective of what the group member states are. This results in
 597  * __perf_effective_state().
 598  *
  599  * A further ramification is that when a group leader flips between OFF and
 600  * !OFF, we need to update all group member times.
 601  *
 602  *
 603  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 604  * need to make sure the relevant context time is updated before we try and
 605  * update our timestamps.
 606  */
 607
 608 static __always_inline enum perf_event_state
 609 __perf_effective_state(struct perf_event *event)
 610 {
 611         struct perf_event *leader = event->group_leader;
 612
 613         if (leader->state <= PERF_EVENT_STATE_OFF)
 614                 return leader->state;
 615
 616         return event->state;
 617 }
 618
 619 static __always_inline void
 620 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
 621 {
 622         enum perf_event_state state = __perf_effective_state(event);
 623         u64 delta = now - event->tstamp;
 624
 625         *enabled = event->total_time_enabled;
 626         if (state >= PERF_EVENT_STATE_INACTIVE)
 627                 *enabled += delta;
 628
 629         *running = event->total_time_running;
 630         if (state >= PERF_EVENT_STATE_ACTIVE)
 631                 *running += delta;
 632 }
 633
 634 static void perf_event_update_time(struct perf_event *event)
 635 {
 636         u64 now = perf_event_time(event);
 637
 638         __perf_update_times(event, now, &event->total_time_enabled,
 639                                         &event->total_time_running);
 640         event->tstamp = now;
 641 }
 642
 643 static void perf_event_update_sibling_time(struct perf_event *leader)
 644 {
 645         struct perf_event *sibling;
 646
 647         for_each_sibling_event(sibling, leader)
 648                 perf_event_update_time(sibling);
 649 }
 650
 651 static void
 652 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 653 {
 654         if (event->state == state)
 655                 return;
 656
 657         perf_event_update_time(event);
 658         /*
 659          * If a group leader gets enabled/disabled all its siblings
 660          * are affected too.
 661          */
 662         if ((event->state < 0) ^ (state < 0))
 663                 perf_event_update_sibling_time(event);
 664
 665         WRITE_ONCE(event->state, state);
 666 }
 667
 668 #ifdef CONFIG_CGROUP_PERF
 669
 670 static inline bool
 671 perf_cgroup_match(struct perf_event *event)
 672 {
 673         struct perf_event_context *ctx = event->ctx;
 674         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 675
 676         /* @event doesn't care about cgroup */
 677         if (!event->cgrp)
 678                 return true;
 679
 680         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 681         if (!cpuctx->cgrp)
 682                 return false;
 683
 684         /*
 685          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 686          * also enabled for all its descendant cgroups.  If @cpuctx's
 687          * cgroup is a descendant of @event's (the test covers identity
 688          * case), it's a match.
 689          */
 690         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 691                                     event->cgrp->css.cgroup);
 692 }
 693
 694 static inline void perf_detach_cgroup(struct perf_event *event)
 695 {
 696         css_put(&event->cgrp->css);
 697         event->cgrp = NULL;
 698 }
 699
 700 static inline int is_cgroup_event(struct perf_event *event)
 701 {
 702         return event->cgrp != NULL;
 703 }
 704
 705 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 706 {
 707         struct perf_cgroup_info *t;
 708
 709         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 710         return t->time;
 711 }
 712
 713 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 714 {
 715         struct perf_cgroup_info *info;
 716         u64 now;
 717
 718         now = perf_clock();
 719
 720         info = this_cpu_ptr(cgrp->info);
 721
 722         info->time += now - info->timestamp;
 723         info->timestamp = now;
 724 }
 725
 726 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 727 {
 728         struct perf_cgroup *cgrp = cpuctx->cgrp;
 729         struct cgroup_subsys_state *css;
 730
 731         if (cgrp) {
 732                 for (css = &cgrp->css; css; css = css->parent) {
 733                         cgrp = container_of(css, struct perf_cgroup, css);
 734                         __update_cgrp_time(cgrp);
 735                 }
 736         }
 737 }
 738
 739 static inline void update_cgrp_time_from_event(struct perf_event *event)
 740 {
 741         struct perf_cgroup *cgrp;
 742
 743         /*
 744          * ensure we access cgroup data only when needed and
 745          * when we know the cgroup is pinned (css_get)
 746          */
 747         if (!is_cgroup_event(event))
 748                 return;
 749
 750         cgrp = perf_cgroup_from_task(current, event->ctx);
 751         /*
 752          * Do not update time when cgroup is not active
 753          */
 754         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 755                 __update_cgrp_time(event->cgrp);
 756 }
 757
 758 static inline void
 759 perf_cgroup_set_timestamp(struct task_struct *task,
 760                           struct perf_event_context *ctx)
 761 {
 762         struct perf_cgroup *cgrp;
 763         struct perf_cgroup_info *info;
 764         struct cgroup_subsys_state *css;
 765
 766         /*
 767          * ctx->lock held by caller
 768          * ensure we do not access cgroup data
 769          * unless we have the cgroup pinned (css_get)
 770          */
 771         if (!task || !ctx->nr_cgroups)
 772                 return;
 773
 774         cgrp = perf_cgroup_from_task(task, ctx);
 775
 776         for (css = &cgrp->css; css; css = css->parent) {
 777                 cgrp = container_of(css, struct perf_cgroup, css);
 778                 info = this_cpu_ptr(cgrp->info);
 779                 info->timestamp = ctx->timestamp;
 780         }
 781 }
 782
 783 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 784
 785 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 786 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 787
 788 /*
 789  * reschedule events based on the cgroup constraint of task.
 790  *
 791  * mode SWOUT : schedule out everything
 792  * mode SWIN : schedule in based on cgroup for next
 793  */
 794 static void perf_cgroup_switch(struct task_struct *task, int mode)
 795 {
 796         struct perf_cpu_context *cpuctx;
 797         struct list_head *list;
 798         unsigned long flags;
 799
 800         /*
 801          * Disable interrupts and preemption to avoid this CPU's
 802          * cgrp_cpuctx_entry to change under us.
 803          */
 804         local_irq_save(flags);
 805
 806         list = this_cpu_ptr(&cgrp_cpuctx_list);
 807         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
 808                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 809
 810                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 811                 perf_pmu_disable(cpuctx->ctx.pmu);
 812
 813                 if (mode & PERF_CGROUP_SWOUT) {
 814                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 815                         /*
 816                          * must not be done before ctxswout due
 817                          * to event_filter_match() in event_sched_out()
 818                          */
 819                         cpuctx->cgrp = NULL;
 820                 }
 821
 822                 if (mode & PERF_CGROUP_SWIN) {
 823                         WARN_ON_ONCE(cpuctx->cgrp);
 824                         /*
 825                          * set cgrp before ctxsw in to allow
 826                          * event_filter_match() to not have to pass
 827                          * task around
 828                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
 829                          * because cgorup events are only per-cpu
 830                          */
 831                         cpuctx->cgrp = perf_cgroup_from_task(task,
 832                                                              &cpuctx->ctx);
 833                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 834                 }
 835                 perf_pmu_enable(cpuctx->ctx.pmu);
 836                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 837         }
 838
 839         local_irq_restore(flags);
 840 }
 841
 842 static inline void perf_cgroup_sched_out(struct task_struct *task,
 843                                          struct task_struct *next)
 844 {
 845         struct perf_cgroup *cgrp1;
 846         struct perf_cgroup *cgrp2 = NULL;
 847
 848         rcu_read_lock();
 849         /*
 850          * we come here when we know perf_cgroup_events > 0
 851          * we do not need to pass the ctx here because we know
 852          * we are holding the rcu lock
 853          */
 854         cgrp1 = perf_cgroup_from_task(task, NULL);
 855         cgrp2 = perf_cgroup_from_task(next, NULL);
 856
 857         /*
 858          * only schedule out current cgroup events if we know
 859          * that we are switching to a different cgroup. Otherwise,
 860          * do no touch the cgroup events.
 861          */
 862         if (cgrp1 != cgrp2)
 863                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 864
 865         rcu_read_unlock();
 866 }
 867
 868 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 869                                         struct task_struct *task)
 870 {
 871         struct perf_cgroup *cgrp1;
 872         struct perf_cgroup *cgrp2 = NULL;
 873
 874         rcu_read_lock();
 875         /*
 876          * we come here when we know perf_cgroup_events > 0
 877          * we do not need to pass the ctx here because we know
 878          * we are holding the rcu lock
 879          */
 880         cgrp1 = perf_cgroup_from_task(task, NULL);
 881         cgrp2 = perf_cgroup_from_task(prev, NULL);
 882
 883         /*
 884          * only need to schedule in cgroup events if we are changing
 885          * cgroup during ctxsw. Cgroup events were not scheduled
 886          * out of ctxsw out if that was not the case.
 887          */
 888         if (cgrp1 != cgrp2)
 889                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 890
 891         rcu_read_unlock();
 892 }
 893
 894 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 895                                       struct perf_event_attr *attr,
 896                                       struct perf_event *group_leader)
 897 {
 898         struct perf_cgroup *cgrp;
 899         struct cgroup_subsys_state *css;
 900         struct fd f = fdget(fd);
 901         int ret = 0;
 902
 903         if (!f.file)
 904                 return -EBADF;
 905
 906         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 907                                          &perf_event_cgrp_subsys);
 908         if (IS_ERR(css)) {
 909                 ret = PTR_ERR(css);
 910                 goto out;
 911         }
 912
 913         cgrp = container_of(css, struct perf_cgroup, css);
 914         event->cgrp = cgrp;
 915
 916         /*
 917          * all events in a group must monitor
 918          * the same cgroup because a task belongs
 919          * to only one perf cgroup at a time
 920          */
 921         if (group_leader && group_leader->cgrp != cgrp) {
 922                 perf_detach_cgroup(event);
 923                 ret = -EINVAL;
 924         }
 925 out:
 926         fdput(f);
 927         return ret;
 928 }
 929
 930 static inline void
 931 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 932 {
 933         struct perf_cgroup_info *t;
 934         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 935         event->shadow_ctx_time = now - t->timestamp;
 936 }
 937
 938 /*
 939  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
 940  * cleared when last cgroup event is removed.
 941  */
 942 static inline void
 943 list_update_cgroup_event(struct perf_event *event,
 944                          struct perf_event_context *ctx, bool add)
 945 {
 946         struct perf_cpu_context *cpuctx;
 947         struct list_head *cpuctx_entry;
 948
 949         if (!is_cgroup_event(event))
 950                 return;
 951
 952         /*
 953          * Because cgroup events are always per-cpu events,
 954          * this will always be called from the right CPU.
 955          */
 956         cpuctx = __get_cpu_context(ctx);
 957
 958         /*
 959          * Since setting cpuctx->cgrp is conditional on the current @cgrp
 960          * matching the event's cgroup, we must do this for every new event,
 961          * because if the first would mismatch, the second would not try again
 962          * and we would leave cpuctx->cgrp unset.
 963          */
 964         if (add && !cpuctx->cgrp) {
 965                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 966
 967                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 968                         cpuctx->cgrp = cgrp;
 969         }
 970
 971         if (add && ctx->nr_cgroups++)
 972                 return;
 973         else if (!add && --ctx->nr_cgroups)
 974                 return;
 975
 976         /* no cgroup running */
 977         if (!add)
 978                 cpuctx->cgrp = NULL;
 979
 980         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 981         if (add)
 982                 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 983         else
 984                 list_del(cpuctx_entry);
 985 }
 986
 987 #else /* !CONFIG_CGROUP_PERF */
 988
 989 static inline bool
 990 perf_cgroup_match(struct perf_event *event)
 991 {
 992         return true;
 993 }
 994
 995 static inline void perf_detach_cgroup(struct perf_event *event)
 996 {}
 997
 998 static inline int is_cgroup_event(struct perf_event *event)
 999 {
1000         return 0;
1001 }
1002
/* !CONFIG_CGROUP_PERF: no cgroup time to update; intentionally empty. */
static inline void
update_cgrp_time_from_event(struct perf_event *event)
{
}
1006
/* !CONFIG_CGROUP_PERF: no cgroup time to update; intentionally empty. */
static inline void
update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}
1010
/* !CONFIG_CGROUP_PERF: no cgroup events to switch out; intentionally empty. */
static inline void
perf_cgroup_sched_out(struct task_struct *task, struct task_struct *next)
{
}
1015
/* !CONFIG_CGROUP_PERF: no cgroup events to switch in; intentionally empty. */
static inline void
perf_cgroup_sched_in(struct task_struct *prev, struct task_struct *task)
{
}
1020
/*
 * !CONFIG_CGROUP_PERF: attaching an event to a cgroup is not supported, so
 * every request is rejected with -EINVAL.
 */
static inline int
perf_cgroup_connect(pid_t pid, struct perf_event *event,
                    struct perf_event_attr *attr,
                    struct perf_event *group_leader)
{
        return -EINVAL;
}
1027
/* !CONFIG_CGROUP_PERF: no cgroup timestamps to set; intentionally empty. */
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}
1033
/*
 * !CONFIG_CGROUP_PERF: there are no cgroup events to switch, so this is an
 * empty stub.  It is 'static inline' like every other stub in this section,
 * so that no undeclared global symbol is emitted for it.
 */
static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
1038
1039 static inline void
1040 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1041 {
1042 }
1043
1044 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1045 {
1046         return 0;
1047 }
1048
/* !CONFIG_CGROUP_PERF: no cgroup bookkeeping to update; intentionally empty. */
static inline void list_update_cgroup_event(struct perf_event *event,
                                            struct perf_event_context *ctx,
                                            bool add)
{
}
1054
1055 #endif
1056
/*
 * Default multiplexing interval in milliseconds.  It is derived from HZ so
 * that, by default, multiplexing follows the timer tick just like the
 * original code did.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
1062 /*
1063  * function must be called with interrupts disabled
1064  */
1065 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1066 {
1067         struct perf_cpu_context *cpuctx;
1068         bool rotations;
1069
1070         lockdep_assert_irqs_disabled();
1071
1072         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1073         rotations = perf_rotate_context(cpuctx);
1074
1075         raw_spin_lock(&cpuctx->hrtimer_lock);
1076         if (rotations)
1077                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1078         else
1079                 cpuctx->hrtimer_active = 0;
1080         raw_spin_unlock(&cpuctx->hrtimer_lock);
1081
1082         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1083 }
1084
1085 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1086 {
1087         struct hrtimer *timer = &cpuctx->hrtimer;
1088         struct pmu *pmu = cpuctx->ctx.pmu;
1089         u64 interval;
1090
1091         /* no multiplexing needed for SW PMU */
1092         if (pmu->task_ctx_nr == perf_sw_context)
1093                 return;
1094
1095         /*
1096          * check default is sane, if not set then force to
1097          * default interval (1/tick)
1098          */
1099         interval = pmu->hrtimer_interval_ms;
1100         if (interval < 1)
1101                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1102
1103         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1104
1105         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1106         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1107         timer->function = perf_mux_hrtimer_handler;
1108 }
1109
1110 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1111 {
1112         struct hrtimer *timer = &cpuctx->hrtimer;
1113         struct pmu *pmu = cpuctx->ctx.pmu;
1114         unsigned long flags;
1115
1116         /* not for SW PMU */
1117         if (pmu->task_ctx_nr == perf_sw_context)
1118                 return 0;
1119
1120         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1121         if (!cpuctx->hrtimer_active) {
1122                 cpuctx->hrtimer_active = 1;
1123                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1124                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1125         }
1126         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1127
1128         return 0;
1129 }
1130
1131 void perf_pmu_disable(struct pmu *pmu)
1132 {
1133         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1134         if (!(*count)++)
1135                 pmu->pmu_disable(pmu);
1136 }
1137
1138 void perf_pmu_enable(struct pmu *pmu)
1139 {
1140         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1141         if (!--(*count))
1142                 pmu->pmu_enable(pmu);
1143 }
1144
1145 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1146
1147 /*
1148  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1149  * perf_event_task_tick() are fully serialized because they're strictly cpu
1150  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1151  * disabled, while perf_event_task_tick is called from IRQ context.
1152  */
1153 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1154 {
1155         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1156
1157         lockdep_assert_irqs_disabled();
1158
1159         WARN_ON(!list_empty(&ctx->active_ctx_list));
1160
1161         list_add(&ctx->active_ctx_list, head);
1162 }
1163
1164 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1165 {
1166         lockdep_assert_irqs_disabled();
1167
1168         WARN_ON(list_empty(&ctx->active_ctx_list));
1169
1170         list_del_init(&ctx->active_ctx_list);
1171 }
1172
1173 static void get_ctx(struct perf_event_context *ctx)
1174 {
1175         refcount_inc(&ctx->refcount);
1176 }
1177
1178 static void free_ctx(struct rcu_head *head)
1179 {
1180         struct perf_event_context *ctx;
1181
1182         ctx = container_of(head, struct perf_event_context, rcu_head);
1183         kfree(ctx->task_ctx_data);
1184         kfree(ctx);
1185 }
1186
1187 static void put_ctx(struct perf_event_context *ctx)
1188 {
1189         if (refcount_dec_and_test(&ctx->refcount)) {
1190                 if (ctx->parent_ctx)
1191                         put_ctx(ctx->parent_ctx);
1192                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1193                         put_task_struct(ctx->task);
1194                 call_rcu(&ctx->rcu_head, free_ctx);
1195         }
1196 }
1197
1198 /*
1199  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1200  * perf_pmu_migrate_context() we need some magic.
1201  *
1202  * Those places that change perf_event::ctx will hold both
1203  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1204  *
1205  * Lock ordering is by mutex address. There are two other sites where
1206  * perf_event_context::mutex nests and those are:
1207  *
1208  *  - perf_event_exit_task_context()    [ child , 0 ]
1209  *      perf_event_exit_event()
1210  *        put_event()                   [ parent, 1 ]
1211  *
1212  *  - perf_event_init_context()         [ parent, 0 ]
1213  *      inherit_task_group()
1214  *        inherit_group()
1215  *          inherit_event()
1216  *            perf_event_alloc()
1217  *              perf_init_event()
1218  *                perf_try_init_event() [ child , 1 ]
1219  *
1220  * While it appears there is an obvious deadlock here -- the parent and child
1221  * nesting levels are inverted between the two. This is in fact safe because
1222  * life-time rules separate them. That is an exiting task cannot fork, and a
1223  * spawning task cannot (yet) exit.
1224  *
 * But remember that these are parent<->child context relations, and
1226  * migration does not affect children, therefore these two orderings should not
1227  * interact.
1228  *
1229  * The change in perf_event::ctx does not affect children (as claimed above)
1230  * because the sys_perf_event_open() case will install a new event and break
1231  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1232  * concerned with cpuctx and that doesn't have children.
1233  *
1234  * The places that change perf_event::ctx will issue:
1235  *
1236  *   perf_remove_from_context();
1237  *   synchronize_rcu();
1238  *   perf_install_in_context();
1239  *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
1241  * quiesce the event, after which we can install it in the new location. This
1242  * means that only external vectors (perf_fops, prctl) can perturb the event
1243  * while in transit. Therefore all such accessors should also acquire
1244  * perf_event_context::mutex to serialize against this.
1245  *
1246  * However; because event->ctx can change while we're waiting to acquire
1247  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1248  * function.
1249  *
1250  * Lock order:
1251  *    cred_guard_mutex
1252  *      task_struct::perf_event_mutex
1253  *        perf_event_context::mutex
1254  *          perf_event::child_mutex;
1255  *            perf_event_context::lock
1256  *          perf_event::mmap_mutex
1257  *          mmap_sem
1258  *
1259  *    cpu_hotplug_lock
1260  *      pmus_lock
1261  *        cpuctx->mutex / perf_event_context::mutex
1262  */
1263 static struct perf_event_context *
1264 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1265 {
1266         struct perf_event_context *ctx;
1267
1268 again:
1269         rcu_read_lock();
1270         ctx = READ_ONCE(event->ctx);
1271         if (!refcount_inc_not_zero(&ctx->refcount)) {
1272                 rcu_read_unlock();
1273                 goto again;
1274         }
1275         rcu_read_unlock();
1276
1277         mutex_lock_nested(&ctx->mutex, nesting);
1278         if (event->ctx != ctx) {
1279                 mutex_unlock(&ctx->mutex);
1280                 put_ctx(ctx);
1281                 goto again;
1282         }
1283
1284         return ctx;
1285 }
1286
/* perf_event_ctx_lock_nested() with nesting level 0. */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        const int nesting = 0;

        return perf_event_ctx_lock_nested(event, nesting);
}
1292
1293 static void perf_event_ctx_unlock(struct perf_event *event,
1294                                   struct perf_event_context *ctx)
1295 {
1296         mutex_unlock(&ctx->mutex);
1297         put_ctx(ctx);
1298 }
1299
1300 /*
1301  * This must be done under the ctx->lock, such as to serialize against
1302  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1303  * calling scheduler related locks and ctx->lock nests inside those.
1304  */
1305 static __must_check struct perf_event_context *
1306 unclone_ctx(struct perf_event_context *ctx)
1307 {
1308         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1309
1310         lockdep_assert_held(&ctx->lock);
1311
1312         if (parent_ctx)
1313                 ctx->parent_ctx = NULL;
1314         ctx->generation++;
1315
1316         return parent_ctx;
1317 }
1318
1319 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1320                                 enum pid_type type)
1321 {
1322         u32 nr;
1323         /*
1324          * only top level events have the pid namespace they were created in
1325          */
1326         if (event->parent)
1327                 event = event->parent;
1328
1329         nr = __task_pid_nr_ns(p, type, event->ns);
1330         /* avoid -1 if it is idle thread or runs in another ns */
1331         if (!nr && !pid_alive(p))
1332                 nr = -1;
1333         return nr;
1334 }
1335
1336 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1337 {
1338         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1339 }
1340
1341 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1342 {
1343         return perf_event_pid_type(event, p, PIDTYPE_PID);
1344 }
1345
1346 /*
1347  * If we inherit events we want to return the parent event id
1348  * to userspace.
1349  */
1350 static u64 primary_event_id(struct perf_event *event)
1351 {
1352         u64 id = event->id;
1353
1354         if (event->parent)
1355                 id = event->parent->id;
1356
1357         return id;
1358 }
1359
1360 /*
1361  * Get the perf_event_context for a task and lock it.
1362  *
1363  * This has to cope with with the fact that until it is locked,
1364  * the context could get moved to another task.
1365  */
1366 static struct perf_event_context *
1367 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1368 {
1369         struct perf_event_context *ctx;
1370
1371 retry:
1372         /*
1373          * One of the few rules of preemptible RCU is that one cannot do
1374          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1375          * part of the read side critical section was irqs-enabled -- see
1376          * rcu_read_unlock_special().
1377          *
1378          * Since ctx->lock nests under rq->lock we must ensure the entire read
1379          * side critical section has interrupts disabled.
1380          */
1381         local_irq_save(*flags);
1382         rcu_read_lock();
1383         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1384         if (ctx) {
1385                 /*
1386                  * If this context is a clone of another, it might
1387                  * get swapped for another underneath us by
1388                  * perf_event_task_sched_out, though the
1389                  * rcu_read_lock() protects us from any context
1390                  * getting freed.  Lock the context and check if it
1391                  * got swapped before we could get the lock, and retry
1392                  * if so.  If we locked the right context, then it
1393                  * can't get swapped on us any more.
1394                  */
1395                 raw_spin_lock(&ctx->lock);
1396                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1397                         raw_spin_unlock(&ctx->lock);
1398                         rcu_read_unlock();
1399                         local_irq_restore(*flags);
1400                         goto retry;
1401                 }
1402
1403                 if (ctx->task == TASK_TOMBSTONE ||
1404                     !refcount_inc_not_zero(&ctx->refcount)) {
1405                         raw_spin_unlock(&ctx->lock);
1406                         ctx = NULL;
1407                 } else {
1408                         WARN_ON_ONCE(ctx->task != task);
1409                 }
1410         }
1411         rcu_read_unlock();
1412         if (!ctx)
1413                 local_irq_restore(*flags);
1414         return ctx;
1415 }
1416
1417 /*
1418  * Get the context for a task and increment its pin_count so it
1419  * can't get swapped to another task.  This also increments its
1420  * reference count so that the context can't get freed.
1421  */
1422 static struct perf_event_context *
1423 perf_pin_task_context(struct task_struct *task, int ctxn)
1424 {
1425         struct perf_event_context *ctx;
1426         unsigned long flags;
1427
1428         ctx = perf_lock_task_context(task, ctxn, &flags);
1429         if (ctx) {
1430                 ++ctx->pin_count;
1431                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1432         }
1433         return ctx;
1434 }
1435
1436 static void perf_unpin_context(struct perf_event_context *ctx)
1437 {
1438         unsigned long flags;
1439
1440         raw_spin_lock_irqsave(&ctx->lock, flags);
1441         --ctx->pin_count;
1442         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1443 }
1444
1445 /*
1446  * Update the record of the current time in a context.
1447  */
1448 static void update_context_time(struct perf_event_context *ctx)
1449 {
1450         u64 now = perf_clock();
1451
1452         ctx->time += now - ctx->timestamp;
1453         ctx->timestamp = now;
1454 }
1455
1456 static u64 perf_event_time(struct perf_event *event)
1457 {
1458         struct perf_event_context *ctx = event->ctx;
1459
1460         if (is_cgroup_event(event))
1461                 return perf_cgroup_event_time(event);
1462
1463         return ctx ? ctx->time : 0;
1464 }
1465
1466 static enum event_type_t get_event_type(struct perf_event *event)
1467 {
1468         struct perf_event_context *ctx = event->ctx;
1469         enum event_type_t event_type;
1470
1471         lockdep_assert_held(&ctx->lock);
1472
1473         /*
1474          * It's 'group type', really, because if our group leader is
1475          * pinned, so are we.
1476          */
1477         if (event->group_leader != event)
1478                 event = event->group_leader;
1479
1480         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1481         if (!ctx->task)
1482                 event_type |= EVENT_CPU;
1483
1484         return event_type;
1485 }
1486
1487 /*
1488  * Helper function to initialize event group nodes.
1489  */
1490 static void init_event_group(struct perf_event *event)
1491 {
1492         RB_CLEAR_NODE(&event->group_node);
1493         event->group_index = 0;
1494 }
1495
1496 /*
1497  * Extract pinned or flexible groups from the context
1498  * based on event attrs bits.
1499  */
1500 static struct perf_event_groups *
1501 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1502 {
1503         if (event->attr.pinned)
1504                 return &ctx->pinned_groups;
1505         else
1506                 return &ctx->flexible_groups;
1507 }
1508
1509 /*
1510  * Helper function to initializes perf_event_group trees.
1511  */
1512 static void perf_event_groups_init(struct perf_event_groups *groups)
1513 {
1514         groups->tree = RB_ROOT;
1515         groups->index = 0;
1516 }
1517
1518 /*
1519  * Compare function for event groups;
1520  *
1521  * Implements complex key that first sorts by CPU and then by virtual index
1522  * which provides ordering when rotating groups for the same CPU.
1523  */
1524 static bool
1525 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1526 {
1527         if (left->cpu < right->cpu)
1528                 return true;
1529         if (left->cpu > right->cpu)
1530                 return false;
1531
1532         if (left->group_index < right->group_index)
1533                 return true;
1534         if (left->group_index > right->group_index)
1535                 return false;
1536
1537         return false;
1538 }
1539
1540 /*
1541  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1542  * key (see perf_event_groups_less). This places it last inside the CPU
1543  * subtree.
1544  */
1545 static void
1546 perf_event_groups_insert(struct perf_event_groups *groups,
1547                          struct perf_event *event)
1548 {
1549         struct perf_event *node_event;
1550         struct rb_node *parent;
1551         struct rb_node **node;
1552
1553         event->group_index = ++groups->index;
1554
1555         node = &groups->tree.rb_node;
1556         parent = *node;
1557
1558         while (*node) {
1559                 parent = *node;
1560                 node_event = container_of(*node, struct perf_event, group_node);
1561
1562                 if (perf_event_groups_less(event, node_event))
1563                         node = &parent->rb_left;
1564                 else
1565                         node = &parent->rb_right;
1566         }
1567
1568         rb_link_node(&event->group_node, parent, node);
1569         rb_insert_color(&event->group_node, &groups->tree);
1570 }
1571
/*
 * Insert @event into whichever groups tree of @ctx (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}
1583
1584 /*
1585  * Delete a group from a tree.
1586  */
1587 static void
1588 perf_event_groups_delete(struct perf_event_groups *groups,
1589                          struct perf_event *event)
1590 {
1591         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1592                      RB_EMPTY_ROOT(&groups->tree));
1593
1594         rb_erase(&event->group_node, &groups->tree);
1595         init_event_group(event);
1596 }
1597
/*
 * Delete @event from whichever groups tree of @ctx (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}
1609
1610 /*
1611  * Get the leftmost event in the @cpu subtree.
1612  */
1613 static struct perf_event *
1614 perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1615 {
1616         struct perf_event *node_event = NULL, *match = NULL;
1617         struct rb_node *node = groups->tree.rb_node;
1618
1619         while (node) {
1620                 node_event = container_of(node, struct perf_event, group_node);
1621
1622                 if (cpu < node_event->cpu) {
1623                         node = node->rb_left;
1624                 } else if (cpu > node_event->cpu) {
1625                         node = node->rb_right;
1626                 } else {
1627                         match = node_event;
1628                         node = node->rb_left;
1629                 }
1630         }
1631
1632         return match;
1633 }
1634
1635 /*
1636  * Like rb_entry_next_safe() for the @cpu subtree.
1637  */
1638 static struct perf_event *
1639 perf_event_groups_next(struct perf_event *event)
1640 {
1641         struct perf_event *next;
1642
1643         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1644         if (next && next->cpu == event->cpu)
1645                 return next;
1646
1647         return NULL;
1648 }
1649
/*
 * Iterate through the whole @groups tree, from rb_first() onwards via
 * rb_next().  @event is the loop cursor; the loop ends when it becomes NULL.
 */
#define perf_event_groups_for_each(event, groups)                            \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),              \
                                   typeof(*event), group_node);              \
             event;                                                          \
             event = rb_entry_safe(rb_next(&event->group_node),              \
                                   typeof(*event), group_node))
1658
1659 /*
1660  * Add an event from the lists for its context.
1661  * Must be called with ctx->mutex and ctx->lock held.
1662  */
1663 static void
1664 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1665 {
1666         lockdep_assert_held(&ctx->lock);
1667
1668         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1669         event->attach_state |= PERF_ATTACH_CONTEXT;
1670
1671         event->tstamp = perf_event_time(event);
1672
1673         /*
1674          * If we're a stand alone event or group leader, we go to the context
1675          * list, group events are kept attached to the group so that
1676          * perf_group_detach can, at all times, locate all siblings.
1677          */
1678         if (event->group_leader == event) {
1679                 event->group_caps = event->event_caps;
1680                 add_event_to_groups(event, ctx);
1681         }
1682
1683         list_update_cgroup_event(event, ctx, true);
1684
1685         list_add_rcu(&event->event_entry, &ctx->event_list);
1686         ctx->nr_events++;
1687         if (event->attr.inherit_stat)
1688                 ctx->nr_stat++;
1689
1690         ctx->generation++;
1691 }
1692
1693 /*
1694  * Initialize event state based on the perf_event_attr::disabled.
1695  */
1696 static inline void perf_event__state_init(struct perf_event *event)
1697 {
1698         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1699                                               PERF_EVENT_STATE_INACTIVE;
1700 }
1701
1702 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1703 {
1704         int entry = sizeof(u64); /* value */
1705         int size = 0;
1706         int nr = 1;
1707
1708         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1709                 size += sizeof(u64);
1710
1711         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1712                 size += sizeof(u64);
1713
1714         if (event->attr.read_format & PERF_FORMAT_ID)
1715                 entry += sizeof(u64);
1716
1717         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1718                 nr += nr_siblings;
1719                 size += sizeof(u64);
1720         }
1721
1722         size += entry * nr;
1723         event->read_size = size;
1724 }
1725
1726 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1727 {
1728         struct perf_sample_data *data;
1729         u16 size = 0;
1730
1731         if (sample_type & PERF_SAMPLE_IP)
1732                 size += sizeof(data->ip);
1733
1734         if (sample_type & PERF_SAMPLE_ADDR)
1735                 size += sizeof(data->addr);
1736
1737         if (sample_type & PERF_SAMPLE_PERIOD)
1738                 size += sizeof(data->period);
1739
1740         if (sample_type & PERF_SAMPLE_WEIGHT)
1741                 size += sizeof(data->weight);
1742
1743         if (sample_type & PERF_SAMPLE_READ)
1744                 size += event->read_size;
1745
1746         if (sample_type & PERF_SAMPLE_DATA_SRC)
1747                 size += sizeof(data->data_src.val);
1748
1749         if (sample_type & PERF_SAMPLE_TRANSACTION)
1750                 size += sizeof(data->txn);
1751
1752         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1753                 size += sizeof(data->phys_addr);
1754
1755         event->header_size = size;
1756 }
1757
1758 /*
1759  * Called at perf_event creation and when events are attached/detached from a
1760  * group.
1761  */
1762 static void perf_event__header_size(struct perf_event *event)
1763 {
1764         __perf_event_read_size(event,
1765                                event->group_leader->nr_siblings);
1766         __perf_event_header_size(event, event->attr.sample_type);
1767 }
1768
1769 static void perf_event__id_header_size(struct perf_event *event)
1770 {
1771         struct perf_sample_data *data;
1772         u64 sample_type = event->attr.sample_type;
1773         u16 size = 0;
1774
1775         if (sample_type & PERF_SAMPLE_TID)
1776                 size += sizeof(data->tid_entry);
1777
1778         if (sample_type & PERF_SAMPLE_TIME)
1779                 size += sizeof(data->time);
1780
1781         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1782                 size += sizeof(data->id);
1783
1784         if (sample_type & PERF_SAMPLE_ID)
1785                 size += sizeof(data->id);
1786
1787         if (sample_type & PERF_SAMPLE_STREAM_ID)
1788                 size += sizeof(data->stream_id);
1789
1790         if (sample_type & PERF_SAMPLE_CPU)
1791                 size += sizeof(data->cpu_entry);
1792
1793         event->id_header_size = size;
1794 }
1795
1796 static bool perf_event_validate_size(struct perf_event *event)
1797 {
1798         /*
1799          * The values computed here will be over-written when we actually
1800          * attach the event.
1801          */
1802         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1803         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1804         perf_event__id_header_size(event);
1805
1806         /*
1807          * Sum the lot; should not exceed the 64k limit we have on records.
1808          * Conservative limit to allow for callchains and other variable fields.
1809          */
1810         if (event->read_size + event->header_size +
1811             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1812                 return false;
1813
1814         return true;
1815 }
1816
1817 static void perf_group_attach(struct perf_event *event)
1818 {
1819         struct perf_event *group_leader = event->group_leader, *pos;
1820
1821         lockdep_assert_held(&event->ctx->lock);
1822
1823         /*
1824          * We can have double attach due to group movement in perf_event_open.
1825          */
1826         if (event->attach_state & PERF_ATTACH_GROUP)
1827                 return;
1828
1829         event->attach_state |= PERF_ATTACH_GROUP;
1830
1831         if (group_leader == event)
1832                 return;
1833
1834         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1835
1836         group_leader->group_caps &= event->event_caps;
1837
1838         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1839         group_leader->nr_siblings++;
1840
1841         perf_event__header_size(group_leader);
1842
1843         for_each_sibling_event(pos, group_leader)
1844                 perf_event__header_size(pos);
1845 }
1846
1847 /*
1848  * Remove an event from the lists for its context.
1849  * Must be called with ctx->mutex and ctx->lock held.
1850  */
1851 static void
1852 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1853 {
1854         WARN_ON_ONCE(event->ctx != ctx);
1855         lockdep_assert_held(&ctx->lock);
1856
1857         /*
1858          * We can have double detach due to exit/hot-unplug + close.
1859          */
1860         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1861                 return;
1862
1863         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1864
1865         list_update_cgroup_event(event, ctx, false);
1866
1867         ctx->nr_events--;
1868         if (event->attr.inherit_stat)
1869                 ctx->nr_stat--;
1870
1871         list_del_rcu(&event->event_entry);
1872
1873         if (event->group_leader == event)
1874                 del_event_from_groups(event, ctx);
1875
1876         /*
1877          * If event was in error state, then keep it
1878          * that way, otherwise bogus counts will be
1879          * returned on read(). The only way to get out
1880          * of error state is by explicit re-enabling
1881          * of the event
1882          */
1883         if (event->state > PERF_EVENT_STATE_OFF)
1884                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1885
1886         ctx->generation++;
1887 }
1888
1889 static void perf_group_detach(struct perf_event *event)
1890 {
1891         struct perf_event *sibling, *tmp;
1892         struct perf_event_context *ctx = event->ctx;
1893
1894         lockdep_assert_held(&ctx->lock);
1895
1896         /*
1897          * We can have double detach due to exit/hot-unplug + close.
1898          */
1899         if (!(event->attach_state & PERF_ATTACH_GROUP))
1900                 return;
1901
1902         event->attach_state &= ~PERF_ATTACH_GROUP;
1903
1904         /*
1905          * If this is a sibling, remove it from its group.
1906          */
1907         if (event->group_leader != event) {
1908                 list_del_init(&event->sibling_list);
1909                 event->group_leader->nr_siblings--;
1910                 goto out;
1911         }
1912
1913         /*
1914          * If this was a group event with sibling events then
1915          * upgrade the siblings to singleton events by adding them
1916          * to whatever list we are on.
1917          */
1918         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1919
1920                 sibling->group_leader = sibling;
1921                 list_del_init(&sibling->sibling_list);
1922
1923                 /* Inherit group flags from the previous leader */
1924                 sibling->group_caps = event->group_caps;
1925
1926                 if (!RB_EMPTY_NODE(&event->group_node)) {
1927                         add_event_to_groups(sibling, event->ctx);
1928
1929                         if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1930                                 struct list_head *list = sibling->attr.pinned ?
1931                                         &ctx->pinned_active : &ctx->flexible_active;
1932
1933                                 list_add_tail(&sibling->active_list, list);
1934                         }
1935                 }
1936
1937                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1938         }
1939
1940 out:
1941         perf_event__header_size(event->group_leader);
1942
1943         for_each_sibling_event(tmp, event->group_leader)
1944                 perf_event__header_size(tmp);
1945 }
1946
1947 static bool is_orphaned_event(struct perf_event *event)
1948 {
1949         return event->state == PERF_EVENT_STATE_DEAD;
1950 }
1951
1952 static inline int __pmu_filter_match(struct perf_event *event)
1953 {
1954         struct pmu *pmu = event->pmu;
1955         return pmu->filter_match ? pmu->filter_match(event) : 1;
1956 }
1957
1958 /*
1959  * Check whether we should attempt to schedule an event group based on
1960  * PMU-specific filtering. An event group can consist of HW and SW events,
1961  * potentially with a SW leader, so we must check all the filters, to
1962  * determine whether a group is schedulable:
1963  */
1964 static inline int pmu_filter_match(struct perf_event *event)
1965 {
1966         struct perf_event *sibling;
1967
1968         if (!__pmu_filter_match(event))
1969                 return 0;
1970
1971         for_each_sibling_event(sibling, event) {
1972                 if (!__pmu_filter_match(sibling))
1973                         return 0;
1974         }
1975
1976         return 1;
1977 }
1978
1979 static inline int
1980 event_filter_match(struct perf_event *event)
1981 {
1982         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1983                perf_cgroup_match(event) && pmu_filter_match(event);
1984 }
1985
1986 static void
1987 event_sched_out(struct perf_event *event,
1988                   struct perf_cpu_context *cpuctx,
1989                   struct perf_event_context *ctx)
1990 {
1991         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
1992
1993         WARN_ON_ONCE(event->ctx != ctx);
1994         lockdep_assert_held(&ctx->lock);
1995
1996         if (event->state != PERF_EVENT_STATE_ACTIVE)
1997                 return;
1998
1999         /*
2000          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2001          * we can schedule events _OUT_ individually through things like
2002          * __perf_remove_from_context().
2003          */
2004         list_del_init(&event->active_list);
2005
2006         perf_pmu_disable(event->pmu);
2007
2008         event->pmu->del(event, 0);
2009         event->oncpu = -1;
2010
2011         if (event->pending_disable) {
2012                 event->pending_disable = 0;
2013                 state = PERF_EVENT_STATE_OFF;
2014         }
2015         perf_event_set_state(event, state);
2016
2017         if (!is_software_event(event))
2018                 cpuctx->active_oncpu--;
2019         if (!--ctx->nr_active)
2020                 perf_event_ctx_deactivate(ctx);
2021         if (event->attr.freq && event->attr.sample_freq)
2022                 ctx->nr_freq--;
2023         if (event->attr.exclusive || !cpuctx->active_oncpu)
2024                 cpuctx->exclusive = 0;
2025
2026         perf_pmu_enable(event->pmu);
2027 }
2028
2029 static void
2030 group_sched_out(struct perf_event *group_event,
2031                 struct perf_cpu_context *cpuctx,
2032                 struct perf_event_context *ctx)
2033 {
2034         struct perf_event *event;
2035
2036         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2037                 return;
2038
2039         perf_pmu_disable(ctx->pmu);
2040
2041         event_sched_out(group_event, cpuctx, ctx);
2042
2043         /*
2044          * Schedule out siblings (if any):
2045          */
2046         for_each_sibling_event(event, group_event)
2047                 event_sched_out(event, cpuctx, ctx);
2048
2049         perf_pmu_enable(ctx->pmu);
2050
2051         if (group_event->attr.exclusive)
2052                 cpuctx->exclusive = 0;
2053 }
2054
2055 #define DETACH_GROUP    0x01UL
2056
2057 /*
2058  * Cross CPU call to remove a performance event
2059  *
2060  * We disable the event on the hardware level first. After that we
2061  * remove it from the context list.
2062  */
2063 static void
2064 __perf_remove_from_context(struct perf_event *event,
2065                            struct perf_cpu_context *cpuctx,
2066                            struct perf_event_context *ctx,
2067                            void *info)
2068 {
2069         unsigned long flags = (unsigned long)info;
2070
2071         if (ctx->is_active & EVENT_TIME) {
2072                 update_context_time(ctx);
2073                 update_cgrp_time_from_cpuctx(cpuctx);
2074         }
2075
2076         event_sched_out(event, cpuctx, ctx);
2077         if (flags & DETACH_GROUP)
2078                 perf_group_detach(event);
2079         list_del_event(event, ctx);
2080
2081         if (!ctx->nr_events && ctx->is_active) {
2082                 ctx->is_active = 0;
2083                 if (ctx->task) {
2084                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2085                         cpuctx->task_ctx = NULL;
2086                 }
2087         }
2088 }
2089
2090 /*
2091  * Remove the event from a task's (or a CPU's) list of events.
2092  *
2093  * If event->ctx is a cloned context, callers must make sure that
2094  * every task struct that event->ctx->task could possibly point to
2095  * remains valid.  This is OK when called from perf_release since
2096  * that only calls us on the top-level context, which can't be a clone.
2097  * When called from perf_event_exit_task, it's OK because the
2098  * context has been detached from its task.
2099  */
2100 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2101 {
2102         struct perf_event_context *ctx = event->ctx;
2103
2104         lockdep_assert_held(&ctx->mutex);
2105
2106         event_function_call(event, __perf_remove_from_context, (void *)flags);
2107
2108         /*
2109          * The above event_function_call() can NO-OP when it hits
2110          * TASK_TOMBSTONE. In that case we must already have been detached
2111          * from the context (by perf_event_exit_event()) but the grouping
2112          * might still be in-tact.
2113          */
2114         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2115         if ((flags & DETACH_GROUP) &&
2116             (event->attach_state & PERF_ATTACH_GROUP)) {
2117                 /*
2118                  * Since in that case we cannot possibly be scheduled, simply
2119                  * detach now.
2120                  */
2121                 raw_spin_lock_irq(&ctx->lock);
2122                 perf_group_detach(event);
2123                 raw_spin_unlock_irq(&ctx->lock);
2124         }
2125 }
2126
2127 /*
2128  * Cross CPU call to disable a performance event
2129  */
2130 static void __perf_event_disable(struct perf_event *event,
2131                                  struct perf_cpu_context *cpuctx,
2132                                  struct perf_event_context *ctx,
2133                                  void *info)
2134 {
2135         if (event->state < PERF_EVENT_STATE_INACTIVE)
2136                 return;
2137
2138         if (ctx->is_active & EVENT_TIME) {
2139                 update_context_time(ctx);
2140                 update_cgrp_time_from_event(event);
2141         }
2142
2143         if (event == event->group_leader)
2144                 group_sched_out(event, cpuctx, ctx);
2145         else
2146                 event_sched_out(event, cpuctx, ctx);
2147
2148         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2149 }
2150
2151 /*
2152  * Disable an event.
2153  *
2154  * If event->ctx is a cloned context, callers must make sure that
2155  * every task struct that event->ctx->task could possibly point to
2156  * remains valid.  This condition is satisifed when called through
2157  * perf_event_for_each_child or perf_event_for_each because they
2158  * hold the top-level event's child_mutex, so any descendant that
2159  * goes to exit will block in perf_event_exit_event().
2160  *
2161  * When called from perf_pending_event it's OK because event->ctx
2162  * is the current context on this CPU and preemption is disabled,
2163  * hence we can't get into perf_event_task_sched_out for this context.
2164  */
2165 static void _perf_event_disable(struct perf_event *event)
2166 {
2167         struct perf_event_context *ctx = event->ctx;
2168
2169         raw_spin_lock_irq(&ctx->lock);
2170         if (event->state <= PERF_EVENT_STATE_OFF) {
2171                 raw_spin_unlock_irq(&ctx->lock);
2172                 return;
2173         }
2174         raw_spin_unlock_irq(&ctx->lock);
2175
2176         event_function_call(event, __perf_event_disable, NULL);
2177 }
2178
2179 void perf_event_disable_local(struct perf_event *event)
2180 {
2181         event_function_local(event, __perf_event_disable, NULL);
2182 }
2183
/*
 * Disable @event, taking and releasing the event's context lock around
 * the actual work.
 *
 * Strictly speaking kernel users cannot create groups, so this interface
 * would not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = perf_event_ctx_lock(event);

	_perf_event_disable(event);
	perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);
2197
2198 void perf_event_disable_inatomic(struct perf_event *event)
2199 {
2200         event->pending_disable = 1;
2201         irq_work_queue(&event->pending);
2202 }
2203
2204 static void perf_set_shadow_time(struct perf_event *event,
2205                                  struct perf_event_context *ctx)
2206 {
2207         /*
2208          * use the correct time source for the time snapshot
2209          *
2210          * We could get by without this by leveraging the
2211          * fact that to get to this function, the caller
2212          * has most likely already called update_context_time()
2213          * and update_cgrp_time_xx() and thus both timestamp
2214          * are identical (or very close). Given that tstamp is,
2215          * already adjusted for cgroup, we could say that:
2216          *    tstamp - ctx->timestamp
2217          * is equivalent to
2218          *    tstamp - cgrp->timestamp.
2219          *
2220          * Then, in perf_output_read(), the calculation would
2221          * work with no changes because:
2222          * - event is guaranteed scheduled in
2223          * - no scheduled out in between
2224          * - thus the timestamp would be the same
2225          *
2226          * But this is a bit hairy.
2227          *
2228          * So instead, we have an explicit cgroup call to remain
2229          * within the time time source all along. We believe it
2230          * is cleaner and simpler to understand.
2231          */
2232         if (is_cgroup_event(event))
2233                 perf_cgroup_set_shadow_time(event, event->tstamp);
2234         else
2235                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2236 }
2237
2238 #define MAX_INTERRUPTS (~0ULL)
2239
2240 static void perf_log_throttle(struct perf_event *event, int enable);
2241 static void perf_log_itrace_start(struct perf_event *event);
2242
2243 static int
2244 event_sched_in(struct perf_event *event,
2245                  struct perf_cpu_context *cpuctx,
2246                  struct perf_event_context *ctx)
2247 {
2248         int ret = 0;
2249
2250         lockdep_assert_held(&ctx->lock);
2251
2252         if (event->state <= PERF_EVENT_STATE_OFF)
2253                 return 0;
2254
2255         WRITE_ONCE(event->oncpu, smp_processor_id());
2256         /*
2257          * Order event::oncpu write to happen before the ACTIVE state is
2258          * visible. This allows perf_event_{stop,read}() to observe the correct
2259          * ->oncpu if it sees ACTIVE.
2260          */
2261         smp_wmb();
2262         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2263
2264         /*
2265          * Unthrottle events, since we scheduled we might have missed several
2266          * ticks already, also for a heavily scheduling task there is little
2267          * guarantee it'll get a tick in a timely manner.
2268          */
2269         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2270                 perf_log_throttle(event, 1);
2271                 event->hw.interrupts = 0;
2272         }
2273
2274         perf_pmu_disable(event->pmu);
2275
2276         perf_set_shadow_time(event, ctx);
2277
2278         perf_log_itrace_start(event);
2279
2280         if (event->pmu->add(event, PERF_EF_START)) {
2281                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2282                 event->oncpu = -1;
2283                 ret = -EAGAIN;
2284                 goto out;
2285         }
2286
2287         if (!is_software_event(event))
2288                 cpuctx->active_oncpu++;
2289         if (!ctx->nr_active++)
2290                 perf_event_ctx_activate(ctx);
2291         if (event->attr.freq && event->attr.sample_freq)
2292                 ctx->nr_freq++;
2293
2294         if (event->attr.exclusive)
2295                 cpuctx->exclusive = 1;
2296
2297 out:
2298         perf_pmu_enable(event->pmu);
2299
2300         return ret;
2301 }
2302
2303 static int
2304 group_sched_in(struct perf_event *group_event,
2305                struct perf_cpu_context *cpuctx,
2306                struct perf_event_context *ctx)
2307 {
2308         struct perf_event *event, *partial_group = NULL;
2309         struct pmu *pmu = ctx->pmu;
2310
2311         if (group_event->state == PERF_EVENT_STATE_OFF)
2312                 return 0;
2313
2314         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2315
2316         if (event_sched_in(group_event, cpuctx, ctx)) {
2317                 pmu->cancel_txn(pmu);
2318                 perf_mux_hrtimer_restart(cpuctx);
2319                 return -EAGAIN;
2320         }
2321
2322         /*
2323          * Schedule in siblings as one group (if any):
2324          */
2325         for_each_sibling_event(event, group_event) {
2326                 if (event_sched_in(event, cpuctx, ctx)) {
2327                         partial_group = event;
2328                         goto group_error;
2329                 }
2330         }
2331
2332         if (!pmu->commit_txn(pmu))
2333                 return 0;
2334
2335 group_error:
2336         /*
2337          * Groups can be scheduled in as one unit only, so undo any
2338          * partial group before returning:
2339          * The events up to the failed event are scheduled out normally.
2340          */
2341         for_each_sibling_event(event, group_event) {
2342                 if (event == partial_group)
2343                         break;
2344
2345                 event_sched_out(event, cpuctx, ctx);
2346         }
2347         event_sched_out(group_event, cpuctx, ctx);
2348
2349         pmu->cancel_txn(pmu);
2350
2351         perf_mux_hrtimer_restart(cpuctx);
2352
2353         return -EAGAIN;
2354 }
2355
2356 /*
2357  * Work out whether we can put this event group on the CPU now.
2358  */
2359 static int group_can_go_on(struct perf_event *event,
2360                            struct perf_cpu_context *cpuctx,
2361                            int can_add_hw)
2362 {
2363         /*
2364          * Groups consisting entirely of software events can always go on.
2365          */
2366         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2367                 return 1;
2368         /*
2369          * If an exclusive group is already on, no other hardware
2370          * events can go on.
2371          */
2372         if (cpuctx->exclusive)
2373                 return 0;
2374         /*
2375          * If this group is exclusive and there are already
2376          * events on the CPU, it can't go on.
2377          */
2378         if (event->attr.exclusive && cpuctx->active_oncpu)
2379                 return 0;
2380         /*
2381          * Otherwise, try to add it if all previous groups were able
2382          * to go on.
2383          */
2384         return can_add_hw;
2385 }
2386
/*
 * Make @event a member of @ctx: add it to the context's lists first,
 * then attach it to its group.
 */
static void add_event_to_ctx(struct perf_event *event,
			     struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	perf_group_attach(event);
}
2393
2394 static void ctx_sched_out(struct perf_event_context *ctx,
2395                           struct perf_cpu_context *cpuctx,
2396                           enum event_type_t event_type);
2397 static void
2398 ctx_sched_in(struct perf_event_context *ctx,
2399              struct perf_cpu_context *cpuctx,
2400              enum event_type_t event_type,
2401              struct task_struct *task);
2402
2403 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2404                                struct perf_event_context *ctx,
2405                                enum event_type_t event_type)
2406 {
2407         if (!cpuctx->task_ctx)
2408                 return;
2409
2410         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2411                 return;
2412
2413         ctx_sched_out(ctx, cpuctx, event_type);
2414 }
2415
2416 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2417                                 struct perf_event_context *ctx,
2418                                 struct task_struct *task)
2419 {
2420         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2421         if (ctx)
2422                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2423         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2424         if (ctx)
2425                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2426 }
2427
2428 /*
2429  * We want to maintain the following priority of scheduling:
2430  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2431  *  - task pinned (EVENT_PINNED)
2432  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2433  *  - task flexible (EVENT_FLEXIBLE).
2434  *
2435  * In order to avoid unscheduling and scheduling back in everything every
2436  * time an event is added, only do it for the groups of equal priority and
2437  * below.
2438  *
2439  * This can be called after a batch operation on task events, in which case
2440  * event_type is a bit mask of the types of events involved. For CPU events,
2441  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2442  */
2443 static void ctx_resched(struct perf_cpu_context *cpuctx,
2444                         struct perf_event_context *task_ctx,
2445                         enum event_type_t event_type)
2446 {
2447         enum event_type_t ctx_event_type;
2448         bool cpu_event = !!(event_type & EVENT_CPU);
2449
2450         /*
2451          * If pinned groups are involved, flexible groups also need to be
2452          * scheduled out.
2453          */
2454         if (event_type & EVENT_PINNED)
2455                 event_type |= EVENT_FLEXIBLE;
2456
2457         ctx_event_type = event_type & EVENT_ALL;
2458
2459         perf_pmu_disable(cpuctx->ctx.pmu);
2460         if (task_ctx)
2461                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2462
2463         /*
2464          * Decide which cpu ctx groups to schedule out based on the types
2465          * of events that caused rescheduling:
2466          *  - EVENT_CPU: schedule out corresponding groups;
2467          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2468          *  - otherwise, do nothing more.
2469          */
2470         if (cpu_event)
2471                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2472         else if (ctx_event_type & EVENT_PINNED)
2473                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2474
2475         perf_event_sched_in(cpuctx, task_ctx, current);
2476         perf_pmu_enable(cpuctx->ctx.pmu);
2477 }
2478
2479 /*
2480  * Cross CPU call to install and enable a performance event
2481  *
2482  * Very similar to remote_function() + event_function() but cannot assume that
2483  * things like ctx->is_active and cpuctx->task_ctx are set.
2484  */
2485 static int  __perf_install_in_context(void *info)
2486 {
2487         struct perf_event *event = info;
2488         struct perf_event_context *ctx = event->ctx;
2489         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2490         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2491         bool reprogram = true;
2492         int ret = 0;
2493
2494         raw_spin_lock(&cpuctx->ctx.lock);
2495         if (ctx->task) {
2496                 raw_spin_lock(&ctx->lock);
2497                 task_ctx = ctx;
2498
2499                 reprogram = (ctx->task == current);
2500
2501                 /*
2502                  * If the task is running, it must be running on this CPU,
2503                  * otherwise we cannot reprogram things.
2504                  *
2505                  * If its not running, we don't care, ctx->lock will
2506                  * serialize against it becoming runnable.
2507                  */
2508                 if (task_curr(ctx->task) && !reprogram) {
2509                         ret = -ESRCH;
2510                         goto unlock;
2511                 }
2512
2513                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2514         } else if (task_ctx) {
2515                 raw_spin_lock(&task_ctx->lock);
2516         }
2517
2518 #ifdef CONFIG_CGROUP_PERF
2519         if (is_cgroup_event(event)) {
2520                 /*
2521                  * If the current cgroup doesn't match the event's
2522                  * cgroup, we should not try to schedule it.
2523                  */
2524                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2525                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2526                                         event->cgrp->css.cgroup);
2527         }
2528 #endif
2529
2530         if (reprogram) {
2531                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2532                 add_event_to_ctx(event, ctx);
2533                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2534         } else {
2535                 add_event_to_ctx(event, ctx);
2536         }
2537
2538 unlock:
2539         perf_ctx_unlock(cpuctx, task_ctx);
2540
2541         return ret;
2542 }
2543
2544 /*
2545  * Attach a performance event to a context.
2546  *
2547  * Very similar to event_function_call, see comment there.
2548  */
2549 static void
2550 perf_install_in_context(struct perf_event_context *ctx,
2551                         struct perf_event *event,
2552                         int cpu)
2553 {
2554         struct task_struct *task = READ_ONCE(ctx->task);
2555
2556         lockdep_assert_held(&ctx->mutex);
2557
2558         if (event->cpu != -1)
2559                 event->cpu = cpu;
2560
2561         /*
2562          * Ensures that if we can observe event->ctx, both the event and ctx
2563          * will be 'complete'. See perf_iterate_sb_cpu().
2564          */
2565         smp_store_release(&event->ctx, ctx);
2566
2567         if (!task) {
2568                 cpu_function_call(cpu, __perf_install_in_context, event);
2569                 return;
2570         }
2571
2572         /*
2573          * Should not happen, we validate the ctx is still alive before calling.
2574          */
2575         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2576                 return;
2577
2578         /*
2579          * Installing events is tricky because we cannot rely on ctx->is_active
2580          * to be set in case this is the nr_events 0 -> 1 transition.
2581          *
2582          * Instead we use task_curr(), which tells us if the task is running.
2583          * However, since we use task_curr() outside of rq::lock, we can race
2584          * against the actual state. This means the result can be wrong.
2585          *
2586          * If we get a false positive, we retry, this is harmless.
2587          *
2588          * If we get a false negative, things are complicated. If we are after
2589          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2590          * value must be correct. If we're before, it doesn't matter since
2591          * perf_event_context_sched_in() will program the counter.
2592          *
2593          * However, this hinges on the remote context switch having observed
2594          * our task->perf_event_ctxp[] store, such that it will in fact take
2595          * ctx::lock in perf_event_context_sched_in().
2596          *
2597          * We do this by task_function_call(), if the IPI fails to hit the task
2598          * we know any future context switch of task must see the
2599          * perf_event_ctpx[] store.
2600          */
2601
2602         /*
2603          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2604          * task_cpu() load, such that if the IPI then does not find the task
2605          * running, a future context switch of that task must observe the
2606          * store.
2607          */
2608         smp_mb();
2609 again:
2610         if (!task_function_call(task, __perf_install_in_context, event))
2611                 return;
2612
2613         raw_spin_lock_irq(&ctx->lock);
2614         task = ctx->task;
2615         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2616                 /*
2617                  * Cannot happen because we already checked above (which also
2618                  * cannot happen), and we hold ctx->mutex, which serializes us
2619                  * against perf_event_exit_task_context().
2620                  */
2621                 raw_spin_unlock_irq(&ctx->lock);
2622                 return;
2623         }
2624         /*
2625          * If the task is not running, ctx->lock will avoid it becoming so,
2626          * thus we can safely install the event.
2627          */
2628         if (task_curr(task)) {
2629                 raw_spin_unlock_irq(&ctx->lock);
2630                 goto again;
2631         }
2632         add_event_to_ctx(event, ctx);
2633         raw_spin_unlock_irq(&ctx->lock);
2634 }
2635
2636 /*
2637  * Cross CPU call to enable a performance event
2638  */
2639 static void __perf_event_enable(struct perf_event *event,
2640                                 struct perf_cpu_context *cpuctx,
2641                                 struct perf_event_context *ctx,
2642                                 void *info)
2643 {
2644         struct perf_event *leader = event->group_leader;
2645         struct perf_event_context *task_ctx;
2646
2647         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2648             event->state <= PERF_EVENT_STATE_ERROR)
2649                 return;
2650
2651         if (ctx->is_active)
2652                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2653
2654         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2655
2656         if (!ctx->is_active)
2657                 return;
2658
2659         if (!event_filter_match(event)) {
2660                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2661                 return;
2662         }
2663
2664         /*
2665          * If the event is in a group and isn't the group leader,
2666          * then don't put it on unless the group is on.
2667          */
2668         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2669                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2670                 return;
2671         }
2672
2673         task_ctx = cpuctx->task_ctx;
2674         if (ctx->task)
2675                 WARN_ON_ONCE(task_ctx != ctx);
2676
2677         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2678 }
2679
2680 /*
2681  * Enable an event.
2682  *
2683  * If event->ctx is a cloned context, callers must make sure that
2684  * every task struct that event->ctx->task could possibly point to
2685  * remains valid.  This condition is satisfied when called through
2686  * perf_event_for_each_child or perf_event_for_each as described
2687  * for perf_event_disable.
2688  */
2689 static void _perf_event_enable(struct perf_event *event)
2690 {
2691         struct perf_event_context *ctx = event->ctx;
2692
2693         raw_spin_lock_irq(&ctx->lock);
2694         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2695             event->state <  PERF_EVENT_STATE_ERROR) {
2696                 raw_spin_unlock_irq(&ctx->lock);
2697                 return;
2698         }
2699
2700         /*
2701          * If the event is in error state, clear that first.
2702          *
2703          * That way, if we see the event in error state below, we know that it
2704          * has gone back into error state, as distinct from the task having
2705          * been scheduled away before the cross-call arrived.
2706          */
2707         if (event->state == PERF_EVENT_STATE_ERROR)
2708                 event->state = PERF_EVENT_STATE_OFF;
2709         raw_spin_unlock_irq(&ctx->lock);
2710
2711         event_function_call(event, __perf_event_enable, NULL);
2712 }
2713
/*
 * Locked wrapper around _perf_event_enable(): takes the event's context
 * lock for the duration of the call.  See perf_event_disable().
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);

	_perf_event_enable(event);
	perf_event_ctx_unlock(event, locked_ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);
2726
/* Argument bundle that perf_event_stop() passes to __perf_event_stop(). */
struct stop_event_data {
	/* event to stop */
	struct perf_event	*event;
	/* non-zero: start the event again right after stopping it */
	unsigned int		restart;
};
2731
2732 static int __perf_event_stop(void *info)
2733 {
2734         struct stop_event_data *sd = info;
2735         struct perf_event *event = sd->event;
2736
2737         /* if it's already INACTIVE, do nothing */
2738         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2739                 return 0;
2740
2741         /* matches smp_wmb() in event_sched_in() */
2742         smp_rmb();
2743
2744         /*
2745          * There is a window with interrupts enabled before we get here,
2746          * so we need to check again lest we try to stop another CPU's event.
2747          */
2748         if (READ_ONCE(event->oncpu) != smp_processor_id())
2749                 return -EAGAIN;
2750
2751         event->pmu->stop(event, PERF_EF_UPDATE);
2752
2753         /*
2754          * May race with the actual stop (through perf_pmu_output_stop()),
2755          * but it is only used for events with AUX ring buffer, and such
2756          * events will refuse to restart because of rb::aux_mmap_count==0,
2757          * see comments in perf_aux_output_begin().
2758          *
2759          * Since this is happening on an event-local CPU, no trace is lost
2760          * while restarting.
2761          */
2762         if (sd->restart)
2763                 event->pmu->start(event, 0);
2764
2765         return 0;
2766 }
2767
2768 static int perf_event_stop(struct perf_event *event, int restart)
2769 {
2770         struct stop_event_data sd = {
2771                 .event          = event,
2772                 .restart        = restart,
2773         };
2774         int ret = 0;
2775
2776         do {
2777                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2778                         return 0;
2779
2780                 /* matches smp_wmb() in event_sched_in() */
2781                 smp_rmb();
2782
2783                 /*
2784                  * We only want to restart ACTIVE events, so if the event goes
2785                  * inactive here (event->oncpu==-1), there's nothing more to do;
2786                  * fall through with ret==-ENXIO.
2787                  */
2788                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2789                                         __perf_event_stop, &sd);
2790         } while (ret == -EAGAIN);
2791
2792         return ret;
2793 }
2794
2795 /*
2796  * In order to contain the amount of racy and tricky in the address filter
2797  * configuration management, it is a two part process:
2798  *
2799  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2800  *      we update the addresses of corresponding vmas in
2801  *      event::addr_filters_offs array and bump the event::addr_filters_gen;
2802  * (p2) when an event is scheduled in (pmu::add), it calls
2803  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2804  *      if the generation has changed since the previous call.
2805  *
2806  * If (p1) happens while the event is active, we restart it to force (p2).
2807  *
2808  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2809  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2810  *     ioctl;
2811  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2812  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2813  *     for reading;
2814  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2815  *     of exec.
2816  */
2817 void perf_event_addr_filters_sync(struct perf_event *event)
2818 {
2819         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2820
2821         if (!has_addr_filter(event))
2822                 return;
2823
2824         raw_spin_lock(&ifh->lock);
2825         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2826                 event->pmu->addr_filters_sync(event);
2827                 event->hw.addr_filters_gen = event->addr_filters_gen;
2828         }
2829         raw_spin_unlock(&ifh->lock);
2830 }
2831 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2832
2833 static int _perf_event_refresh(struct perf_event *event, int refresh)
2834 {
2835         /*
2836          * not supported on inherited events
2837          */
2838         if (event->attr.inherit || !is_sampling_event(event))
2839                 return -EINVAL;
2840
2841         atomic_add(refresh, &event->event_limit);
2842         _perf_event_enable(event);
2843
2844         return 0;
2845 }
2846
/*
 * Locked wrapper around _perf_event_refresh(); returns its result.
 * See perf_event_disable().
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
	struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
	int err = _perf_event_refresh(event, refresh);

	perf_event_ctx_unlock(event, locked_ctx);

	return err;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
2862
2863 static int perf_event_modify_breakpoint(struct perf_event *bp,
2864                                          struct perf_event_attr *attr)
2865 {
2866         int err;
2867
2868         _perf_event_disable(bp);
2869
2870         err = modify_user_hw_breakpoint_check(bp, attr, true);
2871
2872         if (!bp->attr.disabled)
2873                 _perf_event_enable(bp);
2874
2875         return err;
2876 }
2877
2878 static int perf_event_modify_attr(struct perf_event *event,
2879                                   struct perf_event_attr *attr)
2880 {
2881         if (event->attr.type != attr->type)
2882                 return -EINVAL;
2883
2884         switch (event->attr.type) {
2885         case PERF_TYPE_BREAKPOINT:
2886                 return perf_event_modify_breakpoint(event, attr);
2887         default:
2888                 /* Place holder for future additions. */
2889                 return -EOPNOTSUPP;
2890         }
2891 }
2892
2893 static void ctx_sched_out(struct perf_event_context *ctx,
2894                           struct perf_cpu_context *cpuctx,
2895                           enum event_type_t event_type)
2896 {
2897         struct perf_event *event, *tmp;
2898         int is_active = ctx->is_active;
2899
2900         lockdep_assert_held(&ctx->lock);
2901
2902         if (likely(!ctx->nr_events)) {
2903                 /*
2904                  * See __perf_remove_from_context().
2905                  */
2906                 WARN_ON_ONCE(ctx->is_active);
2907                 if (ctx->task)
2908                         WARN_ON_ONCE(cpuctx->task_ctx);
2909                 return;
2910         }
2911
2912         ctx->is_active &= ~event_type;
2913         if (!(ctx->is_active & EVENT_ALL))
2914                 ctx->is_active = 0;
2915
2916         if (ctx->task) {
2917                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2918                 if (!ctx->is_active)
2919                         cpuctx->task_ctx = NULL;
2920         }
2921
2922         /*
2923          * Always update time if it was set; not only when it changes.
2924          * Otherwise we can 'forget' to update time for any but the last
2925          * context we sched out. For example:
2926          *
2927          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2928          *   ctx_sched_out(.event_type = EVENT_PINNED)
2929          *
2930          * would only update time for the pinned events.
2931          */
2932         if (is_active & EVENT_TIME) {
2933                 /* update (and stop) ctx time */
2934                 update_context_time(ctx);
2935                 update_cgrp_time_from_cpuctx(cpuctx);
2936         }
2937
2938         is_active ^= ctx->is_active; /* changed bits */
2939
2940         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2941                 return;
2942
2943         perf_pmu_disable(ctx->pmu);
2944         if (is_active & EVENT_PINNED) {
2945                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2946                         group_sched_out(event, cpuctx, ctx);
2947         }
2948
2949         if (is_active & EVENT_FLEXIBLE) {
2950                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2951                         group_sched_out(event, cpuctx, ctx);
2952         }
2953         perf_pmu_enable(ctx->pmu);
2954 }
2955
2956 /*
2957  * Test whether two contexts are equivalent, i.e. whether they have both been
2958  * cloned from the same version of the same context.
2959  *
2960  * Equivalence is measured using a generation number in the context that is
2961  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2962  * and list_del_event().
2963  */
2964 static int context_equiv(struct perf_event_context *ctx1,
2965                          struct perf_event_context *ctx2)
2966 {
2967         lockdep_assert_held(&ctx1->lock);
2968         lockdep_assert_held(&ctx2->lock);
2969
2970         /* Pinning disables the swap optimization */
2971         if (ctx1->pin_count || ctx2->pin_count)
2972                 return 0;
2973
2974         /* If ctx1 is the parent of ctx2 */
2975         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2976                 return 1;
2977
2978         /* If ctx2 is the parent of ctx1 */
2979         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2980                 return 1;
2981
2982         /*
2983          * If ctx1 and ctx2 have the same parent; we flatten the parent
2984          * hierarchy, see perf_event_init_context().
2985          */
2986         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2987                         ctx1->parent_gen == ctx2->parent_gen)
2988                 return 1;
2989
2990         /* Unmatched */
2991         return 0;
2992 }
2993
2994 static void __perf_event_sync_stat(struct perf_event *event,
2995                                      struct perf_event *next_event)
2996 {
2997         u64 value;
2998
2999         if (!event->attr.inherit_stat)
3000                 return;
3001
3002         /*
3003          * Update the event value, we cannot use perf_event_read()
3004          * because we're in the middle of a context switch and have IRQs
3005          * disabled, which upsets smp_call_function_single(), however
3006          * we know the event must be on the current CPU, therefore we
3007          * don't need to use it.
3008          */
3009         if (event->state == PERF_EVENT_STATE_ACTIVE)
3010                 event->pmu->read(event);
3011
3012         perf_event_update_time(event);
3013
3014         /*
3015          * In order to keep per-task stats reliable we need to flip the event
3016          * values when we flip the contexts.
3017          */
3018         value = local64_read(&next_event->count);
3019         value = local64_xchg(&event->count, value);
3020         local64_set(&next_event->count, value);
3021
3022         swap(event->total_time_enabled, next_event->total_time_enabled);
3023         swap(event->total_time_running, next_event->total_time_running);
3024
3025         /*
3026          * Since we swizzled the values, update the user visible data too.
3027          */
3028         perf_event_update_userpage(event);
3029         perf_event_update_userpage(next_event);
3030 }
3031
3032 static void perf_event_sync_stat(struct perf_event_context *ctx,
3033                                    struct perf_event_context *next_ctx)
3034 {
3035         struct perf_event *event, *next_event;
3036
3037         if (!ctx->nr_stat)
3038                 return;
3039
3040         update_context_time(ctx);
3041
3042         event = list_first_entry(&ctx->event_list,
3043                                    struct perf_event, event_entry);
3044
3045         next_event = list_first_entry(&next_ctx->event_list,
3046                                         struct perf_event, event_entry);
3047
3048         while (&event->event_entry != &ctx->event_list &&
3049                &next_event->event_entry != &next_ctx->event_list) {
3050
3051                 __perf_event_sync_stat(event, next_event);
3052
3053                 event = list_next_entry(event, event_entry);
3054                 next_event = list_next_entry(next_event, event_entry);
3055         }
3056 }
3057
3058 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3059                                          struct task_struct *next)
3060 {
3061         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3062         struct perf_event_context *next_ctx;
3063         struct perf_event_context *parent, *next_parent;
3064         struct perf_cpu_context *cpuctx;
3065         int do_switch = 1;
3066
3067         if (likely(!ctx))
3068                 return;
3069
3070         cpuctx = __get_cpu_context(ctx);
3071         if (!cpuctx->task_ctx)
3072                 return;
3073
3074         rcu_read_lock();
3075         next_ctx = next->perf_event_ctxp[ctxn];
3076         if (!next_ctx)
3077                 goto unlock;
3078
3079         parent = rcu_dereference(ctx->parent_ctx);
3080         next_parent = rcu_dereference(next_ctx->parent_ctx);
3081
3082         /* If neither context have a parent context; they cannot be clones. */
3083         if (!parent && !next_parent)
3084                 goto unlock;
3085
3086         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3087                 /*
3088                  * Looks like the two contexts are clones, so we might be
3089                  * able to optimize the context switch.  We lock both
3090                  * contexts and check that they are clones under the
3091                  * lock (including re-checking that neither has been
3092                  * uncloned in the meantime).  It doesn't matter which
3093                  * order we take the locks because no other cpu could
3094                  * be trying to lock both of these tasks.
3095                  */
3096                 raw_spin_lock(&ctx->lock);
3097                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3098                 if (context_equiv(ctx, next_ctx)) {
3099                         WRITE_ONCE(ctx->task, next);
3100                         WRITE_ONCE(next_ctx->task, task);
3101
3102                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3103
3104                         /*
3105                          * RCU_INIT_POINTER here is safe because we've not
3106                          * modified the ctx and the above modification of
3107                          * ctx->task and ctx->task_ctx_data are immaterial
3108                          * since those values are always verified under
3109                          * ctx->lock which we're now holding.
3110                          */
3111                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3112                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3113
3114                         do_switch = 0;
3115
3116                         perf_event_sync_stat(ctx, next_ctx);
3117                 }
3118                 raw_spin_unlock(&next_ctx->lock);
3119                 raw_spin_unlock(&ctx->lock);
3120         }
3121 unlock:
3122         rcu_read_unlock();
3123
3124         if (do_switch) {
3125                 raw_spin_lock(&ctx->lock);
3126                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3127                 raw_spin_unlock(&ctx->lock);
3128         }
3129 }
3130
3131 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3132
3133 void perf_sched_cb_dec(struct pmu *pmu)
3134 {
3135         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3136
3137         this_cpu_dec(perf_sched_cb_usages);
3138
3139         if (!--cpuctx->sched_cb_usage)
3140                 list_del(&cpuctx->sched_cb_entry);
3141 }
3142
3143
3144 void perf_sched_cb_inc(struct pmu *pmu)
3145 {
3146         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3147
3148         if (!cpuctx->sched_cb_usage++)
3149                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3150
3151         this_cpu_inc(perf_sched_cb_usages);
3152 }
3153
3154 /*
3155  * This function provides the context switch callback to the lower code
3156  * layer. It is invoked ONLY when the context switch callback is enabled.
3157  *
3158  * This callback is relevant even to per-cpu events; for example multi event
3159  * PEBS requires this to provide PID/TID information. This requires we flush
3160  * all queued PEBS records before we context switch to a new task.
3161  */
3162 static void perf_pmu_sched_task(struct task_struct *prev,
3163                                 struct task_struct *next,
3164                                 bool sched_in)
3165 {
3166         struct perf_cpu_context *cpuctx;
3167         struct pmu *pmu;
3168
3169         if (prev == next)
3170                 return;
3171
3172         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3173                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3174
3175                 if (WARN_ON_ONCE(!pmu->sched_task))
3176                         continue;
3177
3178                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3179                 perf_pmu_disable(pmu);
3180
3181                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3182
3183                 perf_pmu_enable(pmu);
3184                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3185         }
3186 }
3187
3188 static void perf_event_switch(struct task_struct *task,
3189                               struct task_struct *next_prev, bool sched_in);
3190
3191 #define for_each_task_context_nr(ctxn)                                  \
3192         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3193
3194 /*
3195  * Called from scheduler to remove the events of the current task,
3196  * with interrupts disabled.
3197  *
3198  * We stop each event and update the event value in event->count.
3199  *
3200  * This does not protect us against NMI, but disable()
3201  * sets the disabled bit in the control field of event _before_
3202  * accessing the event control register. If a NMI hits, then it will
3203  * not restart the event.
3204  */
3205 void __perf_event_task_sched_out(struct task_struct *task,
3206                                  struct task_struct *next)
3207 {
3208         int ctxn;
3209
3210         if (__this_cpu_read(perf_sched_cb_usages))
3211                 perf_pmu_sched_task(task, next, false);
3212
3213         if (atomic_read(&nr_switch_events))
3214                 perf_event_switch(task, next, false);
3215
3216         for_each_task_context_nr(ctxn)
3217                 perf_event_context_sched_out(task, ctxn, next);
3218
3219         /*
3220          * if cgroup events exist on this CPU, then we need
3221          * to check if we have to switch out PMU state.
3222          * cgroup event are system-wide mode only
3223          */
3224         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3225                 perf_cgroup_sched_out(task, next);
3226 }
3227
3228 /*
3229  * Called with IRQs disabled
3230  */
3231 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3232                               enum event_type_t event_type)
3233 {
3234         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3235 }
3236
3237 static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3238                               int (*func)(struct perf_event *, void *), void *data)
3239 {
3240         struct perf_event **evt, *evt1, *evt2;
3241         int ret;
3242
3243         evt1 = perf_event_groups_first(groups, -1);
3244         evt2 = perf_event_groups_first(groups, cpu);
3245
3246         while (evt1 || evt2) {
3247                 if (evt1 && evt2) {
3248                         if (evt1->group_index < evt2->group_index)
3249                                 evt = &evt1;
3250                         else
3251                                 evt = &evt2;
3252                 } else if (evt1) {
3253                         evt = &evt1;
3254                 } else {
3255                         evt = &evt2;
3256                 }
3257
3258                 ret = func(*evt, data);
3259                 if (ret)
3260                         return ret;
3261
3262                 *evt = perf_event_groups_next(*evt);
3263         }
3264
3265         return 0;
3266 }
3267
/* Cookie passed through visit_groups_merge() to the *_sched_in() callbacks. */
struct sched_in_data {
	/* context whose groups are being scheduled in */
	struct perf_event_context *ctx;
	/* cpu context handed to group_can_go_on() and group_sched_in() */
	struct perf_cpu_context *cpuctx;
	/* passed to group_can_go_on(); flexible_sched_in() clears it on failure */
	int can_add_hw;
};
3273
3274 static int pinned_sched_in(struct perf_event *event, void *data)
3275 {
3276         struct sched_in_data *sid = data;
3277
3278         if (event->state <= PERF_EVENT_STATE_OFF)
3279                 return 0;
3280
3281         if (!event_filter_match(event))
3282                 return 0;
3283
3284         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3285                 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3286                         list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3287         }
3288
3289         /*
3290          * If this pinned group hasn't been scheduled,
3291          * put it in error state.
3292          */
3293         if (event->state == PERF_EVENT_STATE_INACTIVE)
3294                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3295
3296         return 0;
3297 }
3298
3299 static int flexible_sched_in(struct perf_event *event, void *data)
3300 {
3301         struct sched_in_data *sid = data;
3302
3303         if (event->state <= PERF_EVENT_STATE_OFF)
3304                 return 0;
3305
3306         if (!event_filter_match(event))
3307                 return 0;
3308
3309         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3310                 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3311                         list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3312                 else
3313                         sid->can_add_hw = 0;
3314         }
3315
3316         return 0;
3317 }
3318
3319 static void
3320 ctx_pinned_sched_in(struct perf_event_context *ctx,
3321                     struct perf_cpu_context *cpuctx)
3322 {
3323         struct sched_in_data sid = {
3324                 .ctx = ctx,
3325                 .cpuctx = cpuctx,
3326                 .can_add_hw = 1,
3327         };
3328
3329         visit_groups_merge(&ctx->pinned_groups,
3330                            smp_processor_id(),
3331                            pinned_sched_in, &sid);
3332 }
3333
3334 static void
3335 ctx_flexible_sched_in(struct perf_event_context *ctx,
3336                       struct perf_cpu_context *cpuctx)
3337 {
3338         struct sched_in_data sid = {
3339                 .ctx = ctx,
3340                 .cpuctx = cpuctx,
3341                 .can_add_hw = 1,
3342         };
3343
3344         visit_groups_merge(&ctx->flexible_groups,
3345                            smp_processor_id(),
3346                            flexible_sched_in, &sid);
3347 }
3348
3349 static void
3350 ctx_sched_in(struct perf_event_context *ctx,
3351              struct perf_cpu_context *cpuctx,
3352              enum event_type_t event_type,
3353              struct task_struct *task)
3354 {
3355         int is_active = ctx->is_active;
3356         u64 now;
3357
3358         lockdep_assert_held(&ctx->lock);
3359
3360         if (likely(!ctx->nr_events))
3361                 return;
3362
3363         ctx->is_active |= (event_type | EVENT_TIME);
3364         if (ctx->task) {
3365                 if (!is_active)
3366                         cpuctx->task_ctx = ctx;
3367                 else
3368                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3369         }
3370
3371         is_active ^= ctx->is_active; /* changed bits */
3372
3373         if (is_active & EVENT_TIME) {
3374                 /* start ctx time */
3375                 now = perf_clock();
3376                 ctx->timestamp = now;
3377                 perf_cgroup_set_timestamp(task, ctx);
3378         }
3379
3380         /*
3381          * First go through the list and put on any pinned groups
3382          * in order to give them the best chance of going on.
3383          */
3384         if (is_active & EVENT_PINNED)
3385                 ctx_pinned_sched_in(ctx, cpuctx);
3386
3387         /* Then walk through the lower prio flexible groups */
3388         if (is_active & EVENT_FLEXIBLE)
3389                 ctx_flexible_sched_in(ctx, cpuctx);
3390 }
3391
3392 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3393                              enum event_type_t event_type,
3394                              struct task_struct *task)
3395 {
3396         struct perf_event_context *ctx = &cpuctx->ctx;
3397
3398         ctx_sched_in(ctx, cpuctx, event_type, task);
3399 }
3400
3401 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3402                                         struct task_struct *task)
3403 {
3404         struct perf_cpu_context *cpuctx;
3405
3406         cpuctx = __get_cpu_context(ctx);
3407         if (cpuctx->task_ctx == ctx)
3408                 return;
3409
3410         perf_ctx_lock(cpuctx, ctx);
3411         /*
3412          * We must check ctx->nr_events while holding ctx->lock, such
3413          * that we serialize against perf_install_in_context().
3414          */
3415         if (!ctx->nr_events)
3416                 goto unlock;
3417
3418         perf_pmu_disable(ctx->pmu);
3419         /*
3420          * We want to keep the following priority order:
3421          * cpu pinned (that don't need to move), task pinned,
3422          * cpu flexible, task flexible.
3423          *
3424          * However, if task's ctx is not carrying any pinned
3425          * events, no need to flip the cpuctx's events around.
3426          */
3427         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3428                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3429         perf_event_sched_in(cpuctx, ctx, task);
3430         perf_pmu_enable(ctx->pmu);
3431
3432 unlock:
3433         perf_ctx_unlock(cpuctx, ctx);
3434 }
3435
3436 /*
3437  * Called from scheduler to add the events of the current task
3438  * with interrupts disabled.
3439  *
3440  * We restore the event value and then enable it.
3441  *
3442  * This does not protect us against NMI, but enable()
3443  * sets the enabled bit in the control field of event _before_
3444  * accessing the event control register. If a NMI hits, then it will
3445  * keep the event running.
3446  */
3447 void __perf_event_task_sched_in(struct task_struct *prev,
3448                                 struct task_struct *task)
3449 {
3450         struct perf_event_context *ctx;
3451         int ctxn;
3452
3453         /*
3454          * If cgroup events exist on this CPU, then we need to check if we have
3455          * to switch in PMU state; cgroup event are system-wide mode only.
3456          *
3457          * Since cgroup events are CPU events, we must schedule these in before
3458          * we schedule in the task events.
3459          */
3460         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3461                 perf_cgroup_sched_in(prev, task);
3462
3463         for_each_task_context_nr(ctxn) {
3464                 ctx = task->perf_event_ctxp[ctxn];
3465                 if (likely(!ctx))
3466                         continue;
3467
3468                 perf_event_context_sched_in(ctx, task);
3469         }
3470
3471         if (atomic_read(&nr_switch_events))
3472                 perf_event_switch(task, prev, true);
3473
3474         if (__this_cpu_read(perf_sched_cb_usages))
3475                 perf_pmu_sched_task(prev, task, true);
3476 }
3477
3478 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3479 {
3480         u64 frequency = event->attr.sample_freq;
3481         u64 sec = NSEC_PER_SEC;
3482         u64 divisor, dividend;
3483
3484         int count_fls, nsec_fls, frequency_fls, sec_fls;
3485
3486         count_fls = fls64(count);
3487         nsec_fls = fls64(nsec);
3488         frequency_fls = fls64(frequency);
3489         sec_fls = 30;
3490
3491         /*
3492          * We got @count in @nsec, with a target of sample_freq HZ
3493          * the target period becomes:
3494          *
3495          *             @count * 10^9
3496          * period = -------------------
3497          *          @nsec * sample_freq
3498          *
3499          */
3500
3501         /*
3502          * Reduce accuracy by one bit such that @a and @b converge
3503          * to a similar magnitude.
3504          */
3505 #define REDUCE_FLS(a, b)                \
3506 do {                                    \
3507         if (a##_fls > b##_fls) {        \
3508                 a >>= 1;                \
3509                 a##_fls--;              \
3510         } else {                        \
3511                 b >>= 1;                \
3512                 b##_fls--;              \
3513         }                               \
3514 } while (0)
3515
3516         /*
3517          * Reduce accuracy until either term fits in a u64, then proceed with
3518          * the other, so that finally we can do a u64/u64 division.
3519          */
3520         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3521                 REDUCE_FLS(nsec, frequency);
3522                 REDUCE_FLS(sec, count);
3523         }
3524
3525         if (count_fls + sec_fls > 64) {
3526                 divisor = nsec * frequency;
3527
3528                 while (count_fls + sec_fls > 64) {
3529                         REDUCE_FLS(count, sec);
3530                         divisor >>= 1;
3531                 }
3532
3533                 dividend = count * sec;
3534         } else {
3535                 dividend = count * sec;
3536
3537                 while (nsec_fls + frequency_fls > 64) {
3538                         REDUCE_FLS(nsec, frequency);
3539                         dividend >>= 1;
3540                 }
3541
3542                 divisor = nsec * frequency;
3543         }
3544
3545         if (!divisor)
3546                 return dividend;
3547
3548         return div64_u64(dividend, divisor);
3549 }
3550
3551 static DEFINE_PER_CPU(int, perf_throttled_count);
3552 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3553
3554 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3555 {
3556         struct hw_perf_event *hwc = &event->hw;
3557         s64 period, sample_period;
3558         s64 delta;
3559
3560         period = perf_calculate_period(event, nsec, count);
3561
3562         delta = (s64)(period - hwc->sample_period);
3563         delta = (delta + 7) / 8; /* low pass filter */
3564
3565         sample_period = hwc->sample_period + delta;
3566
3567         if (!sample_period)
3568                 sample_period = 1;
3569
3570         hwc->sample_period = sample_period;
3571
3572         if (local64_read(&hwc->period_left) > 8*sample_period) {
3573                 if (disable)
3574                         event->pmu->stop(event, PERF_EF_UPDATE);
3575
3576                 local64_set(&hwc->period_left, 0);
3577
3578                 if (disable)
3579                         event->pmu->start(event, PERF_EF_RELOAD);
3580         }
3581 }
3582
3583 /*
3584  * combine freq adjustment with unthrottling to avoid two passes over the
3585  * events. At the same time, make sure, having freq events does not change
3586  * the rate of unthrottling as that would introduce bias.
3587  */
3588 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3589                                            int needs_unthr)
3590 {
3591         struct perf_event *event;
3592         struct hw_perf_event *hwc;
3593         u64 now, period = TICK_NSEC;
3594         s64 delta;
3595
3596         /*
3597          * only need to iterate over all events iff:
3598          * - context have events in frequency mode (needs freq adjust)
3599          * - there are events to unthrottle on this cpu
3600          */
3601         if (!(ctx->nr_freq || needs_unthr))
3602                 return;
3603
3604         raw_spin_lock(&ctx->lock);
3605         perf_pmu_disable(ctx->pmu);
3606
3607         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3608                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3609                         continue;
3610
3611                 if (!event_filter_match(event))
3612                         continue;
3613
3614                 perf_pmu_disable(event->pmu);
3615
3616                 hwc = &event->hw;
3617
3618                 if (hwc->interrupts == MAX_INTERRUPTS) {
3619                         hwc->interrupts = 0;
3620                         perf_log_throttle(event, 1);
3621                         event->pmu->start(event, 0);
3622                 }
3623
3624                 if (!event->attr.freq || !event->attr.sample_freq)
3625                         goto next;
3626
3627                 /*
3628                  * stop the event and update event->count
3629                  */
3630                 event->pmu->stop(event, PERF_EF_UPDATE);
3631
3632                 now = local64_read(&event->count);
3633                 delta = now - hwc->freq_count_stamp;
3634                 hwc->freq_count_stamp = now;
3635
3636                 /*
3637                  * restart the event
3638                  * reload only if value has changed
3639                  * we have stopped the event so tell that
3640                  * to perf_adjust_period() to avoid stopping it
3641                  * twice.
3642                  */
3643                 if (delta > 0)
3644                         perf_adjust_period(event, period, delta, false);
3645
3646                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3647         next:
3648                 perf_pmu_enable(event->pmu);
3649         }
3650
3651         perf_pmu_enable(ctx->pmu);
3652         raw_spin_unlock(&ctx->lock);
3653 }
3654
3655 /*
3656  * Move @event to the tail of the @ctx's elegible events.
3657  */
3658 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3659 {
3660         /*
3661          * Rotate the first entry last of non-pinned groups. Rotation might be
3662          * disabled by the inheritance code.
3663          */
3664         if (ctx->rotate_disable)
3665                 return;
3666
3667         perf_event_groups_delete(&ctx->flexible_groups, event);
3668         perf_event_groups_insert(&ctx->flexible_groups, event);
3669 }
3670
3671 static inline struct perf_event *
3672 ctx_first_active(struct perf_event_context *ctx)
3673 {
3674         return list_first_entry_or_null(&ctx->flexible_active,
3675                                         struct perf_event, active_list);
3676 }
3677
3678 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3679 {
3680         struct perf_event *cpu_event = NULL, *task_event = NULL;
3681         bool cpu_rotate = false, task_rotate = false;
3682         struct perf_event_context *ctx = NULL;
3683
3684         /*
3685          * Since we run this from IRQ context, nobody can install new
3686          * events, thus the event count values are stable.
3687          */
3688
3689         if (cpuctx->ctx.nr_events) {
3690                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3691                         cpu_rotate = true;
3692         }
3693
3694         ctx = cpuctx->task_ctx;
3695         if (ctx && ctx->nr_events) {
3696                 if (ctx->nr_events != ctx->nr_active)
3697                         task_rotate = true;
3698         }
3699
3700         if (!(cpu_rotate || task_rotate))
3701                 return false;
3702
3703         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3704         perf_pmu_disable(cpuctx->ctx.pmu);
3705
3706         if (task_rotate)
3707                 task_event = ctx_first_active(ctx);
3708         if (cpu_rotate)
3709                 cpu_event = ctx_first_active(&cpuctx->ctx);
3710
3711         /*
3712          * As per the order given at ctx_resched() first 'pop' task flexible
3713          * and then, if needed CPU flexible.
3714          */
3715         if (task_event || (ctx && cpu_event))
3716                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3717         if (cpu_event)
3718                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3719
3720         if (task_event)
3721                 rotate_ctx(ctx, task_event);
3722         if (cpu_event)
3723                 rotate_ctx(&cpuctx->ctx, cpu_event);
3724
3725         perf_event_sched_in(cpuctx, ctx, current);
3726
3727         perf_pmu_enable(cpuctx->ctx.pmu);
3728         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3729
3730         return true;
3731 }
3732
3733 void perf_event_task_tick(void)
3734 {
3735         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3736         struct perf_event_context *ctx, *tmp;
3737         int throttled;
3738
3739         lockdep_assert_irqs_disabled();
3740
3741         __this_cpu_inc(perf_throttled_seq);
3742         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3743         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3744
3745         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3746                 perf_adjust_freq_unthr_context(ctx, throttled);
3747 }
3748
3749 static int event_enable_on_exec(struct perf_event *event,
3750                                 struct perf_event_context *ctx)
3751 {
3752         if (!event->attr.enable_on_exec)
3753                 return 0;
3754
3755         event->attr.enable_on_exec = 0;
3756         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3757                 return 0;
3758
3759         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3760
3761         return 1;
3762 }
3763
3764 /*
3765  * Enable all of a task's events that have been marked enable-on-exec.
3766  * This expects task == current.
3767  */
3768 static void perf_event_enable_on_exec(int ctxn)
3769 {
3770         struct perf_event_context *ctx, *clone_ctx = NULL;
3771         enum event_type_t event_type = 0;
3772         struct perf_cpu_context *cpuctx;
3773         struct perf_event *event;
3774         unsigned long flags;
3775         int enabled = 0;
3776
3777         local_irq_save(flags);
3778         ctx = current->perf_event_ctxp[ctxn];
3779         if (!ctx || !ctx->nr_events)
3780                 goto out;
3781
3782         cpuctx = __get_cpu_context(ctx);
3783         perf_ctx_lock(cpuctx, ctx);
3784         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3785         list_for_each_entry(event, &ctx->event_list, event_entry) {
3786                 enabled |= event_enable_on_exec(event, ctx);
3787                 event_type |= get_event_type(event);
3788         }
3789
3790         /*
3791          * Unclone and reschedule this context if we enabled any event.
3792          */
3793         if (enabled) {
3794                 clone_ctx = unclone_ctx(ctx);
3795                 ctx_resched(cpuctx, ctx, event_type);
3796         } else {
3797                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3798         }
3799         perf_ctx_unlock(cpuctx, ctx);
3800
3801 out:
3802         local_irq_restore(flags);
3803
3804         if (clone_ctx)
3805                 put_ctx(clone_ctx);
3806 }
3807
/*
 * Argument block that perf_event_read() hands to __perf_event_read() via
 * smp_call_function_single().
 */
3808 struct perf_read_data {
3809         struct perf_event *event;	/* event to read */
3810         bool group;			/* also update sibling times and read ACTIVE siblings */
3811         int ret;			/* 0 for a single read, pmu->commit_txn() result for a group read */
3812 };
3813
3814 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3815 {
3816         u16 local_pkg, event_pkg;
3817
3818         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3819                 int local_cpu = smp_processor_id();
3820
3821                 event_pkg = topology_physical_package_id(event_cpu);
3822                 local_pkg = topology_physical_package_id(local_cpu);
3823
3824                 if (event_pkg == local_pkg)
3825                         return local_cpu;
3826         }
3827
3828         return event_cpu;
3829 }
3830
3831 /*
3832  * Cross CPU call to read the hardware event
3833  */
3834 static void __perf_event_read(void *info)
3835 {
3836         struct perf_read_data *data = info;
3837         struct perf_event *sub, *event = data->event;
3838         struct perf_event_context *ctx = event->ctx;
3839         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3840         struct pmu *pmu = event->pmu;
3841
3842         /*
3843          * If this is a task context, we need to check whether it is
3844          * the current task context of this cpu.  If not it has been
3845          * scheduled out before the smp call arrived.  In that case
3846          * event->count would have been updated to a recent sample
3847          * when the event was scheduled out.
3848          */
3849         if (ctx->task && cpuctx->task_ctx != ctx)
3850                 return;
3851
3852         raw_spin_lock(&ctx->lock);
3853         if (ctx->is_active & EVENT_TIME) {
3854                 update_context_time(ctx);
3855                 update_cgrp_time_from_event(event);
3856         }
3857
3858         perf_event_update_time(event);
3859         if (data->group)
3860                 perf_event_update_sibling_time(event);
3861
3862         if (event->state != PERF_EVENT_STATE_ACTIVE)
3863                 goto unlock;
3864
3865         if (!data->group) {
3866                 pmu->read(event);
3867                 data->ret = 0;
3868                 goto unlock;
3869         }
3870
3871         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3872
3873         pmu->read(event);
3874
3875         for_each_sibling_event(sub, event) {
3876                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3877                         /*
3878                          * Use sibling's PMU rather than @event's since
3879                          * sibling could be on different (eg: software) PMU.
3880                          */
3881                         sub->pmu->read(sub);
3882                 }
3883         }
3884
3885         data->ret = pmu->commit_txn(pmu);
3886
3887 unlock:
3888         raw_spin_unlock(&ctx->lock);
3889 }
3890
3891 static inline u64 perf_event_count(struct perf_event *event)
3892 {
3893         return local64_read(&event->count) + atomic64_read(&event->child_count);
3894 }
3895
3896 /*
3897  * NMI-safe method to read a local event, that is an event that
3898  * is:
3899  *   - either for the current task, or for this CPU
3900  *   - does not have inherit set, for inherited task events
3901  *     will not be local and we cannot read them atomically
3902  *   - must not have a pmu::count method
3903  */
3904 int perf_event_read_local(struct perf_event *event, u64 *value,
3905                           u64 *enabled, u64 *running)
3906 {
3907         unsigned long flags;
3908         int ret = 0;
3909
3910         /*
3911          * Disabling interrupts avoids all counter scheduling (context
3912          * switches, timer based rotation and IPIs).
3913          */
3914         local_irq_save(flags);
3915
3916         /*
3917          * It must not be an event with inherit set, we cannot read
3918          * all child counters from atomic context.
3919          */
3920         if (event->attr.inherit) {
3921                 ret = -EOPNOTSUPP;
3922                 goto out;
3923         }
3924
3925         /* If this is a per-task event, it must be for current */
3926         if ((event->attach_state & PERF_ATTACH_TASK) &&
3927             event->hw.target != current) {
3928                 ret = -EINVAL;
3929                 goto out;
3930         }
3931
3932         /* If this is a per-CPU event, it must be for this CPU */
3933         if (!(event->attach_state & PERF_ATTACH_TASK) &&
3934             event->cpu != smp_processor_id()) {
3935                 ret = -EINVAL;
3936                 goto out;
3937         }
3938
3939         /* If this is a pinned event it must be running on this CPU */
3940         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3941                 ret = -EBUSY;
3942                 goto out;
3943         }
3944
3945         /*
3946          * If the event is currently on this CPU, its either a per-task event,
3947          * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3948          * oncpu == -1).
3949          */
3950         if (event->oncpu == smp_processor_id())
3951                 event->pmu->read(event);
3952
3953         *value = local64_read(&event->count);
3954         if (enabled || running) {
3955                 u64 now = event->shadow_ctx_time + perf_clock();
3956                 u64 __enabled, __running;
3957
3958                 __perf_update_times(event, now, &__enabled, &__running);
3959                 if (enabled)
3960                         *enabled = __enabled;
3961                 if (running)
3962                         *running = __running;
3963         }
3964 out:
3965         local_irq_restore(flags);
3966
3967         return ret;
3968 }
3969
3970 static int perf_event_read(struct perf_event *event, bool group)
3971 {
3972         enum perf_event_state state = READ_ONCE(event->state);
3973         int event_cpu, ret = 0;
3974
3975         /*
3976          * If event is enabled and currently active on a CPU, update the
3977          * value in the event structure:
3978          */
3979 again:
3980         if (state == PERF_EVENT_STATE_ACTIVE) {
3981                 struct perf_read_data data;
3982
3983                 /*
3984                  * Orders the ->state and ->oncpu loads such that if we see
3985                  * ACTIVE we must also see the right ->oncpu.
3986                  *
3987                  * Matches the smp_wmb() from event_sched_in().
3988                  */
3989                 smp_rmb();
3990
3991                 event_cpu = READ_ONCE(event->oncpu);
3992                 if ((unsigned)event_cpu >= nr_cpu_ids)
3993                         return 0;
3994
3995                 data = (struct perf_read_data){
3996                         .event = event,
3997                         .group = group,
3998                         .ret = 0,
3999                 };
4000
4001                 preempt_disable();
4002                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4003
4004                 /*
4005                  * Purposely ignore the smp_call_function_single() return
4006                  * value.
4007                  *
4008                  * If event_cpu isn't a valid CPU it means the event got
4009                  * scheduled out and that will have updated the event count.
4010                  *
4011                  * Therefore, either way, we'll have an up-to-date event count
4012                  * after this.
4013                  */
4014                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4015                 preempt_enable();
4016                 ret = data.ret;
4017
4018         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4019                 struct perf_event_context *ctx = event->ctx;
4020                 unsigned long flags;
4021
4022                 raw_spin_lock_irqsave(&ctx->lock, flags);
4023                 state = event->state;
4024                 if (state != PERF_EVENT_STATE_INACTIVE) {
4025                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4026                         goto again;
4027                 }
4028
4029                 /*
4030                  * May read while context is not active (e.g., thread is
4031                  * blocked), in that case we cannot update context time
4032                  */
4033                 if (ctx->is_active & EVENT_TIME) {
4034                         update_context_time(ctx);
4035                         update_cgrp_time_from_event(event);
4036                 }
4037
4038                 perf_event_update_time(event);
4039                 if (group)
4040                         perf_event_update_sibling_time(event);
4041                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4042         }
4043
4044         return ret;
4045 }
4046
4047 /*
4048  * Initialize the perf_event context in a task_struct:
4049  */
4050 static void __perf_event_init_context(struct perf_event_context *ctx)
4051 {
4052         raw_spin_lock_init(&ctx->lock);
4053         mutex_init(&ctx->mutex);
4054         INIT_LIST_HEAD(&ctx->active_ctx_list);
4055         perf_event_groups_init(&ctx->pinned_groups);
4056         perf_event_groups_init(&ctx->flexible_groups);
4057         INIT_LIST_HEAD(&ctx->event_list);
4058         INIT_LIST_HEAD(&ctx->pinned_active);
4059         INIT_LIST_HEAD(&ctx->flexible_active);
4060         refcount_set(&ctx->refcount, 1);
4061 }
4062
4063 static struct perf_event_context *
4064 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4065 {
4066         struct perf_event_context *ctx;
4067
4068         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4069         if (!ctx)
4070                 return NULL;
4071
4072         __perf_event_init_context(ctx);
4073         if (task) {
4074                 ctx->task = task;
4075                 get_task_struct(task);
4076         }
4077         ctx->pmu = pmu;
4078
4079         return ctx;
4080 }
4081
4082 static struct task_struct *
4083 find_lively_task_by_vpid(pid_t vpid)
4084 {
4085         struct task_struct *task;
4086
4087         rcu_read_lock();
4088         if (!vpid)
4089                 task = current;
4090         else
4091                 task = find_task_by_vpid(vpid);
4092         if (task)
4093                 get_task_struct(task);
4094         rcu_read_unlock();
4095
4096         if (!task)
4097                 return ERR_PTR(-ESRCH);
4098
4099         return task;
4100 }
4101
4102 /*
4103  * Returns a matching context with refcount and pincount.
4104  */
4105 static struct perf_event_context *
4106 find_get_context(struct pmu *pmu, struct task_struct *task,
4107                 struct perf_event *event)
4108 {
4109         struct perf_event_context *ctx, *clone_ctx = NULL;
4110         struct perf_cpu_context *cpuctx;
4111         void *task_ctx_data = NULL;
4112         unsigned long flags;
4113         int ctxn, err;
4114         int cpu = event->cpu;
4115
4116         if (!task) {
4117                 /* Must be root to operate on a CPU event: */
4118                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4119                         return ERR_PTR(-EACCES);
4120
4121                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4122                 ctx = &cpuctx->ctx;
4123                 get_ctx(ctx);
4124                 ++ctx->pin_count;
4125
4126                 return ctx;
4127         }
4128
4129         err = -EINVAL;
4130         ctxn = pmu->task_ctx_nr;
4131         if (ctxn < 0)
4132                 goto errout;
4133
4134         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4135                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4136                 if (!task_ctx_data) {
4137                         err = -ENOMEM;
4138                         goto errout;
4139                 }
4140         }
4141
4142 retry:
4143         ctx = perf_lock_task_context(task, ctxn, &flags);
4144         if (ctx) {
4145                 clone_ctx = unclone_ctx(ctx);
4146                 ++ctx->pin_count;
4147
4148                 if (task_ctx_data && !ctx->task_ctx_data) {
4149                         ctx->task_ctx_data = task_ctx_data;
4150                         task_ctx_data = NULL;
4151                 }
4152                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4153
4154                 if (clone_ctx)
4155                         put_ctx(clone_ctx);
4156         } else {
4157                 ctx = alloc_perf_context(pmu, task);
4158                 err = -ENOMEM;
4159                 if (!ctx)
4160                         goto errout;
4161
4162                 if (task_ctx_data) {
4163                         ctx->task_ctx_data = task_ctx_data;
4164                         task_ctx_data = NULL;
4165                 }
4166
4167                 err = 0;
4168                 mutex_lock(&task->perf_event_mutex);
4169                 /*
4170                  * If it has already passed perf_event_exit_task().
4171                  * we must see PF_EXITING, it takes this mutex too.
4172                  */
4173                 if (task->flags & PF_EXITING)
4174                         err = -ESRCH;
4175                 else if (task->perf_event_ctxp[ctxn])
4176                         err = -EAGAIN;
4177                 else {
4178                         get_ctx(ctx);
4179                         ++ctx->pin_count;
4180                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4181                 }
4182                 mutex_unlock(&task->perf_event_mutex);
4183
4184                 if (unlikely(err)) {
4185                         put_ctx(ctx);
4186
4187                         if (err == -EAGAIN)
4188                                 goto retry;
4189                         goto errout;
4190                 }
4191         }
4192
4193         kfree(task_ctx_data);
4194         return ctx;
4195
4196 errout:
4197         kfree(task_ctx_data);
4198         return ERR_PTR(err);
4199 }
4200
4201 static void perf_event_free_filter(struct perf_event *event);
4202 static void perf_event_free_bpf_prog(struct perf_event *event);
4203
4204 static void free_event_rcu(struct rcu_head *head)
4205 {
4206         struct perf_event *event;
4207
4208         event = container_of(head, struct perf_event, rcu_head);
4209         if (event->ns)
4210                 put_pid_ns(event->ns);
4211         perf_event_free_filter(event);
4212         kfree(event);
4213 }
4214
4215 static void ring_buffer_attach(struct perf_event *event,
4216                                struct ring_buffer *rb);
4217
4218 static void detach_sb_event(struct perf_event *event)
4219 {
4220         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4221
4222         raw_spin_lock(&pel->lock);
4223         list_del_rcu(&event->sb_list);
4224         raw_spin_unlock(&pel->lock);
4225 }
4226
4227 static bool is_sb_event(struct perf_event *event)
4228 {
4229         struct perf_event_attr *attr = &event->attr;
4230
4231         if (event->parent)
4232                 return false;
4233
4234         if (event->attach_state & PERF_ATTACH_TASK)
4235                 return false;
4236
4237         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4238             attr->comm || attr->comm_exec ||
4239             attr->task || attr->ksymbol ||
4240             attr->context_switch)
4241                 return true;
4242         return false;
4243 }
4244
/* Take @event off the pmu_sb_events list if is_sb_event() says it is on it. */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}
4250
4251 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4252 {
4253         if (event->parent)
4254                 return;
4255
4256         if (is_cgroup_event(event))
4257                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4258 }
4259
#ifdef CONFIG_NO_HZ_FULL
/* Held around the nr_freq_events update and the tick dependency change. */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one frequency event from nr_freq_events and, when it was the last
 * one, clear the TICK_DEP_BIT_PERF_EVENTS tick dependency. Both steps happen
 * under nr_freq_lock. Compiles to nothing without CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        bool last;

        spin_lock(&nr_freq_lock);
        last = atomic_dec_and_test(&nr_freq_events);
        if (last)
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}
4273
4274 static void unaccount_freq_event(void)
4275 {
4276         if (tick_nohz_full_enabled())
4277                 unaccount_freq_event_nohz();
4278         else
4279                 atomic_dec(&nr_freq_events);
4280 }
4281
4282 static void unaccount_event(struct perf_event *event)
4283 {
4284         bool dec = false;
4285
4286         if (event->parent)
4287                 return;
4288
4289         if (event->attach_state & PERF_ATTACH_TASK)
4290                 dec = true;
4291         if (event->attr.mmap || event->attr.mmap_data)
4292                 atomic_dec(&nr_mmap_events);
4293         if (event->attr.comm)
4294                 atomic_dec(&nr_comm_events);
4295         if (event->attr.namespaces)
4296                 atomic_dec(&nr_namespaces_events);
4297         if (event->attr.task)
4298                 atomic_dec(&nr_task_events);
4299         if (event->attr.freq)
4300                 unaccount_freq_event();
4301         if (event->attr.context_switch) {
4302                 dec = true;
4303                 atomic_dec(&nr_switch_events);
4304         }
4305         if (is_cgroup_event(event))
4306                 dec = true;
4307         if (has_branch_stack(event))
4308                 dec = true;
4309         if (event->attr.ksymbol)
4310                 atomic_dec(&nr_ksymbol_events);
4311         if (event->attr.bpf_event)
4312                 atomic_dec(&nr_bpf_events);
4313
4314         if (dec) {
4315                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4316                         schedule_delayed_work(&perf_sched_work, HZ);
4317         }
4318
4319         unaccount_event_cpu(event, event->cpu);
4320
4321         unaccount_pmu_sb_event(event);
4322 }
4323
4324 static void perf_sched_delayed(struct work_struct *work)
4325 {
4326         mutex_lock(&perf_sched_mutex);
4327         if (atomic_dec_and_test(&perf_sched_count))
4328                 static_branch_disable(&perf_sched_events);
4329         mutex_unlock(&perf_sched_mutex);
4330 }
4331
4332 /*
4333  * The following implement mutual exclusion of events on "exclusive" pmus
4334  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4335  * at a time, so we disallow creating events that might conflict, namely:
4336  *
4337  *  1) cpu-wide events in the presence of per-task events,
4338  *  2) per-task events in the presence of cpu-wide events,
4339  *  3) two matching events on the same context.
4340  *
4341  * The former two cases are handled in the allocation path (perf_event_alloc(),
4342  * _free_event()), the latter -- before the first perf_install_in_context().
4343  */
4344 static int exclusive_event_init(struct perf_event *event)
4345 {
4346         struct pmu *pmu = event->pmu;
4347
4348         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4349                 return 0;
4350
4351         /*
4352          * Prevent co-existence of per-task and cpu-wide events on the
4353          * same exclusive pmu.
4354          *
4355          * Negative pmu::exclusive_cnt means there are cpu-wide
4356          * events on this "exclusive" pmu, positive means there are
4357          * per-task events.
4358          *
4359          * Since this is called in perf_event_alloc() path, event::ctx
4360          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4361          * to mean "per-task event", because unlike other attach states it
4362          * never gets cleared.
4363          */
4364         if (event->attach_state & PERF_ATTACH_TASK) {
4365                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4366                         return -EBUSY;
4367         } else {
4368                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4369                         return -EBUSY;
4370         }
4371
4372         return 0;
4373 }
4374
4375 static void exclusive_event_destroy(struct perf_event *event)
4376 {
4377         struct pmu *pmu = event->pmu;
4378
4379         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4380                 return;
4381
4382         /* see comment in exclusive_event_init() */
4383         if (event->attach_state & PERF_ATTACH_TASK)
4384                 atomic_dec(&pmu->exclusive_cnt);
4385         else
4386                 atomic_inc(&pmu->exclusive_cnt);
4387 }
4388
4389 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4390 {
4391         if ((e1->pmu == e2->pmu) &&
4392             (e1->cpu == e2->cpu ||
4393              e1->cpu == -1 ||
4394              e2->cpu == -1))
4395                 return true;
4396         return false;
4397 }
4398
4399 /* Called under the same ctx::mutex as perf_install_in_context() */
4400 static bool exclusive_event_installable(struct perf_event *event,
4401                                         struct perf_event_context *ctx)
4402 {
4403         struct perf_event *iter_event;
4404         struct pmu *pmu = event->pmu;
4405
4406         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4407                 return true;
4408
4409         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4410                 if (exclusive_event_match(iter_event, event))
4411                         return false;
4412         }
4413
4414         return true;
4415 }
4416
4417 static void perf_addr_filters_splice(struct perf_event *event,
4418                                        struct list_head *head);
4419
/*
 * Release everything @event holds and schedule the event itself for freeing.
 *
 * The callers in this file (free_event() and put_event()) get here only
 * after taking the refcount to zero. The struct is not freed directly; it is
 * handed to call_rcu() with free_event_rcu() as the callback.
 */
static void _free_event(struct perf_event *event)
{
        /* Synchronize against the irq_work embedded in the event first. */
        irq_work_sync(&event->pending);

        unaccount_event(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        /* Only events without a parent drop a callchain buffer reference. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }

        perf_event_free_bpf_prog(event);
        /* NOTE(review): a NULL head presumably just discards the filters -- confirm in perf_addr_filters_splice(). */
        perf_addr_filters_splice(event, NULL);
        kfree(event->addr_filters_offs);

        /* The destroy callback is optional. */
        if (event->destroy)
                event->destroy(event);

        /* Drop the context and target-task references when they are set. */
        if (event->ctx)
                put_ctx(event->ctx);

        if (event->hw.target)
                put_task_struct(event->hw.target);

        exclusive_event_destroy(event);
        module_put(event->pmu->module);

        /* The memory itself is released from the RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}
4464
4465 /*
4466  * Used to free events which have a known refcount of 1, such as in error paths
4467  * where the event isn't exposed yet and inherited events.
4468  */
4469 static void free_event(struct perf_event *event)
4470 {
4471         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4472                                 "unexpected event refcount: %ld; ptr=%p\n",
4473                                 atomic_long_read(&event->refcount), event)) {
4474                 /* leak to avoid use-after-free */
4475                 return;
4476         }
4477
4478         _free_event(event);
4479 }
4480
/*
 * Remove user event from the owner task.
 *
 * If event->owner is set, the event is unlinked from the owner's list
 * (event->owner_entry) under owner->perf_event_mutex and event->owner is
 * cleared. A task reference is taken under rcu_read_lock() and held across
 * the mutex section, then dropped again.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because it's the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Pairs with the get_task_struct() above. */
                put_task_struct(owner);
        }
}
4531
4532 static void put_event(struct perf_event *event)
4533 {
4534         if (!atomic_long_dec_and_test(&event->refcount))
4535                 return;
4536
4537         _free_event(event);
4538 }
4539
/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * The event is removed from its owner (unless it is a kernel event) and from
 * its context, marked PERF_EVENT_STATE_DEAD, and then every event on its
 * child_list is removed from its own context and freed. Finally the caller's
 * reference is dropped.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        /* Children are collected here and freed after child_mutex is dropped. */
        LIST_HEAD(free_list);

        /*
         * If we got here through err_file: fput(event_file); we will not have
         * attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, DETACH_GROUP);

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        event->state = PERF_EVENT_STATE_DEAD;
        raw_spin_unlock_irq(&ctx->lock);

        perf_event_ctx_unlock(event, ctx);

again:
        mutex_lock(&event->child_mutex);
        /*
         * Each pass handles at most the first list entry: the body always
         * ends in "goto again", so the walk restarts from the list head
         * until the list is empty.
         */
        list_for_each_entry(child, &event->child_list, child_list) {

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
                         */
                        put_event(event);
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        /* Free the collected children now that no mutex is held. */
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                list_del(&child->child_list);
                free_event(child);
        }

no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4647
4648 /*
4649  * Called when the last reference to the file is gone.
4650  */
4651 static int perf_release(struct inode *inode, struct file *file)
4652 {
4653         perf_event_release_kernel(file->private_data);
4654         return 0;
4655 }
4656
4657 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4658 {
4659         struct perf_event *child;
4660         u64 total = 0;
4661
4662         *enabled = 0;
4663         *running = 0;
4664
4665         mutex_lock(&event->child_mutex);
4666
4667         (void)perf_event_read(event, false);
4668         total += perf_event_count(event);
4669
4670         *enabled += event->total_time_enabled +
4671                         atomic64_read(&event->child_total_time_enabled);
4672         *running += event->total_time_running +
4673                         atomic64_read(&event->child_total_time_running);
4674
4675         list_for_each_entry(child, &event->child_list, child_list) {
4676                 (void)perf_event_read(child, false);
4677                 total += perf_event_count(child);
4678                 *enabled += child->total_time_enabled;
4679                 *running += child->total_time_running;
4680         }
4681         mutex_unlock(&event->child_mutex);
4682
4683         return total;
4684 }
4685
4686 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4687 {
4688         struct perf_event_context *ctx;
4689         u64 count;
4690
4691         ctx = perf_event_ctx_lock(event);
4692         count = __perf_event_read_value(event, enabled, running);
4693         perf_event_ctx_unlock(event, ctx);
4694
4695         return count;
4696 }
4697 EXPORT_SYMBOL_GPL(perf_event_read_value);
4698
4699 static int __perf_read_group_add(struct perf_event *leader,
4700                                         u64 read_format, u64 *values)
4701 {
4702         struct perf_event_context *ctx = leader->ctx;
4703         struct perf_event *sub;
4704         unsigned long flags;
4705         int n = 1; /* skip @nr */
4706         int ret;
4707
4708         ret = perf_event_read(leader, true);
4709         if (ret)
4710                 return ret;
4711
4712         raw_spin_lock_irqsave(&ctx->lock, flags);
4713
4714         /*
4715          * Since we co-schedule groups, {enabled,running} times of siblings
4716          * will be identical to those of the leader, so we only publish one
4717          * set.
4718          */
4719         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4720                 values[n++] += leader->total_time_enabled +
4721                         atomic64_read(&leader->child_total_time_enabled);
4722         }
4723
4724         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4725                 values[n++] += leader->total_time_running +
4726                         atomic64_read(&leader->child_total_time_running);
4727         }
4728
4729         /*
4730          * Write {count,id} tuples for every sibling.
4731          */
4732         values[n++] += perf_event_count(leader);
4733         if (read_format & PERF_FORMAT_ID)
4734                 values[n++] = primary_event_id(leader);
4735
4736         for_each_sibling_event(sub, leader) {
4737                 values[n++] += perf_event_count(sub);
4738                 if (read_format & PERF_FORMAT_ID)
4739                         values[n++] = primary_event_id(sub);
4740         }
4741
4742         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4743         return 0;
4744 }
4745
4746 static int perf_read_group(struct perf_event *event,
4747                                    u64 read_format, char __user *buf)
4748 {
4749         struct perf_event *leader = event->group_leader, *child;
4750         struct perf_event_context *ctx = leader->ctx;
4751         int ret;
4752         u64 *values;
4753
4754         lockdep_assert_held(&ctx->mutex);
4755
4756         values = kzalloc(event->read_size, GFP_KERNEL);
4757         if (!values)
4758                 return -ENOMEM;
4759
4760         values[0] = 1 + leader->nr_siblings;
4761
4762         /*
4763          * By locking the child_mutex of the leader we effectively
4764          * lock the child list of all siblings.. XXX explain how.
4765          */
4766         mutex_lock(&leader->child_mutex);
4767
4768         ret = __perf_read_group_add(leader, read_format, values);
4769         if (ret)
4770                 goto unlock;
4771
4772         list_for_each_entry(child, &leader->child_list, child_list) {
4773                 ret = __perf_read_group_add(child, read_format, values);
4774                 if (ret)
4775                         goto unlock;
4776         }
4777
4778         mutex_unlock(&leader->child_mutex);
4779
4780         ret = event->read_size;
4781         if (copy_to_user(buf, values, event->read_size))
4782                 ret = -EFAULT;
4783         goto out;
4784
4785 unlock:
4786         mutex_unlock(&leader->child_mutex);
4787 out:
4788         kfree(values);
4789         return ret;
4790 }
4791
4792 static int perf_read_one(struct perf_event *event,
4793                                  u64 read_format, char __user *buf)
4794 {
4795         u64 enabled, running;
4796         u64 values[4];
4797         int n = 0;
4798
4799         values[n++] = __perf_event_read_value(event, &enabled, &running);
4800         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4801                 values[n++] = enabled;
4802         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4803                 values[n++] = running;
4804         if (read_format & PERF_FORMAT_ID)
4805                 values[n++] = primary_event_id(event);
4806
4807         if (copy_to_user(buf, values, n * sizeof(u64)))
4808                 return -EFAULT;
4809
4810         return n * sizeof(u64);
4811 }
4812
4813 static bool is_event_hup(struct perf_event *event)
4814 {
4815         bool no_children;
4816
4817         if (event->state > PERF_EVENT_STATE_EXIT)
4818                 return false;
4819
4820         mutex_lock(&event->child_mutex);
4821         no_children = list_empty(&event->child_list);
4822         mutex_unlock(&event->child_mutex);
4823         return no_children;
4824 }
4825
4826 /*
4827  * Read the performance event - simple non blocking version for now
4828  */
4829 static ssize_t
4830 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4831 {
4832         u64 read_format = event->attr.read_format;
4833         int ret;
4834
4835         /*
4836          * Return end-of-file for a read on an event that is in
4837          * error state (i.e. because it was pinned but it couldn't be
4838          * scheduled on to the CPU at some point).
4839          */
4840         if (event->state == PERF_EVENT_STATE_ERROR)
4841                 return 0;
4842
4843         if (count < event->read_size)
4844                 return -ENOSPC;
4845
4846         WARN_ON_ONCE(event->ctx->parent_ctx);
4847         if (read_format & PERF_FORMAT_GROUP)
4848                 ret = perf_read_group(event, read_format, buf);
4849         else
4850                 ret = perf_read_one(event, read_format, buf);
4851
4852         return ret;
4853 }
4854
4855 static ssize_t
4856 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4857 {
4858         struct perf_event *event = file->private_data;
4859         struct perf_event_context *ctx;
4860         int ret;
4861
4862         ctx = perf_event_ctx_lock(event);
4863         ret = __perf_read(event, buf, count);
4864         perf_event_ctx_unlock(event, ctx);
4865
4866         return ret;
4867 }
4868
4869 static __poll_t perf_poll(struct file *file, poll_table *wait)
4870 {
4871         struct perf_event *event = file->private_data;
4872         struct ring_buffer *rb;
4873         __poll_t events = EPOLLHUP;
4874
4875         poll_wait(file, &event->waitq, wait);
4876
4877         if (is_event_hup(event))
4878                 return events;
4879
4880         /*
4881          * Pin the event->rb by taking event->mmap_mutex; otherwise
4882          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4883          */
4884         mutex_lock(&event->mmap_mutex);
4885         rb = event->rb;
4886         if (rb)
4887                 events = atomic_xchg(&rb->poll, 0);
4888         mutex_unlock(&event->mmap_mutex);
4889         return events;
4890 }
4891
/*
 * Reset the counter of @event: read the event once (the result is ignored),
 * set event->count to zero and update the user page.
 */
static void _perf_event_reset(struct perf_event *event)
{
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}
4898
4899 /*
4900  * Holding the top-level event's child_mutex means that any
4901  * descendant process that has inherited this event will block
4902  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4903  * task existence requirements of perf_event_enable/disable.
4904  */
4905 static void perf_event_for_each_child(struct perf_event *event,
4906                                         void (*func)(struct perf_event *))
4907 {
4908         struct perf_event *child;
4909
4910         WARN_ON_ONCE(event->ctx->parent_ctx);
4911
4912         mutex_lock(&event->child_mutex);
4913         func(event);
4914         list_for_each_entry(child, &event->child_list, child_list)
4915                 func(child);
4916         mutex_unlock(&event->child_mutex);
4917 }
4918
4919 static void perf_event_for_each(struct perf_event *event,
4920                                   void (*func)(struct perf_event *))
4921 {
4922         struct perf_event_context *ctx = event->ctx;
4923         struct perf_event *sibling;
4924
4925         lockdep_assert_held(&ctx->mutex);
4926
4927         event = event->group_leader;
4928
4929         perf_event_for_each_child(event, func);
4930         for_each_sibling_event(sibling, event)
4931                 perf_event_for_each_child(sibling, func);
4932 }
4933
/*
 * Callback passed to event_function_call() by perf_event_period(): install a
 * new sample period (or sample frequency for attr.freq events).
 *
 * @info points to the new u64 value. @cpuctx is unused here.
 *
 * An event that is currently active is stopped before hw.period_left is
 * cleared and restarted with PERF_EF_RELOAD afterwards, with the context's
 * PMU disabled in between.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        /* Frequency events only record the new rate in attr.sample_freq. */
        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(ctx->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Discard what was left of the previous period. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
}
4970
4971 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4972 {
4973         u64 value;
4974
4975         if (!is_sampling_event(event))
4976                 return -EINVAL;
4977
4978         if (copy_from_user(&value, arg, sizeof(value)))
4979                 return -EFAULT;
4980
4981         if (!value)
4982                 return -EINVAL;
4983
4984         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4985                 return -EINVAL;
4986
4987         event_function_call(event, __perf_event_period, &value);
4988
4989         return 0;
4990 }
4991
4992 static const struct file_operations perf_fops;
4993
4994 static inline int perf_fget_light(int fd, struct fd *p)
4995 {
4996         struct fd f = fdget(fd);
4997         if (!f.file)
4998                 return -EBADF;
4999
5000         if (f.file->f_op != &perf_fops) {
5001                 fdput(f);
5002                 return -EBADF;
5003         }
5004         *p = f;
5005         return 0;
5006 }
5007
5008 static int perf_event_set_output(struct perf_event *event,
5009                                  struct perf_event *output_event);
5010 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5011 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5012 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5013                           struct perf_event_attr *attr);
5014
/*
 * Dispatch one perf ioctl on @event.
 *
 * ENABLE, DISABLE and RESET only select a handler inside the switch; the
 * handler is applied after it, either to the whole group (when @arg has
 * PERF_IOC_FLAG_GROUP set) or to the event and its children. Every other
 * command returns directly from its case.
 *
 * Returns 0 or a command-specific result on success, a negative errno on
 * failure, and -ENOTTY for an unknown @cmd.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        /* For ENABLE/DISABLE/RESET, @arg carries PERF_IOC_FLAG_* bits. */
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
                return perf_event_period(event, (u64 __user *)arg);

        case PERF_EVENT_IOC_ID:
        {
                /* Copy the event's primary id out to the user pointer in @arg. */
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                int ret;
                /* @arg is a perf event fd, or -1 to pass a NULL output event. */
                if (arg != -1) {
                        struct perf_event *output_event;
                        struct fd output;
                        ret = perf_fget_light(arg, &output);
                        if (ret)
                                return ret;
                        output_event = output.file->private_data;
                        ret = perf_event_set_output(event, output_event);
                        fdput(output);
                } else {
                        ret = perf_event_set_output(event, NULL);
                }
                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
                return perf_event_set_bpf_prog(event, arg);

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct ring_buffer *rb;

                /* Needs a ring buffer with data pages; otherwise -EINVAL. */
                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        /* Only ENABLE, DISABLE and RESET reach this point. */
        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}
5108
5109 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5110 {
5111         struct perf_event *event = file->private_data;
5112         struct perf_event_context *ctx;
5113         long ret;
5114
5115         ctx = perf_event_ctx_lock(event);
5116         ret = _perf_ioctl(event, cmd, arg);
5117         perf_event_ctx_unlock(event, ctx);
5118
5119         return ret;
5120 }
5121
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point. For the commands whose argument is a user
 * pointer, the size encoded in @cmd is rewritten from the compat pointer
 * size to the native one before the call is forwarded to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        bool pointer_arg;

        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
                pointer_arg = true;
                break;
        default:
                pointer_arg = false;
                break;
        }

        /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
        if (pointer_arg && _IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(void *) << IOCSIZE_SHIFT;
        }

        return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif
5143
5144 int perf_event_task_enable(void)
5145 {
5146         struct perf_event_context *ctx;
5147         struct perf_event *event;
5148
5149         mutex_lock(&current->perf_event_mutex);
5150         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5151                 ctx = perf_event_ctx_lock(event);
5152                 perf_event_for_each_child(event, _perf_event_enable);
5153                 perf_event_ctx_unlock(event, ctx);
5154         }
5155         mutex_unlock(&current->perf_event_mutex);
5156
5157         return 0;
5158 }
5159
5160 int perf_event_task_disable(void)
5161 {
5162         struct perf_event_context *ctx;
5163         struct perf_event *event;
5164
5165         mutex_lock(&current->perf_event_mutex);
5166         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5167                 ctx = perf_event_ctx_lock(event);
5168                 perf_event_for_each_child(event, _perf_event_disable);
5169                 perf_event_ctx_unlock(event, ctx);
5170         }
5171         mutex_unlock(&current->perf_event_mutex);
5172
5173         return 0;
5174 }
5175
5176 static int perf_event_index(struct perf_event *event)
5177 {
5178         if (event->hw.state & PERF_HES_STOPPED)
5179                 return 0;
5180
5181         if (event->state != PERF_EVENT_STATE_ACTIVE)
5182                 return 0;
5183
5184         return event->pmu->event_idx(event);
5185 }
5186
5187 static void calc_timer_values(struct perf_event *event,
5188                                 u64 *now,
5189                                 u64 *enabled,
5190                                 u64 *running)
5191 {
5192         u64 ctx_time;
5193
5194         *now = perf_clock();
5195         ctx_time = event->shadow_ctx_time + *now;
5196         __perf_update_times(event, ctx_time, enabled, running);
5197 }
5198
5199 static void perf_event_init_userpage(struct perf_event *event)
5200 {
5201         struct perf_event_mmap_page *userpg;
5202         struct ring_buffer *rb;
5203
5204         rcu_read_lock();
5205         rb = rcu_dereference(event->rb);
5206         if (!rb)
5207                 goto unlock;
5208
5209         userpg = rb->user_page;
5210
5211         /* Allow new userspace to detect that bit 0 is deprecated */
5212         userpg->cap_bit0_is_deprecated = 1;
5213         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5214         userpg->data_offset = PAGE_SIZE;
5215         userpg->data_size = perf_data_size(rb);
5216
5217 unlock:
5218         rcu_read_unlock();
5219 }
5220
/*
 * Weak default that does nothing; called by perf_event_update_userpage()
 * while it fills in the user page. An architecture can supply its own
 * non-weak definition to replace it.
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}
5225
/*
 * Refresh the user page of the ring buffer attached to @event with the
 * current index, offset and enabled/running times. Does nothing when no
 * ring buffer is attached.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct ring_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * compute total_time_enabled, total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * we cannot simply call update_context_time()
         * because of locking issues as we can be called in
         * NMI context
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /*
         * userpg->lock is incremented once before and once after the update,
         * with a compiler barrier on the inner side of each increment.
         */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        /* With a non-zero index, hw.prev_count is subtracted from the offset. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5281
/*
 * Page fault handler for a perf mmap() area.
 *
 * Returns 0 with vmf->page set (and a page reference taken) when the
 * faulting offset maps to a ring buffer page, and VM_FAULT_SIGBUS otherwise.
 * FAULT_FLAG_MKWRITE faults are answered without installing a page: 0 for
 * page offset 0, VM_FAULT_SIGBUS for any other offset.
 */
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        struct ring_buffer *rb;
        vm_fault_t ret = VM_FAULT_SIGBUS;

        if (vmf->flags & FAULT_FLAG_MKWRITE) {
                if (vmf->pgoff == 0)
                        ret = 0;
                return ret;
        }

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /* Write faults are refused on every page except page offset 0. */
        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
                goto unlock;

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (!vmf->page)
                goto unlock;

        /* Take a page reference and record the mapping and index on the page. */
        get_page(vmf->page);
        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
        vmf->page->index   = vmf->pgoff;

        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}
5316
/*
 * Switch @event over to ring buffer @rb.
 *
 * The event is first unlinked from the event_list of its current buffer, if
 * it has one, then linked onto @rb's event_list, and finally event->rb is
 * updated. @rb may be NULL, in which case the event is only detached. The
 * reference on the old buffer is dropped and waiters on event->waitq are
 * woken.
 *
 * NOTE(review): the callers visible here hold event->mmap_mutex around this
 * call -- confirm that this is a requirement for all callers.
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct ring_buffer *rb)
{
        struct ring_buffer *old_rb = NULL;
        unsigned long flags;

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /*
                 * Record the RCU state at removal time so that a later
                 * attach can wait for it via cond_synchronize_rcu().
                 */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /* Wait out the recorded RCU state before reusing rb_entry. */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}
5375
5376 static void ring_buffer_wakeup(struct perf_event *event)
5377 {
5378         struct ring_buffer *rb;
5379
5380         rcu_read_lock();
5381         rb = rcu_dereference(event->rb);
5382         if (rb) {
5383                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5384                         wake_up_all(&event->waitq);
5385         }
5386         rcu_read_unlock();
5387 }
5388
5389 struct ring_buffer *ring_buffer_get(struct perf_event *event)
5390 {
5391         struct ring_buffer *rb;
5392
5393         rcu_read_lock();
5394         rb = rcu_dereference(event->rb);
5395         if (rb) {
5396                 if (!refcount_inc_not_zero(&rb->refcount))
5397                         rb = NULL;
5398         }
5399         rcu_read_unlock();
5400
5401         return rb;
5402 }
5403
5404 void ring_buffer_put(struct ring_buffer *rb)
5405 {
5406         if (!refcount_dec_and_test(&rb->refcount))
5407                 return;
5408
5409         WARN_ON_ONCE(!list_empty(&rb->event_list));
5410
5411         call_rcu(&rb->rcu_head, rb_free_rcu);
5412 }
5413
5414 static void perf_mmap_open(struct vm_area_struct *vma)
5415 {
5416         struct perf_event *event = vma->vm_file->private_data;
5417
5418         atomic_inc(&event->mmap_count);
5419         atomic_inc(&event->rb->mmap_count);
5420
5421         if (vma->vm_pgoff)
5422                 atomic_inc(&event->rb->aux_mmap_count);
5423
5424         if (event->pmu->event_mapped)
5425                 event->pmu->event_mapped(event, vma->vm_mm);
5426 }
5427
5428 static void perf_pmu_output_stop(struct perf_event *event);
5429
5430 /*
5431  * A buffer can be mmap()ed multiple times; either directly through the same
5432  * event, or through other events by use of perf_event_set_output().
5433  *
5434  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5435  * the buffer here, where we still have a VM context. This means we need
5436  * to detach all events redirecting to us.
5437  */
5438 static void perf_mmap_close(struct vm_area_struct *vma)
5439 {
5440         struct perf_event *event = vma->vm_file->private_data;
5441
5442         struct ring_buffer *rb = ring_buffer_get(event);
5443         struct user_struct *mmap_user = rb->mmap_user;
5444         int mmap_locked = rb->mmap_locked;
5445         unsigned long size = perf_data_size(rb);
5446
5447         if (event->pmu->event_unmapped)
5448                 event->pmu->event_unmapped(event, vma->vm_mm);
5449
5450         /*
5451          * rb->aux_mmap_count will always drop before rb->mmap_count and
5452          * event->mmap_count, so it is ok to use event->mmap_mutex to
5453          * serialize with perf_mmap here.
5454          */
5455         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5456             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5457                 /*
5458                  * Stop all AUX events that are writing to this buffer,
5459                  * so that we can free its AUX pages and corresponding PMU
5460                  * data. Note that after rb::aux_mmap_count dropped to zero,
5461                  * they won't start any more (see perf_aux_output_begin()).
5462                  */
5463                 perf_pmu_output_stop(event);
5464
5465                 /* now it's safe to free the pages */
5466                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5467                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5468
5469                 /* this has to be the last one */
5470                 rb_free_aux(rb);
5471                 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5472
5473                 mutex_unlock(&event->mmap_mutex);
5474         }
5475
5476         atomic_dec(&rb->mmap_count);
5477
5478         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5479                 goto out_put;
5480
5481         ring_buffer_attach(event, NULL);
5482         mutex_unlock(&event->mmap_mutex);
5483
5484         /* If there's still other mmap()s of this buffer, we're done. */
5485         if (atomic_read(&rb->mmap_count))
5486                 goto out_put;
5487
5488         /*
5489          * No other mmap()s, detach from all other events that might redirect
5490          * into the now unreachable buffer. Somewhat complicated by the
5491          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5492          */
5493 again:
5494         rcu_read_lock();
5495         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5496                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5497                         /*
5498                          * This event is en-route to free_event() which will
5499                          * detach it and remove it from the list.
5500                          */
5501                         continue;
5502                 }
5503                 rcu_read_unlock();
5504
5505                 mutex_lock(&event->mmap_mutex);
5506                 /*
5507                  * Check we didn't race with perf_event_set_output() which can
5508                  * swizzle the rb from under us while we were waiting to
5509                  * acquire mmap_mutex.
5510                  *
5511                  * If we find a different rb; ignore this event, a next
5512                  * iteration will no longer find it on the list. We have to
5513                  * still restart the iteration to make sure we're not now
5514                  * iterating the wrong list.
5515                  */
5516                 if (event->rb == rb)
5517                         ring_buffer_attach(event, NULL);
5518
5519                 mutex_unlock(&event->mmap_mutex);
5520                 put_event(event);
5521
5522                 /*
5523                  * Restart the iteration; either we're on the wrong list or
5524                  * destroyed its integrity by doing a deletion.
5525                  */
5526                 goto again;
5527         }
5528         rcu_read_unlock();
5529
5530         /*
5531          * It could be there's still a few 0-ref events on the list; they'll
5532          * get cleaned up by free_event() -- they'll also still have their
5533          * ref on the rb and will free it whenever they are done with it.
5534          *
5535          * Aside from that, this buffer is 'fully' detached and unmapped,
5536          * undo the VM accounting.
5537          */
5538
5539         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5540         vma->vm_mm->pinned_vm -= mmap_locked;
5541         free_uid(mmap_user);
5542
5543 out_put:
5544         ring_buffer_put(rb); /* could be last */
5545 }
5546
/*
 * VM operations installed on every perf buffer mapping by perf_mmap().
 * perf_mmap_fault() serves both regular faults and page_mkwrite
 * notifications; it distinguishes them through FAULT_FLAG_MKWRITE.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .open           = perf_mmap_open,
        .close          = perf_mmap_close, /* non mergeable */
        .fault          = perf_mmap_fault,
        .page_mkwrite   = perf_mmap_fault,
};
5553
5554 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5555 {
5556         struct perf_event *event = file->private_data;
5557         unsigned long user_locked, user_lock_limit;
5558         struct user_struct *user = current_user();
5559         unsigned long locked, lock_limit;
5560         struct ring_buffer *rb = NULL;
5561         unsigned long vma_size;
5562         unsigned long nr_pages;
5563         long user_extra = 0, extra = 0;
5564         int ret = 0, flags = 0;
5565
5566         /*
5567          * Don't allow mmap() of inherited per-task counters. This would
5568          * create a performance issue due to all children writing to the
5569          * same rb.
5570          */
5571         if (event->cpu == -1 && event->attr.inherit)
5572                 return -EINVAL;
5573
5574         if (!(vma->vm_flags & VM_SHARED))
5575                 return -EINVAL;
5576
5577         vma_size = vma->vm_end - vma->vm_start;
5578
5579         if (vma->vm_pgoff == 0) {
5580                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5581         } else {
5582                 /*
5583                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5584                  * mapped, all subsequent mappings should have the same size
5585                  * and offset. Must be above the normal perf buffer.
5586                  */
5587                 u64 aux_offset, aux_size;
5588
5589                 if (!event->rb)
5590                         return -EINVAL;
5591
5592                 nr_pages = vma_size / PAGE_SIZE;
5593
5594                 mutex_lock(&event->mmap_mutex);
5595                 ret = -EINVAL;
5596
5597                 rb = event->rb;
5598                 if (!rb)
5599                         goto aux_unlock;
5600
5601                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5602                 aux_size = READ_ONCE(rb->user_page->aux_size);
5603
5604                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5605                         goto aux_unlock;
5606
5607                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5608                         goto aux_unlock;
5609
5610                 /* already mapped with a different offset */
5611                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5612                         goto aux_unlock;
5613
5614                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5615                         goto aux_unlock;
5616
5617                 /* already mapped with a different size */
5618                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5619                         goto aux_unlock;
5620
5621                 if (!is_power_of_2(nr_pages))
5622                         goto aux_unlock;
5623
5624                 if (!atomic_inc_not_zero(&rb->mmap_count))
5625                         goto aux_unlock;
5626
5627                 if (rb_has_aux(rb)) {
5628                         atomic_inc(&rb->aux_mmap_count);
5629                         ret = 0;
5630                         goto unlock;
5631                 }
5632
5633                 atomic_set(&rb->aux_mmap_count, 1);
5634                 user_extra = nr_pages;
5635
5636                 goto accounting;
5637         }
5638
5639         /*
5640          * If we have rb pages ensure they're a power-of-two number, so we
5641          * can do bitmasks instead of modulo.
5642          */
5643         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5644                 return -EINVAL;
5645
5646         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5647                 return -EINVAL;
5648
5649         WARN_ON_ONCE(event->ctx->parent_ctx);
5650 again:
5651         mutex_lock(&event->mmap_mutex);
5652         if (event->rb) {
5653                 if (event->rb->nr_pages != nr_pages) {
5654                         ret = -EINVAL;
5655                         goto unlock;
5656                 }
5657
5658                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5659                         /*
5660                          * Raced against perf_mmap_close() through
5661                          * perf_event_set_output(). Try again, hope for better
5662                          * luck.
5663                          */
5664                         mutex_unlock(&event->mmap_mutex);
5665                         goto again;
5666                 }
5667
5668                 goto unlock;
5669         }
5670
5671         user_extra = nr_pages + 1;
5672
5673 accounting:
5674         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5675
5676         /*
5677          * Increase the limit linearly with more CPUs:
5678          */
5679         user_lock_limit *= num_online_cpus();
5680
5681         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5682
5683         if (user_locked > user_lock_limit)
5684                 extra = user_locked - user_lock_limit;
5685
5686         lock_limit = rlimit(RLIMIT_MEMLOCK);
5687         lock_limit >>= PAGE_SHIFT;
5688         locked = vma->vm_mm->pinned_vm + extra;
5689
5690         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5691                 !capable(CAP_IPC_LOCK)) {
5692                 ret = -EPERM;
5693                 goto unlock;
5694         }
5695
5696         WARN_ON(!rb && event->rb);
5697
5698         if (vma->vm_flags & VM_WRITE)
5699                 flags |= RING_BUFFER_WRITABLE;
5700
5701         if (!rb) {
5702                 rb = rb_alloc(nr_pages,
5703                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5704                               event->cpu, flags);
5705
5706                 if (!rb) {
5707                         ret = -ENOMEM;
5708                         goto unlock;
5709                 }
5710
5711                 atomic_set(&rb->mmap_count, 1);
5712                 rb->mmap_user = get_current_user();
5713                 rb->mmap_locked = extra;
5714
5715                 ring_buffer_attach(event, rb);
5716
5717                 perf_event_init_userpage(event);
5718                 perf_event_update_userpage(event);
5719         } else {
5720                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5721                                    event->attr.aux_watermark, flags);
5722                 if (!ret)
5723                         rb->aux_mmap_locked = extra;
5724         }
5725
5726 unlock:
5727         if (!ret) {
5728                 atomic_long_add(user_extra, &user->locked_vm);
5729                 vma->vm_mm->pinned_vm += extra;
5730
5731                 atomic_inc(&event->mmap_count);
5732         } else if (rb) {
5733                 atomic_dec(&rb->mmap_count);
5734         }
5735 aux_unlock:
5736         mutex_unlock(&event->mmap_mutex);
5737
5738         /*
5739          * Since pinned accounting is per vm we cannot allow fork() to copy our
5740          * vma.
5741          */
5742         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5743         vma->vm_ops = &perf_mmap_vmops;
5744
5745         if (event->pmu->event_mapped)
5746                 event->pmu->event_mapped(event, vma->vm_mm);
5747
5748         return ret;
5749 }
5750
5751 static int perf_fasync(int fd, struct file *filp, int on)
5752 {
5753         struct inode *inode = file_inode(filp);
5754         struct perf_event *event = filp->private_data;
5755         int retval;
5756
5757         inode_lock(inode);
5758         retval = fasync_helper(fd, filp, on, &event->fasync);
5759         inode_unlock(inode);
5760
5761         if (retval < 0)
5762                 return retval;
5763
5764         return 0;
5765 }
5766
/*
 * File operations backing a perf event file descriptor. Seeking is not
 * supported (no_llseek).
 */
static const struct file_operations perf_fops = {
        .llseek                 = no_llseek,
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
        .unlocked_ioctl         = perf_ioctl,
        .compat_ioctl           = perf_compat_ioctl,
        .mmap                   = perf_mmap,
        .fasync                 = perf_fasync,
};
5777
5778 /*
5779  * Perf event wakeup
5780  *
5781  * If there's data, ensure we set the poll() state and publish everything
5782  * to user-space before waking everybody up.
5783  */
5784
5785 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5786 {
5787         /* only the parent has fasync state */
5788         if (event->parent)
5789                 event = event->parent;
5790         return &event->fasync;
5791 }
5792
5793 void perf_event_wakeup(struct perf_event *event)
5794 {
5795         ring_buffer_wakeup(event);
5796
5797         if (event->pending_kill) {
5798                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5799                 event->pending_kill = 0;
5800         }
5801 }
5802
5803 static void perf_pending_event(struct irq_work *entry)
5804 {
5805         struct perf_event *event = container_of(entry,
5806                         struct perf_event, pending);
5807         int rctx;
5808
5809         rctx = perf_swevent_get_recursion_context();
5810         /*
5811          * If we 'fail' here, that's OK, it means recursion is already disabled
5812          * and we won't recurse 'further'.
5813          */
5814
5815         if (event->pending_disable) {
5816                 event->pending_disable = 0;
5817                 perf_event_disable_local(event);
5818         }
5819
5820         if (event->pending_wakeup) {
5821                 event->pending_wakeup = 0;
5822                 perf_event_wakeup(event);
5823         }
5824
5825         if (rctx >= 0)
5826                 perf_swevent_put_recursion_context(rctx);
5827 }
5828
5829 /*
5830  * We assume there is only KVM supporting the callbacks.
5831  * Later on, we might change it to a list if there is
5832  * another virtualization implementation supporting the callbacks.
5833  */
5834 struct perf_guest_info_callbacks *perf_guest_cbs;
5835
/*
 * Install @cbs as the single, global set of guest info callbacks. Any
 * previously registered set is overwritten without notice.
 *
 * Always returns 0.
 */
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        perf_guest_cbs = cbs;
        return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5842
/*
 * Clear the global guest info callbacks. @cbs is not consulted: whatever
 * is currently registered is dropped.
 *
 * Always returns 0.
 */
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        perf_guest_cbs = NULL;
        return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5849
5850 static void
5851 perf_output_sample_regs(struct perf_output_handle *handle,
5852                         struct pt_regs *regs, u64 mask)
5853 {
5854         int bit;
5855         DECLARE_BITMAP(_mask, 64);
5856
5857         bitmap_from_u64(_mask, mask);
5858         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5859                 u64 val;
5860
5861                 val = perf_reg_value(regs, bit);
5862                 perf_output_put(handle, val);
5863         }
5864 }
5865
5866 static void perf_sample_regs_user(struct perf_regs *regs_user,
5867                                   struct pt_regs *regs,
5868                                   struct pt_regs *regs_user_copy)
5869 {
5870         if (user_mode(regs)) {
5871                 regs_user->abi = perf_reg_abi(current);
5872                 regs_user->regs = regs;
5873         } else if (current->mm) {
5874                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5875         } else {
5876                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5877                 regs_user->regs = NULL;
5878         }
5879 }
5880
5881 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5882                                   struct pt_regs *regs)
5883 {
5884         regs_intr->regs = regs;
5885         regs_intr->abi  = perf_reg_abi(current);
5886 }
5887
5888
5889 /*
5890  * Get remaining task size from user stack pointer.
5891  *
5892  * It'd be better to take stack vma map and limit this more
5893  * precisly, but there's no way to get it safely under interrupt,
5894  * so using TASK_SIZE as limit.
5895  */
5896 static u64 perf_ustack_task_size(struct pt_regs *regs)
5897 {
5898         unsigned long addr = perf_user_stack_pointer(regs);
5899
5900         if (!addr || addr >= TASK_SIZE)
5901                 return 0;
5902
5903         return TASK_SIZE - addr;
5904 }
5905
5906 static u16
5907 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5908                         struct pt_regs *regs)
5909 {
5910         u64 task_size;
5911
5912         /* No regs, no stack pointer, no dump. */
5913         if (!regs)
5914                 return 0;
5915
5916         /*
5917          * Check if we fit in with the requested stack size into the:
5918          * - TASK_SIZE
5919          *   If we don't, we limit the size to the TASK_SIZE.
5920          *
5921          * - remaining sample size
5922          *   If we don't, we customize the stack size to
5923          *   fit in to the remaining sample size.
5924          */
5925
5926         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5927         stack_size = min(stack_size, (u16) task_size);
5928
5929         /* Current header size plus static size and dynamic size. */
5930         header_size += 2 * sizeof(u64);
5931
5932         /* Do we fit in with the current stack dump size? */
5933         if ((u16) (header_size + stack_size) < header_size) {
5934                 /*
5935                  * If we overflow the maximum size for the sample,
5936                  * we customize the stack dump size to fit in.
5937                  */
5938                 stack_size = USHRT_MAX - header_size - sizeof(u64);
5939                 stack_size = round_up(stack_size, sizeof(u64));
5940         }
5941
5942         return stack_size;
5943 }
5944
5945 static void
5946 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5947                           struct pt_regs *regs)
5948 {
5949         /* Case of a kernel thread, nothing to dump */
5950         if (!regs) {
5951                 u64 size = 0;
5952                 perf_output_put(handle, size);
5953         } else {
5954                 unsigned long sp;
5955                 unsigned int rem;
5956                 u64 dyn_size;
5957                 mm_segment_t fs;
5958
5959                 /*
5960                  * We dump:
5961                  * static size
5962                  *   - the size requested by user or the best one we can fit
5963                  *     in to the sample max size
5964                  * data
5965                  *   - user stack dump data
5966                  * dynamic size
5967                  *   - the actual dumped size
5968                  */
5969
5970                 /* Static size. */
5971                 perf_output_put(handle, dump_size);
5972
5973                 /* Data. */
5974                 sp = perf_user_stack_pointer(regs);
5975                 fs = get_fs();
5976                 set_fs(USER_DS);
5977                 rem = __output_copy_user(handle, (void *) sp, dump_size);
5978                 set_fs(fs);
5979                 dyn_size = dump_size - rem;
5980
5981                 perf_output_skip(handle, rem);
5982
5983                 /* Dynamic size. */
5984                 perf_output_put(handle, dyn_size);
5985         }
5986 }
5987
5988 static void __perf_event_header__init_id(struct perf_event_header *header,
5989                                          struct perf_sample_data *data,
5990                                          struct perf_event *event)
5991 {
5992         u64 sample_type = event->attr.sample_type;
5993
5994         data->type = sample_type;
5995         header->size += event->id_header_size;
5996
5997         if (sample_type & PERF_SAMPLE_TID) {
5998                 /* namespace issues */
5999                 data->tid_entry.pid = perf_event_pid(event, current);
6000                 data->tid_entry.tid = perf_event_tid(event, current);
6001         }
6002
6003         if (sample_type & PERF_SAMPLE_TIME)
6004                 data->time = perf_event_clock(event);
6005
6006         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6007                 data->id = primary_event_id(event);
6008
6009         if (sample_type & PERF_SAMPLE_STREAM_ID)
6010                 data->stream_id = event->id;
6011
6012         if (sample_type & PERF_SAMPLE_CPU) {
6013                 data->cpu_entry.cpu      = raw_smp_processor_id();
6014                 data->cpu_entry.reserved = 0;
6015         }
6016 }
6017
6018 void perf_event_header__init_id(struct perf_event_header *header,
6019                                 struct perf_sample_data *data,
6020                                 struct perf_event *event)
6021 {
6022         if (event->attr.sample_id_all)
6023                 __perf_event_header__init_id(header, data, event);
6024 }
6025
6026 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6027                                            struct perf_sample_data *data)
6028 {
6029         u64 sample_type = data->type;
6030
6031         if (sample_type & PERF_SAMPLE_TID)
6032                 perf_output_put(handle, data->tid_entry);
6033
6034         if (sample_type & PERF_SAMPLE_TIME)
6035                 perf_output_put(handle, data->time);
6036
6037         if (sample_type & PERF_SAMPLE_ID)
6038                 perf_output_put(handle, data->id);
6039
6040         if (sample_type & PERF_SAMPLE_STREAM_ID)
6041                 perf_output_put(handle, data->stream_id);
6042
6043         if (sample_type & PERF_SAMPLE_CPU)
6044                 perf_output_put(handle, data->cpu_entry);
6045
6046         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6047                 perf_output_put(handle, data->id);
6048 }
6049
6050 void perf_event__output_id_sample(struct perf_event *event,
6051                                   struct perf_output_handle *handle,
6052                                   struct perf_sample_data *sample)
6053 {
6054         if (event->attr.sample_id_all)
6055                 __perf_event__output_id_sample(handle, sample);
6056 }
6057
6058 static void perf_output_read_one(struct perf_output_handle *handle,
6059                                  struct perf_event *event,
6060                                  u64 enabled, u64 running)
6061 {
6062         u64 read_format = event->attr.read_format;
6063         u64 values[4];
6064         int n = 0;
6065
6066         values[n++] = perf_event_count(event);
6067         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6068                 values[n++] = enabled +
6069                         atomic64_read(&event->child_total_time_enabled);
6070         }
6071         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6072                 values[n++] = running +
6073                         atomic64_read(&event->child_total_time_running);
6074         }
6075         if (read_format & PERF_FORMAT_ID)
6076                 values[n++] = primary_event_id(event);
6077
6078         __output_copy(handle, values, n * sizeof(u64));
6079 }
6080
6081 static void perf_output_read_group(struct perf_output_handle *handle,
6082                             struct perf_event *event,
6083                             u64 enabled, u64 running)
6084 {
6085         struct perf_event *leader = event->group_leader, *sub;
6086         u64 read_format = event->attr.read_format;
6087         u64 values[5];
6088         int n = 0;
6089
6090         values[n++] = 1 + leader->nr_siblings;
6091
6092         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6093                 values[n++] = enabled;
6094
6095         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6096                 values[n++] = running;
6097
6098         if ((leader != event) &&
6099             (leader->state == PERF_EVENT_STATE_ACTIVE))
6100                 leader->pmu->read(leader);
6101
6102         values[n++] = perf_event_count(leader);
6103         if (read_format & PERF_FORMAT_ID)
6104                 values[n++] = primary_event_id(leader);
6105
6106         __output_copy(handle, values, n * sizeof(u64));
6107
6108         for_each_sibling_event(sub, leader) {
6109                 n = 0;
6110
6111                 if ((sub != event) &&
6112                     (sub->state == PERF_EVENT_STATE_ACTIVE))
6113                         sub->pmu->read(sub);
6114
6115                 values[n++] = perf_event_count(sub);
6116                 if (read_format & PERF_FORMAT_ID)
6117                         values[n++] = primary_event_id(sub);
6118
6119                 __output_copy(handle, values, n * sizeof(u64));
6120         }
6121 }
6122
6123 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6124                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
6125
6126 /*
6127  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6128  *
6129  * The problem is that its both hard and excessively expensive to iterate the
6130  * child list, not to mention that its impossible to IPI the children running
6131  * on another CPU, from interrupt/NMI context.
6132  */
6133 static void perf_output_read(struct perf_output_handle *handle,
6134                              struct perf_event *event)
6135 {
6136         u64 enabled = 0, running = 0, now;
6137         u64 read_format = event->attr.read_format;
6138
6139         /*
6140          * compute total_time_enabled, total_time_running
6141          * based on snapshot values taken when the event
6142          * was last scheduled in.
6143          *
6144          * we cannot simply called update_context_time()
6145          * because of locking issue as we are called in
6146          * NMI context
6147          */
6148         if (read_format & PERF_FORMAT_TOTAL_TIMES)
6149                 calc_timer_values(event, &now, &enabled, &running);
6150
6151         if (event->attr.read_format & PERF_FORMAT_GROUP)
6152                 perf_output_read_group(handle, event, enabled, running);
6153         else
6154                 perf_output_read_one(handle, event, enabled, running);
6155 }
6156
6157 void perf_output_sample(struct perf_output_handle *handle,
6158                         struct perf_event_header *header,
6159                         struct perf_sample_data *data,
6160                         struct perf_event *event)
6161 {
6162         u64 sample_type = data->type;
6163
6164         perf_output_put(handle, *header);
6165
6166         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6167                 perf_output_put(handle, data->id);
6168
6169         if (sample_type & PERF_SAMPLE_IP)
6170                 perf_output_put(handle, data->ip);
6171
6172         if (sample_type & PERF_SAMPLE_TID)
6173                 perf_output_put(handle, data->tid_entry);
6174
6175         if (sample_type & PERF_SAMPLE_TIME)
6176                 perf_output_put(handle, data->time);
6177
6178         if (sample_type & PERF_SAMPLE_ADDR)
6179                 perf_output_put(handle, data->addr);
6180
6181         if (sample_type & PERF_SAMPLE_ID)
6182                 perf_output_put(handle, data->id);
6183
6184         if (sample_type & PERF_SAMPLE_STREAM_ID)
6185                 perf_output_put(handle, data->stream_id);
6186
6187         if (sample_type & PERF_SAMPLE_CPU)
6188                 perf_output_put(handle, data->cpu_entry);
6189
6190         if (sample_type & PERF_SAMPLE_PERIOD)
6191                 perf_output_put(handle, data->period);
6192
6193         if (sample_type & PERF_SAMPLE_READ)
6194                 perf_output_read(handle, event);
6195
6196         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6197                 int size = 1;
6198
6199                 size += data->callchain->nr;
6200                 size *= sizeof(u64);
6201                 __output_copy(handle, data->callchain, size);
6202         }
6203
6204         if (sample_type & PERF_SAMPLE_RAW) {
6205                 struct perf_raw_record *raw = data->raw;
6206
6207                 if (raw) {
6208                         struct perf_raw_frag *frag = &raw->frag;
6209
6210                         perf_output_put(handle, raw->size);
6211                         do {
6212                                 if (frag->copy) {
6213                                         __output_custom(handle, frag->copy,
6214                                                         frag->data, frag->size);
6215                                 } else {
6216                                         __output_copy(handle, frag->data,
6217                                                       frag->size);
6218                                 }
6219                                 if (perf_raw_frag_last(frag))
6220                                         break;
6221                                 frag = frag->next;
6222                         } while (1);
6223                         if (frag->pad)
6224                                 __output_skip(handle, NULL, frag->pad);
6225                 } else {
6226                         struct {
6227                                 u32     size;
6228                                 u32     data;
6229                         } raw = {
6230                                 .size = sizeof(u32),
6231                                 .data = 0,
6232                         };
6233                         perf_output_put(handle, raw);
6234                 }
6235         }
6236
6237         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6238                 if (data->br_stack) {
6239                         size_t size;
6240
6241                         size = data->br_stack->nr
6242                              * sizeof(struct perf_branch_entry);
6243
6244                         perf_output_put(handle, data->br_stack->nr);
6245                         perf_output_copy(handle, data->br_stack->entries, size);
6246                 } else {
6247                         /*
6248                          * we always store at least the value of nr
6249                          */
6250                         u64 nr = 0;
6251                         perf_output_put(handle, nr);
6252                 }
6253         }
6254
6255         if (sample_type & PERF_SAMPLE_REGS_USER) {
6256                 u64 abi = data->regs_user.abi;
6257
6258                 /*
6259                  * If there are no regs to dump, notice it through
6260                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6261                  */
6262                 perf_output_put(handle, abi);
6263
6264                 if (abi) {
6265                         u64 mask = event->attr.sample_regs_user;
6266                         perf_output_sample_regs(handle,
6267                                                 data->regs_user.regs,
6268                                                 mask);
6269                 }
6270         }
6271
6272         if (sample_type & PERF_SAMPLE_STACK_USER) {
6273                 perf_output_sample_ustack(handle,
6274                                           data->stack_user_size,
6275                                           data->regs_user.regs);
6276         }
6277
6278         if (sample_type & PERF_SAMPLE_WEIGHT)
6279                 perf_output_put(handle, data->weight);
6280
6281         if (sample_type & PERF_SAMPLE_DATA_SRC)
6282                 perf_output_put(handle, data->data_src.val);
6283
6284         if (sample_type & PERF_SAMPLE_TRANSACTION)
6285                 perf_output_put(handle, data->txn);
6286
6287         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6288                 u64 abi = data->regs_intr.abi;
6289                 /*
6290                  * If there are no regs to dump, notice it through
6291                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6292                  */
6293                 perf_output_put(handle, abi);
6294
6295                 if (abi) {
6296                         u64 mask = event->attr.sample_regs_intr;
6297
6298                         perf_output_sample_regs(handle,
6299                                                 data->regs_intr.regs,
6300                                                 mask);
6301                 }
6302         }
6303
6304         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6305                 perf_output_put(handle, data->phys_addr);
6306
6307         if (!event->attr.watermark) {
6308                 int wakeup_events = event->attr.wakeup_events;
6309
6310                 if (wakeup_events) {
6311                         struct ring_buffer *rb = handle->rb;
6312                         int events = local_inc_return(&rb->events);
6313
6314                         if (events >= wakeup_events) {
6315                                 local_sub(wakeup_events, &rb->events);
6316                                 local_inc(&rb->wakeup);
6317                         }
6318                 }
6319         }
6320 }
6321
6322 static u64 perf_virt_to_phys(u64 virt)
6323 {
6324         u64 phys_addr = 0;
6325         struct page *p = NULL;
6326
6327         if (!virt)
6328                 return 0;
6329
6330         if (virt >= TASK_SIZE) {
6331                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6332                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6333                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
6334                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6335         } else {
6336                 /*
6337                  * Walking the pages tables for user address.
6338                  * Interrupts are disabled, so it prevents any tear down
6339                  * of the page tables.
6340                  * Try IRQ-safe __get_user_pages_fast first.
6341                  * If failed, leave phys_addr as 0.
6342                  */
6343                 if ((current->mm != NULL) &&
6344                     (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6345                         phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6346
6347                 if (p)
6348                         put_page(p);
6349         }
6350
6351         return phys_addr;
6352 }
6353
6354 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6355
6356 struct perf_callchain_entry *
6357 perf_callchain(struct perf_event *event, struct pt_regs *regs)
6358 {
6359         bool kernel = !event->attr.exclude_callchain_kernel;
6360         bool user   = !event->attr.exclude_callchain_user;
6361         /* Disallow cross-task user callchains. */
6362         bool crosstask = event->ctx->task && event->ctx->task != current;
6363         const u32 max_stack = event->attr.sample_max_stack;
6364         struct perf_callchain_entry *callchain;
6365
6366         if (!kernel && !user)
6367                 return &__empty_callchain;
6368
6369         callchain = get_perf_callchain(regs, 0, kernel, user,
6370                                        max_stack, crosstask, true);
6371         return callchain ?: &__empty_callchain;
6372 }
6373
6374 void perf_prepare_sample(struct perf_event_header *header,
6375                          struct perf_sample_data *data,
6376                          struct perf_event *event,
6377                          struct pt_regs *regs)
6378 {
6379         u64 sample_type = event->attr.sample_type;
6380
6381         header->type = PERF_RECORD_SAMPLE;
6382         header->size = sizeof(*header) + event->header_size;
6383
6384         header->misc = 0;
6385         header->misc |= perf_misc_flags(regs);
6386
6387         __perf_event_header__init_id(header, data, event);
6388
6389         if (sample_type & PERF_SAMPLE_IP)
6390                 data->ip = perf_instruction_pointer(regs);
6391
6392         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6393                 int size = 1;
6394
6395                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
6396                         data->callchain = perf_callchain(event, regs);
6397
6398                 size += data->callchain->nr;
6399
6400                 header->size += size * sizeof(u64);
6401         }
6402
6403         if (sample_type & PERF_SAMPLE_RAW) {
6404                 struct perf_raw_record *raw = data->raw;
6405                 int size;
6406
6407                 if (raw) {
6408                         struct perf_raw_frag *frag = &raw->frag;
6409                         u32 sum = 0;
6410
6411                         do {
6412                                 sum += frag->size;
6413                                 if (perf_raw_frag_last(frag))
6414                                         break;
6415                                 frag = frag->next;
6416                         } while (1);
6417
6418                         size = round_up(sum + sizeof(u32), sizeof(u64));
6419                         raw->size = size - sizeof(u32);
6420                         frag->pad = raw->size - sum;
6421                 } else {
6422                         size = sizeof(u64);
6423                 }
6424
6425                 header->size += size;
6426         }
6427
6428         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6429                 int size = sizeof(u64); /* nr */
6430                 if (data->br_stack) {
6431                         size += data->br_stack->nr
6432                               * sizeof(struct perf_branch_entry);
6433                 }
6434                 header->size += size;
6435         }
6436
6437         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6438                 perf_sample_regs_user(&data->regs_user, regs,
6439                                       &data->regs_user_copy);
6440
6441         if (sample_type & PERF_SAMPLE_REGS_USER) {
6442                 /* regs dump ABI info */
6443                 int size = sizeof(u64);
6444
6445                 if (data->regs_user.regs) {
6446                         u64 mask = event->attr.sample_regs_user;
6447                         size += hweight64(mask) * sizeof(u64);
6448                 }
6449
6450                 header->size += size;
6451         }
6452
6453         if (sample_type & PERF_SAMPLE_STACK_USER) {
6454                 /*
6455                  * Either we need PERF_SAMPLE_STACK_USER bit to be allways
6456                  * processed as the last one or have additional check added
6457                  * in case new sample type is added, because we could eat
6458                  * up the rest of the sample size.
6459                  */
6460                 u16 stack_size = event->attr.sample_stack_user;
6461                 u16 size = sizeof(u64);
6462
6463                 stack_size = perf_sample_ustack_size(stack_size, header->size,
6464                                                      data->regs_user.regs);
6465
6466                 /*
6467                  * If there is something to dump, add space for the dump
6468                  * itself and for the field that tells the dynamic size,
6469                  * which is how many have been actually dumped.
6470                  */
6471                 if (stack_size)
6472                         size += sizeof(u64) + stack_size;
6473
6474                 data->stack_user_size = stack_size;
6475                 header->size += size;
6476         }
6477
6478         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6479                 /* regs dump ABI info */
6480                 int size = sizeof(u64);
6481
6482                 perf_sample_regs_intr(&data->regs_intr, regs);
6483
6484                 if (data->regs_intr.regs) {
6485                         u64 mask = event->attr.sample_regs_intr;
6486
6487                         size += hweight64(mask) * sizeof(u64);
6488                 }
6489
6490                 header->size += size;
6491         }
6492
6493         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6494                 data->phys_addr = perf_virt_to_phys(data->addr);
6495 }
6496
6497 static __always_inline int
6498 __perf_event_output(struct perf_event *event,
6499                     struct perf_sample_data *data,
6500                     struct pt_regs *regs,
6501                     int (*output_begin)(struct perf_output_handle *,
6502                                         struct perf_event *,
6503                                         unsigned int))
6504 {
6505         struct perf_output_handle handle;
6506         struct perf_event_header header;
6507         int err;
6508
6509         /* protect the callchain buffers */
6510         rcu_read_lock();
6511
6512         perf_prepare_sample(&header, data, event, regs);
6513
6514         err = output_begin(&handle, event, header.size);
6515         if (err)
6516                 goto exit;
6517
6518         perf_output_sample(&handle, &header, data, event);
6519
6520         perf_output_end(&handle);
6521
6522 exit:
6523         rcu_read_unlock();
6524         return err;
6525 }
6526
6527 void
6528 perf_event_output_forward(struct perf_event *event,
6529                          struct perf_sample_data *data,
6530                          struct pt_regs *regs)
6531 {
6532         __perf_event_output(event, data, regs, perf_output_begin_forward);
6533 }
6534
6535 void
6536 perf_event_output_backward(struct perf_event *event,
6537                            struct perf_sample_data *data,
6538                            struct pt_regs *regs)
6539 {
6540         __perf_event_output(event, data, regs, perf_output_begin_backward);
6541 }
6542
6543 int
6544 perf_event_output(struct perf_event *event,
6545                   struct perf_sample_data *data,
6546                   struct pt_regs *regs)
6547 {
6548         return __perf_event_output(event, data, regs, perf_output_begin);
6549 }
6550
6551 /*
6552  * read event_id
6553  */
6554
6555 struct perf_read_event {
6556         struct perf_event_header        header;
6557
6558         u32                             pid;
6559         u32                             tid;
6560 };
6561
6562 static void
6563 perf_event_read_event(struct perf_event *event,
6564                         struct task_struct *task)
6565 {
6566         struct perf_output_handle handle;
6567         struct perf_sample_data sample;
6568         struct perf_read_event read_event = {
6569                 .header = {
6570                         .type = PERF_RECORD_READ,
6571                         .misc = 0,
6572                         .size = sizeof(read_event) + event->read_size,
6573                 },
6574                 .pid = perf_event_pid(event, task),
6575                 .tid = perf_event_tid(event, task),
6576         };
6577         int ret;
6578
6579         perf_event_header__init_id(&read_event.header, &sample, event);
6580         ret = perf_output_begin(&handle, event, read_event.header.size);
6581         if (ret)
6582                 return;
6583
6584         perf_output_put(&handle, read_event);
6585         perf_output_read(&handle, event);
6586         perf_event__output_id_sample(event, &handle, &sample);
6587
6588         perf_output_end(&handle);
6589 }
6590
6591 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6592
6593 static void
6594 perf_iterate_ctx(struct perf_event_context *ctx,
6595                    perf_iterate_f output,
6596                    void *data, bool all)
6597 {
6598         struct perf_event *event;
6599
6600         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6601                 if (!all) {
6602                         if (event->state < PERF_EVENT_STATE_INACTIVE)
6603                                 continue;
6604                         if (!event_filter_match(event))
6605                                 continue;
6606                 }
6607
6608                 output(event, data);
6609         }
6610 }
6611
6612 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6613 {
6614         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6615         struct perf_event *event;
6616
6617         list_for_each_entry_rcu(event, &pel->list, sb_list) {
6618                 /*
6619                  * Skip events that are not fully formed yet; ensure that
6620                  * if we observe event->ctx, both event and ctx will be
6621                  * complete enough. See perf_install_in_context().
6622                  */
6623                 if (!smp_load_acquire(&event->ctx))
6624                         continue;
6625
6626                 if (event->state < PERF_EVENT_STATE_INACTIVE)
6627                         continue;
6628                 if (!event_filter_match(event))
6629                         continue;
6630                 output(event, data);
6631         }
6632 }
6633
6634 /*
6635  * Iterate all events that need to receive side-band events.
6636  *
6637  * For new callers; ensure that account_pmu_sb_event() includes
6638  * your event, otherwise it might not get delivered.
6639  */
6640 static void
6641 perf_iterate_sb(perf_iterate_f output, void *data,
6642                struct perf_event_context *task_ctx)
6643 {
6644         struct perf_event_context *ctx;
6645         int ctxn;
6646
6647         rcu_read_lock();
6648         preempt_disable();
6649
6650         /*
6651          * If we have task_ctx != NULL we only notify the task context itself.
6652          * The task_ctx is set only for EXIT events before releasing task
6653          * context.
6654          */
6655         if (task_ctx) {
6656                 perf_iterate_ctx(task_ctx, output, data, false);
6657                 goto done;
6658         }
6659
6660         perf_iterate_sb_cpu(output, data);
6661
6662         for_each_task_context_nr(ctxn) {
6663                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6664                 if (ctx)
6665                         perf_iterate_ctx(ctx, output, data, false);
6666         }
6667 done:
6668         preempt_enable();
6669         rcu_read_unlock();
6670 }
6671
6672 /*
6673  * Clear all file-based filters at exec, they'll have to be
6674  * re-instated when/if these objects are mmapped again.
6675  */
6676 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6677 {
6678         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6679         struct perf_addr_filter *filter;
6680         unsigned int restart = 0, count = 0;
6681         unsigned long flags;
6682
6683         if (!has_addr_filter(event))
6684                 return;
6685
6686         raw_spin_lock_irqsave(&ifh->lock, flags);
6687         list_for_each_entry(filter, &ifh->list, entry) {
6688                 if (filter->path.dentry) {
6689                         event->addr_filters_offs[count] = 0;
6690                         restart++;
6691                 }
6692
6693                 count++;
6694         }
6695
6696         if (restart)
6697                 event->addr_filters_gen++;
6698         raw_spin_unlock_irqrestore(&ifh->lock, flags);
6699
6700         if (restart)
6701                 perf_event_stop(event, 1);
6702 }
6703
6704 void perf_event_exec(void)
6705 {
6706         struct perf_event_context *ctx;
6707         int ctxn;
6708
6709         rcu_read_lock();
6710         for_each_task_context_nr(ctxn) {
6711                 ctx = current->perf_event_ctxp[ctxn];
6712                 if (!ctx)
6713                         continue;
6714
6715                 perf_event_enable_on_exec(ctxn);
6716
6717                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6718                                    true);
6719         }
6720         rcu_read_unlock();
6721 }
6722
/* Argument block handed to __perf_event_output_stop() via perf_iterate_ctx(). */
struct remote_output {
        struct ring_buffer *rb; /* buffer whose users should be stopped */
        int err;                /* result of the last __perf_event_stop() */
};
6727
6728 static void __perf_event_output_stop(struct perf_event *event, void *data)
6729 {
6730         struct perf_event *parent = event->parent;
6731         struct remote_output *ro = data;
6732         struct ring_buffer *rb = ro->rb;
6733         struct stop_event_data sd = {
6734                 .event  = event,
6735         };
6736
6737         if (!has_aux(event))
6738                 return;
6739
6740         if (!parent)
6741                 parent = event;
6742
6743         /*
6744          * In case of inheritance, it will be the parent that links to the
6745          * ring-buffer, but it will be the child that's actually using it.
6746          *
6747          * We are using event::rb to determine if the event should be stopped,
6748          * however this may race with ring_buffer_attach() (through set_output),
6749          * which will make us skip the event that actually needs to be stopped.
6750          * So ring_buffer_attach() has to stop an aux event before re-assigning
6751          * its rb pointer.
6752          */
6753         if (rcu_dereference(parent->rb) == rb)
6754                 ro->err = __perf_event_stop(&sd);
6755 }
6756
6757 static int __perf_pmu_output_stop(void *info)
6758 {
6759         struct perf_event *event = info;
6760         struct pmu *pmu = event->pmu;
6761         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6762         struct remote_output ro = {
6763                 .rb     = event->rb,
6764         };
6765
6766         rcu_read_lock();
6767         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6768         if (cpuctx->task_ctx)
6769                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6770                                    &ro, false);
6771         rcu_read_unlock();
6772
6773         return ro.err;
6774 }
6775
6776 static void perf_pmu_output_stop(struct perf_event *event)
6777 {
6778         struct perf_event *iter;
6779         int err, cpu;
6780
6781 restart:
6782         rcu_read_lock();
6783         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6784                 /*
6785                  * For per-CPU events, we need to make sure that neither they
6786                  * nor their children are running; for cpu==-1 events it's
6787                  * sufficient to stop the event itself if it's active, since
6788                  * it can't have children.
6789                  */
6790                 cpu = iter->cpu;
6791                 if (cpu == -1)
6792                         cpu = READ_ONCE(iter->oncpu);
6793
6794                 if (cpu == -1)
6795                         continue;
6796
6797                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6798                 if (err == -EAGAIN) {
6799                         rcu_read_unlock();
6800                         goto restart;
6801                 }
6802         }
6803         rcu_read_unlock();
6804 }
6805
6806 /*
6807  * task tracking -- fork/exit
6808  *
6809  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6810  */
6811
6812 struct perf_task_event {
6813         struct task_struct              *task;
6814         struct perf_event_context       *task_ctx;
6815
6816         struct {
6817                 struct perf_event_header        header;
6818
6819                 u32                             pid;
6820                 u32                             ppid;
6821                 u32                             tid;
6822                 u32                             ptid;
6823                 u64                             time;
6824         } event_id;
6825 };
6826
6827 static int perf_event_task_match(struct perf_event *event)
6828 {
6829         return event->attr.comm  || event->attr.mmap ||
6830                event->attr.mmap2 || event->attr.mmap_data ||
6831                event->attr.task;
6832 }
6833
6834 static void perf_event_task_output(struct perf_event *event,
6835                                    void *data)
6836 {
6837         struct perf_task_event *task_event = data;
6838         struct perf_output_handle handle;
6839         struct perf_sample_data sample;
6840         struct task_struct *task = task_event->task;
6841         int ret, size = task_event->event_id.header.size;
6842
6843         if (!perf_event_task_match(event))
6844                 return;
6845
6846         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6847
6848         ret = perf_output_begin(&handle, event,
6849                                 task_event->event_id.header.size);
6850         if (ret)
6851                 goto out;
6852
6853         task_event->event_id.pid = perf_event_pid(event, task);
6854         task_event->event_id.ppid = perf_event_pid(event, current);
6855
6856         task_event->event_id.tid = perf_event_tid(event, task);
6857         task_event->event_id.ptid = perf_event_tid(event, current);
6858
6859         task_event->event_id.time = perf_event_clock(event);
6860
6861         perf_output_put(&handle, task_event->event_id);
6862
6863         perf_event__output_id_sample(event, &handle, &sample);
6864
6865         perf_output_end(&handle);
6866 out:
6867         task_event->event_id.header.size = size;
6868 }
6869
6870 static void perf_event_task(struct task_struct *task,
6871                               struct perf_event_context *task_ctx,
6872                               int new)
6873 {
6874         struct perf_task_event task_event;
6875
6876         if (!atomic_read(&nr_comm_events) &&
6877             !atomic_read(&nr_mmap_events) &&
6878             !atomic_read(&nr_task_events))
6879                 return;
6880
6881         task_event = (struct perf_task_event){
6882                 .task     = task,
6883                 .task_ctx = task_ctx,
6884                 .event_id    = {
6885                         .header = {
6886                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6887                                 .misc = 0,
6888                                 .size = sizeof(task_event.event_id),
6889                         },
6890                         /* .pid  */
6891                         /* .ppid */
6892                         /* .tid  */
6893                         /* .ptid */
6894                         /* .time */
6895                 },
6896         };
6897
6898         perf_iterate_sb(perf_event_task_output,
6899                        &task_event,
6900                        task_ctx);
6901 }
6902
6903 void perf_event_fork(struct task_struct *task)
6904 {
6905         perf_event_task(task, NULL, 1);
6906         perf_event_namespaces(task);
6907 }
6908
6909 /*
6910  * comm tracking
6911  */
6912
6913 struct perf_comm_event {
6914         struct task_struct      *task;
6915         char                    *comm;
6916         int                     comm_size;
6917
6918         struct {
6919                 struct perf_event_header        header;
6920
6921                 u32                             pid;
6922                 u32                             tid;
6923         } event_id;
6924 };
6925
6926 static int perf_event_comm_match(struct perf_event *event)
6927 {
6928         return event->attr.comm;
6929 }
6930
6931 static void perf_event_comm_output(struct perf_event *event,
6932                                    void *data)
6933 {
6934         struct perf_comm_event *comm_event = data;
6935         struct perf_output_handle handle;
6936         struct perf_sample_data sample;
6937         int size = comm_event->event_id.header.size;
6938         int ret;
6939
6940         if (!perf_event_comm_match(event))
6941                 return;
6942
6943         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6944         ret = perf_output_begin(&handle, event,
6945                                 comm_event->event_id.header.size);
6946
6947         if (ret)
6948                 goto out;
6949
6950         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6951         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6952
6953         perf_output_put(&handle, comm_event->event_id);
6954         __output_copy(&handle, comm_event->comm,
6955                                    comm_event->comm_size);
6956
6957         perf_event__output_id_sample(event, &handle, &sample);
6958
6959         perf_output_end(&handle);
6960 out:
6961         comm_event->event_id.header.size = size;
6962 }
6963
6964 static void perf_event_comm_event(struct perf_comm_event *comm_event)
6965 {
6966         char comm[TASK_COMM_LEN];
6967         unsigned int size;
6968
6969         memset(comm, 0, sizeof(comm));
6970         strlcpy(comm, comm_event->task->comm, sizeof(comm));
6971         size = ALIGN(strlen(comm)+1, sizeof(u64));
6972
6973         comm_event->comm = comm;
6974         comm_event->comm_size = size;
6975
6976         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6977
6978         perf_iterate_sb(perf_event_comm_output,
6979                        comm_event,
6980                        NULL);
6981 }
6982
6983 void perf_event_comm(struct task_struct *task, bool exec)
6984 {
6985         struct perf_comm_event comm_event;
6986
6987         if (!atomic_read(&nr_comm_events))
6988                 return;
6989
6990         comm_event = (struct perf_comm_event){
6991                 .task   = task,
6992                 /* .comm      */
6993                 /* .comm_size */
6994                 .event_id  = {
6995                         .header = {
6996                                 .type = PERF_RECORD_COMM,
6997                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6998                                 /* .size */
6999                         },
7000                         /* .pid */
7001                         /* .tid */
7002                 },
7003         };
7004
7005         perf_event_comm_event(&comm_event);
7006 }
7007
7008 /*
7009  * namespaces tracking
7010  */
7011
7012 struct perf_namespaces_event {
7013         struct task_struct              *task;
7014
7015         struct {
7016                 struct perf_event_header        header;
7017
7018                 u32                             pid;
7019                 u32                             tid;
7020                 u64                             nr_namespaces;
7021                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
7022         } event_id;
7023 };
7024
7025 static int perf_event_namespaces_match(struct perf_event *event)
7026 {
7027         return event->attr.namespaces;
7028 }
7029
7030 static void perf_event_namespaces_output(struct perf_event *event,
7031                                          void *data)
7032 {
7033         struct perf_namespaces_event *namespaces_event = data;
7034         struct perf_output_handle handle;
7035         struct perf_sample_data sample;
7036         u16 header_size = namespaces_event->event_id.header.size;
7037         int ret;
7038
7039         if (!perf_event_namespaces_match(event))
7040                 return;
7041
7042         perf_event_header__init_id(&namespaces_event->event_id.header,
7043                                    &sample, event);
7044         ret = perf_output_begin(&handle, event,
7045                                 namespaces_event->event_id.header.size);
7046         if (ret)
7047                 goto out;
7048
7049         namespaces_event->event_id.pid = perf_event_pid(event,
7050                                                         namespaces_event->task);
7051         namespaces_event->event_id.tid = perf_event_tid(event,
7052                                                         namespaces_event->task);
7053
7054         perf_output_put(&handle, namespaces_event->event_id);
7055
7056         perf_event__output_id_sample(event, &handle, &sample);
7057
7058         perf_output_end(&handle);
7059 out:
7060         namespaces_event->event_id.header.size = header_size;
7061 }
7062
7063 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7064                                    struct task_struct *task,
7065                                    const struct proc_ns_operations *ns_ops)
7066 {
7067         struct path ns_path;
7068         struct inode *ns_inode;
7069         void *error;
7070
7071         error = ns_get_path(&ns_path, task, ns_ops);
7072         if (!error) {
7073                 ns_inode = ns_path.dentry->d_inode;
7074                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7075                 ns_link_info->ino = ns_inode->i_ino;
7076                 path_put(&ns_path);
7077         }
7078 }
7079
7080 void perf_event_namespaces(struct task_struct *task)
7081 {
7082         struct perf_namespaces_event namespaces_event;
7083         struct perf_ns_link_info *ns_link_info;
7084
7085         if (!atomic_read(&nr_namespaces_events))
7086                 return;
7087
7088         namespaces_event = (struct perf_namespaces_event){
7089                 .task   = task,
7090                 .event_id  = {
7091                         .header = {
7092                                 .type = PERF_RECORD_NAMESPACES,
7093                                 .misc = 0,
7094                                 .size = sizeof(namespaces_event.event_id),
7095                         },
7096                         /* .pid */
7097                         /* .tid */
7098                         .nr_namespaces = NR_NAMESPACES,
7099                         /* .link_info[NR_NAMESPACES] */
7100                 },
7101         };
7102
7103         ns_link_info = namespaces_event.event_id.link_info;
7104
7105         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7106                                task, &mntns_operations);
7107
7108 #ifdef CONFIG_USER_NS
7109         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7110                                task, &userns_operations);
7111 #endif
7112 #ifdef CONFIG_NET_NS
7113         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7114                                task, &netns_operations);
7115 #endif
7116 #ifdef CONFIG_UTS_NS
7117         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7118                                task, &utsns_operations);
7119 #endif
7120 #ifdef CONFIG_IPC_NS
7121         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7122                                task, &ipcns_operations);
7123 #endif
7124 #ifdef CONFIG_PID_NS
7125         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7126                                task, &pidns_operations);
7127 #endif
7128 #ifdef CONFIG_CGROUPS
7129         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7130                                task, &cgroupns_operations);
7131 #endif
7132
7133         perf_iterate_sb(perf_event_namespaces_output,
7134                         &namespaces_event,
7135                         NULL);
7136 }
7137
7138 /*
7139  * mmap tracking
7140  */
7141
7142 struct perf_mmap_event {
7143         struct vm_area_struct   *vma;
7144
7145         const char              *file_name;
7146         int                     file_size;
7147         int                     maj, min;
7148         u64                     ino;
7149         u64                     ino_generation;
7150         u32                     prot, flags;
7151
7152         struct {
7153                 struct perf_event_header        header;
7154
7155                 u32                             pid;
7156                 u32                             tid;
7157                 u64                             start;
7158                 u64                             len;
7159                 u64                             pgoff;
7160         } event_id;
7161 };
7162
7163 static int perf_event_mmap_match(struct perf_event *event,
7164                                  void *data)
7165 {
7166         struct perf_mmap_event *mmap_event = data;
7167         struct vm_area_struct *vma = mmap_event->vma;
7168         int executable = vma->vm_flags & VM_EXEC;
7169
7170         return (!executable && event->attr.mmap_data) ||
7171                (executable && (event->attr.mmap || event->attr.mmap2));
7172 }
7173
7174 static void perf_event_mmap_output(struct perf_event *event,
7175                                    void *data)
7176 {
7177         struct perf_mmap_event *mmap_event = data;
7178         struct perf_output_handle handle;
7179         struct perf_sample_data sample;
7180         int size = mmap_event->event_id.header.size;
7181         int ret;
7182
7183         if (!perf_event_mmap_match(event, data))
7184                 return;
7185
7186         if (event->attr.mmap2) {
7187                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7188                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7189                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7190                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7191                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7192                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7193                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7194         }
7195
7196         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7197         ret = perf_output_begin(&handle, event,
7198                                 mmap_event->event_id.header.size);
7199         if (ret)
7200                 goto out;
7201
7202         mmap_event->event_id.pid = perf_event_pid(event, current);
7203         mmap_event->event_id.tid = perf_event_tid(event, current);
7204
7205         perf_output_put(&handle, mmap_event->event_id);
7206
7207         if (event->attr.mmap2) {
7208                 perf_output_put(&handle, mmap_event->maj);
7209                 perf_output_put(&handle, mmap_event->min);
7210                 perf_output_put(&handle, mmap_event->ino);
7211                 perf_output_put(&handle, mmap_event->ino_generation);
7212                 perf_output_put(&handle, mmap_event->prot);
7213                 perf_output_put(&handle, mmap_event->flags);
7214         }
7215
7216         __output_copy(&handle, mmap_event->file_name,
7217                                    mmap_event->file_size);
7218
7219         perf_event__output_id_sample(event, &handle, &sample);
7220
7221         perf_output_end(&handle);
7222 out:
7223         mmap_event->event_id.header.size = size;
7224 }
7225
7226 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7227 {
7228         struct vm_area_struct *vma = mmap_event->vma;
7229         struct file *file = vma->vm_file;
7230         int maj = 0, min = 0;
7231         u64 ino = 0, gen = 0;
7232         u32 prot = 0, flags = 0;
7233         unsigned int size;
7234         char tmp[16];
7235         char *buf = NULL;
7236         char *name;
7237
7238         if (vma->vm_flags & VM_READ)
7239                 prot |= PROT_READ;
7240         if (vma->vm_flags & VM_WRITE)
7241                 prot |= PROT_WRITE;
7242         if (vma->vm_flags & VM_EXEC)
7243                 prot |= PROT_EXEC;
7244
7245         if (vma->vm_flags & VM_MAYSHARE)
7246                 flags = MAP_SHARED;
7247         else
7248                 flags = MAP_PRIVATE;
7249
7250         if (vma->vm_flags & VM_DENYWRITE)
7251                 flags |= MAP_DENYWRITE;
7252         if (vma->vm_flags & VM_MAYEXEC)
7253                 flags |= MAP_EXECUTABLE;
7254         if (vma->vm_flags & VM_LOCKED)
7255                 flags |= MAP_LOCKED;
7256         if (vma->vm_flags & VM_HUGETLB)
7257                 flags |= MAP_HUGETLB;
7258
7259         if (file) {
7260                 struct inode *inode;
7261                 dev_t dev;
7262
7263                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7264                 if (!buf) {
7265                         name = "//enomem";
7266                         goto cpy_name;
7267                 }
7268                 /*
7269                  * d_path() works from the end of the rb backwards, so we
7270                  * need to add enough zero bytes after the string to handle
7271                  * the 64bit alignment we do later.
7272                  */
7273                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7274                 if (IS_ERR(name)) {
7275                         name = "//toolong";
7276                         goto cpy_name;
7277                 }
7278                 inode = file_inode(vma->vm_file);
7279                 dev = inode->i_sb->s_dev;
7280                 ino = inode->i_ino;
7281                 gen = inode->i_generation;
7282                 maj = MAJOR(dev);
7283                 min = MINOR(dev);
7284
7285                 goto got_name;
7286         } else {
7287                 if (vma->vm_ops && vma->vm_ops->name) {
7288                         name = (char *) vma->vm_ops->name(vma);
7289                         if (name)
7290                                 goto cpy_name;
7291                 }
7292
7293                 name = (char *)arch_vma_name(vma);
7294                 if (name)
7295                         goto cpy_name;
7296
7297                 if (vma->vm_start <= vma->vm_mm->start_brk &&
7298                                 vma->vm_end >= vma->vm_mm->brk) {
7299                         name = "[heap]";
7300                         goto cpy_name;
7301                 }
7302                 if (vma->vm_start <= vma->vm_mm->start_stack &&
7303                                 vma->vm_end >= vma->vm_mm->start_stack) {
7304                         name = "[stack]";
7305                         goto cpy_name;
7306                 }
7307
7308                 name = "//anon";
7309                 goto cpy_name;
7310         }
7311
7312 cpy_name:
7313         strlcpy(tmp, name, sizeof(tmp));
7314         name = tmp;
7315 got_name:
7316         /*
7317          * Since our buffer works in 8 byte units we need to align our string
7318          * size to a multiple of 8. However, we must guarantee the tail end is
7319          * zero'd out to avoid leaking random bits to userspace.
7320          */
7321         size = strlen(name)+1;
7322         while (!IS_ALIGNED(size, sizeof(u64)))
7323                 name[size++] = '\0';
7324
7325         mmap_event->file_name = name;
7326         mmap_event->file_size = size;
7327         mmap_event->maj = maj;
7328         mmap_event->min = min;
7329         mmap_event->ino = ino;
7330         mmap_event->ino_generation = gen;
7331         mmap_event->prot = prot;
7332         mmap_event->flags = flags;
7333
7334         if (!(vma->vm_flags & VM_EXEC))
7335                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7336
7337         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7338
7339         perf_iterate_sb(perf_event_mmap_output,
7340                        mmap_event,
7341                        NULL);
7342
7343         kfree(buf);
7344 }
7345
7346 /*
7347  * Check whether inode and address range match filter criteria.
7348  */
7349 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7350                                      struct file *file, unsigned long offset,
7351                                      unsigned long size)
7352 {
7353         /* d_inode(NULL) won't be equal to any mapped user-space file */
7354         if (!filter->path.dentry)
7355                 return false;
7356
7357         if (d_inode(filter->path.dentry) != file_inode(file))
7358                 return false;
7359
7360         if (filter->offset > offset + size)
7361                 return false;
7362
7363         if (filter->offset + filter->size < offset)
7364                 return false;
7365
7366         return true;
7367 }
7368
7369 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7370 {
7371         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7372         struct vm_area_struct *vma = data;
7373         unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7374         struct file *file = vma->vm_file;
7375         struct perf_addr_filter *filter;
7376         unsigned int restart = 0, count = 0;
7377
7378         if (!has_addr_filter(event))
7379                 return;
7380
7381         if (!file)
7382                 return;
7383
7384         raw_spin_lock_irqsave(&ifh->lock, flags);
7385         list_for_each_entry(filter, &ifh->list, entry) {
7386                 if (perf_addr_filter_match(filter, file, off,
7387                                              vma->vm_end - vma->vm_start)) {
7388                         event->addr_filters_offs[count] = vma->vm_start;
7389                         restart++;
7390                 }
7391
7392                 count++;
7393         }
7394
7395         if (restart)
7396                 event->addr_filters_gen++;
7397         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7398
7399         if (restart)
7400                 perf_event_stop(event, 1);
7401 }
7402
7403 /*
7404  * Adjust all task's events' filters to the new vma
7405  */
7406 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7407 {
7408         struct perf_event_context *ctx;
7409         int ctxn;
7410
7411         /*
7412          * Data tracing isn't supported yet and as such there is no need
7413          * to keep track of anything that isn't related to executable code:
7414          */
7415         if (!(vma->vm_flags & VM_EXEC))
7416                 return;
7417
7418         rcu_read_lock();
7419         for_each_task_context_nr(ctxn) {
7420                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7421                 if (!ctx)
7422                         continue;
7423
7424                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7425         }
7426         rcu_read_unlock();
7427 }
7428
7429 void perf_event_mmap(struct vm_area_struct *vma)
7430 {
7431         struct perf_mmap_event mmap_event;
7432
7433         if (!atomic_read(&nr_mmap_events))
7434                 return;
7435
7436         mmap_event = (struct perf_mmap_event){
7437                 .vma    = vma,
7438                 /* .file_name */
7439                 /* .file_size */
7440                 .event_id  = {
7441                         .header = {
7442                                 .type = PERF_RECORD_MMAP,
7443                                 .misc = PERF_RECORD_MISC_USER,
7444                                 /* .size */
7445                         },
7446                         /* .pid */
7447                         /* .tid */
7448                         .start  = vma->vm_start,
7449                         .len    = vma->vm_end - vma->vm_start,
7450                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
7451                 },
7452                 /* .maj (attr_mmap2 only) */
7453                 /* .min (attr_mmap2 only) */
7454                 /* .ino (attr_mmap2 only) */
7455                 /* .ino_generation (attr_mmap2 only) */
7456                 /* .prot (attr_mmap2 only) */
7457                 /* .flags (attr_mmap2 only) */
7458         };
7459
7460         perf_addr_filters_adjust(vma);
7461         perf_event_mmap_event(&mmap_event);
7462 }
7463
7464 void perf_event_aux_event(struct perf_event *event, unsigned long head,
7465                           unsigned long size, u64 flags)
7466 {
7467         struct perf_output_handle handle;
7468         struct perf_sample_data sample;
7469         struct perf_aux_event {
7470                 struct perf_event_header        header;
7471                 u64                             offset;
7472                 u64                             size;
7473                 u64                             flags;
7474         } rec = {
7475                 .header = {
7476                         .type = PERF_RECORD_AUX,
7477                         .misc = 0,
7478                         .size = sizeof(rec),
7479                 },
7480                 .offset         = head,
7481                 .size           = size,
7482                 .flags          = flags,
7483         };
7484         int ret;
7485
7486         perf_event_header__init_id(&rec.header, &sample, event);
7487         ret = perf_output_begin(&handle, event, rec.header.size);
7488
7489         if (ret)
7490                 return;
7491
7492         perf_output_put(&handle, rec);
7493         perf_event__output_id_sample(event, &handle, &sample);
7494
7495         perf_output_end(&handle);
7496 }
7497
7498 /*
7499  * Lost/dropped samples logging
7500  */
7501 void perf_log_lost_samples(struct perf_event *event, u64 lost)
7502 {
7503         struct perf_output_handle handle;
7504         struct perf_sample_data sample;
7505         int ret;
7506
7507         struct {
7508                 struct perf_event_header        header;
7509                 u64                             lost;
7510         } lost_samples_event = {
7511                 .header = {
7512                         .type = PERF_RECORD_LOST_SAMPLES,
7513                         .misc = 0,
7514                         .size = sizeof(lost_samples_event),
7515                 },
7516                 .lost           = lost,
7517         };
7518
7519         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7520
7521         ret = perf_output_begin(&handle, event,
7522                                 lost_samples_event.header.size);
7523         if (ret)
7524                 return;
7525
7526         perf_output_put(&handle, lost_samples_event);
7527         perf_event__output_id_sample(event, &handle, &sample);
7528         perf_output_end(&handle);
7529 }
7530
7531 /*
7532  * context_switch tracking
7533  */
7534
7535 struct perf_switch_event {
7536         struct task_struct      *task;
7537         struct task_struct      *next_prev;
7538
7539         struct {
7540                 struct perf_event_header        header;
7541                 u32                             next_prev_pid;
7542                 u32                             next_prev_tid;
7543         } event_id;
7544 };
7545
7546 static int perf_event_switch_match(struct perf_event *event)
7547 {
7548         return event->attr.context_switch;
7549 }
7550
7551 static void perf_event_switch_output(struct perf_event *event, void *data)
7552 {
7553         struct perf_switch_event *se = data;
7554         struct perf_output_handle handle;
7555         struct perf_sample_data sample;
7556         int ret;
7557
7558         if (!perf_event_switch_match(event))
7559                 return;
7560
7561         /* Only CPU-wide events are allowed to see next/prev pid/tid */
7562         if (event->ctx->task) {
7563                 se->event_id.header.type = PERF_RECORD_SWITCH;
7564                 se->event_id.header.size = sizeof(se->event_id.header);
7565         } else {
7566                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7567                 se->event_id.header.size = sizeof(se->event_id);
7568                 se->event_id.next_prev_pid =
7569                                         perf_event_pid(event, se->next_prev);
7570                 se->event_id.next_prev_tid =
7571                                         perf_event_tid(event, se->next_prev);
7572         }
7573
7574         perf_event_header__init_id(&se->event_id.header, &sample, event);
7575
7576         ret = perf_output_begin(&handle, event, se->event_id.header.size);
7577         if (ret)
7578                 return;
7579
7580         if (event->ctx->task)
7581                 perf_output_put(&handle, se->event_id.header);
7582         else
7583                 perf_output_put(&handle, se->event_id);
7584
7585         perf_event__output_id_sample(event, &handle, &sample);
7586
7587         perf_output_end(&handle);
7588 }
7589
7590 static void perf_event_switch(struct task_struct *task,
7591                               struct task_struct *next_prev, bool sched_in)
7592 {
7593         struct perf_switch_event switch_event;
7594
7595         /* N.B. caller checks nr_switch_events != 0 */
7596
7597         switch_event = (struct perf_switch_event){
7598                 .task           = task,
7599                 .next_prev      = next_prev,
7600                 .event_id       = {
7601                         .header = {
7602                                 /* .type */
7603                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7604                                 /* .size */
7605                         },
7606                         /* .next_prev_pid */
7607                         /* .next_prev_tid */
7608                 },
7609         };
7610
7611         if (!sched_in && task->state == TASK_RUNNING)
7612                 switch_event.event_id.header.misc |=
7613                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
7614
7615         perf_iterate_sb(perf_event_switch_output,
7616                        &switch_event,
7617                        NULL);
7618 }
7619
7620 /*
7621  * IRQ throttle logging
7622  */
7623
7624 static void perf_log_throttle(struct perf_event *event, int enable)
7625 {
7626         struct perf_output_handle handle;
7627         struct perf_sample_data sample;
7628         int ret;
7629
7630         struct {
7631                 struct perf_event_header        header;
7632                 u64                             time;
7633                 u64                             id;
7634                 u64                             stream_id;
7635         } throttle_event = {
7636                 .header = {
7637                         .type = PERF_RECORD_THROTTLE,
7638                         .misc = 0,
7639                         .size = sizeof(throttle_event),
7640                 },
7641                 .time           = perf_event_clock(event),
7642                 .id             = primary_event_id(event),
7643                 .stream_id      = event->id,
7644         };
7645
7646         if (enable)
7647                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7648
7649         perf_event_header__init_id(&throttle_event.header, &sample, event);
7650
7651         ret = perf_output_begin(&handle, event,
7652                                 throttle_event.header.size);
7653         if (ret)
7654                 return;
7655
7656         perf_output_put(&handle, throttle_event);
7657         perf_event__output_id_sample(event, &handle, &sample);
7658         perf_output_end(&handle);
7659 }
7660
7661 /*
7662  * ksymbol register/unregister tracking
7663  */
7664
7665 struct perf_ksymbol_event {
7666         const char      *name;
7667         int             name_len;
7668         struct {
7669                 struct perf_event_header        header;
7670                 u64                             addr;
7671                 u32                             len;
7672                 u16                             ksym_type;
7673                 u16                             flags;
7674         } event_id;
7675 };
7676
7677 static int perf_event_ksymbol_match(struct perf_event *event)
7678 {
7679         return event->attr.ksymbol;
7680 }
7681
7682 static void perf_event_ksymbol_output(struct perf_event *event, void *data)
7683 {
7684         struct perf_ksymbol_event *ksymbol_event = data;
7685         struct perf_output_handle handle;
7686         struct perf_sample_data sample;
7687         int ret;
7688
7689         if (!perf_event_ksymbol_match(event))
7690                 return;
7691
7692         perf_event_header__init_id(&ksymbol_event->event_id.header,
7693                                    &sample, event);
7694         ret = perf_output_begin(&handle, event,
7695                                 ksymbol_event->event_id.header.size);
7696         if (ret)
7697                 return;
7698
7699         perf_output_put(&handle, ksymbol_event->event_id);
7700         __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
7701         perf_event__output_id_sample(event, &handle, &sample);
7702
7703         perf_output_end(&handle);
7704 }
7705
7706 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
7707                         const char *sym)
7708 {
7709         struct perf_ksymbol_event ksymbol_event;
7710         char name[KSYM_NAME_LEN];
7711         u16 flags = 0;
7712         int name_len;
7713
7714         if (!atomic_read(&nr_ksymbol_events))
7715                 return;
7716
7717         if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
7718             ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
7719                 goto err;
7720
7721         strlcpy(name, sym, KSYM_NAME_LEN);
7722         name_len = strlen(name) + 1;
7723         while (!IS_ALIGNED(name_len, sizeof(u64)))
7724                 name[name_len++] = '\0';
7725         BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
7726
7727         if (unregister)
7728                 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
7729
7730         ksymbol_event = (struct perf_ksymbol_event){
7731                 .name = name,
7732                 .name_len = name_len,
7733                 .event_id = {
7734                         .header = {
7735                                 .type = PERF_RECORD_KSYMBOL,
7736                                 .size = sizeof(ksymbol_event.event_id) +
7737                                         name_len,
7738                         },
7739                         .addr = addr,
7740                         .len = len,
7741                         .ksym_type = ksym_type,
7742                         .flags = flags,
7743                 },
7744         };
7745
7746         perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
7747         return;
7748 err:
7749         WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
7750 }
7751
7752 /*
7753  * bpf program load/unload tracking
7754  */
7755
7756 struct perf_bpf_event {
7757         struct bpf_prog *prog;
7758         struct {
7759                 struct perf_event_header        header;
7760                 u16                             type;
7761                 u16                             flags;
7762                 u32                             id;
7763                 u8                              tag[BPF_TAG_SIZE];
7764         } event_id;
7765 };
7766
7767 static int perf_event_bpf_match(struct perf_event *event)
7768 {
7769         return event->attr.bpf_event;
7770 }
7771
7772 static void perf_event_bpf_output(struct perf_event *event, void *data)
7773 {
7774         struct perf_bpf_event *bpf_event = data;
7775         struct perf_output_handle handle;
7776         struct perf_sample_data sample;
7777         int ret;
7778
7779         if (!perf_event_bpf_match(event))
7780                 return;
7781
7782         perf_event_header__init_id(&bpf_event->event_id.header,
7783                                    &sample, event);
7784         ret = perf_output_begin(&handle, event,
7785                                 bpf_event->event_id.header.size);
7786         if (ret)
7787                 return;
7788
7789         perf_output_put(&handle, bpf_event->event_id);
7790         perf_event__output_id_sample(event, &handle, &sample);
7791
7792         perf_output_end(&handle);
7793 }
7794
7795 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
7796                                          enum perf_bpf_event_type type)
7797 {
7798         bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
7799         char sym[KSYM_NAME_LEN];
7800         int i;
7801
7802         if (prog->aux->func_cnt == 0) {
7803                 bpf_get_prog_name(prog, sym);
7804                 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
7805                                    (u64)(unsigned long)prog->bpf_func,
7806                                    prog->jited_len, unregister, sym);
7807         } else {
7808                 for (i = 0; i < prog->aux->func_cnt; i++) {
7809                         struct bpf_prog *subprog = prog->aux->func[i];
7810
7811                         bpf_get_prog_name(subprog, sym);
7812                         perf_event_ksymbol(
7813                                 PERF_RECORD_KSYMBOL_TYPE_BPF,
7814                                 (u64)(unsigned long)subprog->bpf_func,
7815                                 subprog->jited_len, unregister, sym);
7816                 }
7817         }
7818 }
7819
7820 void perf_event_bpf_event(struct bpf_prog *prog,
7821                           enum perf_bpf_event_type type,
7822                           u16 flags)
7823 {
7824         struct perf_bpf_event bpf_event;
7825
7826         if (type <= PERF_BPF_EVENT_UNKNOWN ||
7827             type >= PERF_BPF_EVENT_MAX)
7828                 return;
7829
7830         switch (type) {
7831         case PERF_BPF_EVENT_PROG_LOAD:
7832         case PERF_BPF_EVENT_PROG_UNLOAD:
7833                 if (atomic_read(&nr_ksymbol_events))
7834                         perf_event_bpf_emit_ksymbols(prog, type);
7835                 break;
7836         default:
7837                 break;
7838         }
7839
7840         if (!atomic_read(&nr_bpf_events))
7841                 return;
7842
7843         bpf_event = (struct perf_bpf_event){
7844                 .prog = prog,
7845                 .event_id = {
7846                         .header = {
7847                                 .type = PERF_RECORD_BPF_EVENT,
7848                                 .size = sizeof(bpf_event.event_id),
7849                         },
7850                         .type = type,
7851                         .flags = flags,
7852                         .id = prog->aux->id,
7853                 },
7854         };
7855
7856         BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
7857
7858         memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
7859         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
7860 }
7861
7862 void perf_event_itrace_started(struct perf_event *event)
7863 {
7864         event->attach_state |= PERF_ATTACH_ITRACE;
7865 }
7866
7867 static void perf_log_itrace_start(struct perf_event *event)
7868 {
7869         struct perf_output_handle handle;
7870         struct perf_sample_data sample;
7871         struct perf_aux_event {
7872                 struct perf_event_header        header;
7873                 u32                             pid;
7874                 u32                             tid;
7875         } rec;
7876         int ret;
7877
7878         if (event->parent)
7879                 event = event->parent;
7880
7881         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7882             event->attach_state & PERF_ATTACH_ITRACE)
7883                 return;
7884
7885         rec.header.type = PERF_RECORD_ITRACE_START;
7886         rec.header.misc = 0;
7887         rec.header.size = sizeof(rec);
7888         rec.pid = perf_event_pid(event, current);
7889         rec.tid = perf_event_tid(event, current);
7890
7891         perf_event_header__init_id(&rec.header, &sample, event);
7892         ret = perf_output_begin(&handle, event, rec.header.size);
7893
7894         if (ret)
7895                 return;
7896
7897         perf_output_put(&handle, rec);
7898         perf_event__output_id_sample(event, &handle, &sample);
7899
7900         perf_output_end(&handle);
7901 }
7902
/*
 * Account one interrupt against @event and, when @throttle is non-zero,
 * throttle the event once it reaches max_samples_per_tick interrupts
 * within the current per-CPU throttle sequence.
 *
 * For frequency-based events (attr.freq) the sampling period is also
 * re-tuned from the time elapsed since the previous interrupt.
 *
 * Returns 1 if this call throttled the event, 0 otherwise.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
	struct hw_perf_event *hwc = &event->hw;
	int ret = 0;
	u64 seq;

	/* A changed per-CPU sequence number restarts this event's count. */
	seq = __this_cpu_read(perf_throttled_seq);
	if (seq != hwc->interrupts_seq) {
		hwc->interrupts_seq = seq;
		hwc->interrupts = 1;
	} else {
		hwc->interrupts++;
		if (unlikely(throttle
			     && hwc->interrupts >= max_samples_per_tick)) {
			__this_cpu_inc(perf_throttled_count);
			tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
			/* MAX_INTERRUPTS is the "throttled" marker value */
			hwc->interrupts = MAX_INTERRUPTS;
			perf_log_throttle(event, 0);
			ret = 1;
		}
	}

	if (event->attr.freq) {
		u64 now = perf_clock();
		s64 delta = now - hwc->freq_time_stamp;

		hwc->freq_time_stamp = now;

		/* only adjust for a positive gap shorter than two ticks */
		if (delta > 0 && delta < 2*TICK_NSEC)
			perf_adjust_period(event, delta, hwc->last_period, true);
	}

	return ret;
}
7938
/*
 * Account an interrupt for @event with throttling enabled.
 * Returns 1 if the event was throttled by this call, 0 otherwise.
 */
int perf_event_account_interrupt(struct perf_event *event)
{
	return __perf_event_account_interrupt(event, 1);
}
7943
7944 /*
7945  * Generic event overflow handling, sampling.
7946  */
7947
/*
 * Handle one overflow of a sampling @event.
 *
 * @throttle: passed to __perf_event_account_interrupt(); non-zero allows
 *            the event to be throttled.
 * @data:     sample data handed to the overflow handler.
 * @regs:     register state handed to the overflow handler.
 *
 * Returns non-zero when the event was throttled or when its event_limit
 * dropped to zero (in which case the event is disabled); 0 otherwise,
 * including for non-sampling events, which are ignored.
 */
static int __perf_event_overflow(struct perf_event *event,
				   int throttle, struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	int events = atomic_read(&event->event_limit);
	int ret = 0;

	/*
	 * Non-sampling counters might still use the PMI to fold short
	 * hardware counters, ignore those.
	 */
	if (unlikely(!is_sampling_event(event)))
		return 0;

	ret = __perf_event_account_interrupt(event, throttle);

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * events
	 */

	event->pending_kill = POLL_IN;
	/* a limit was set and this overflow used up the last slot */
	if (events && atomic_dec_and_test(&event->event_limit)) {
		ret = 1;
		event->pending_kill = POLL_HUP;

		perf_event_disable_inatomic(event);
	}

	/* the handler pointer may be swapped at runtime, hence READ_ONCE() */
	READ_ONCE(event->overflow_handler)(event, data, regs);

	/* defer the fasync notification to irq_work */
	if (*perf_event_fasync(event) && event->pending_kill) {
		event->pending_wakeup = 1;
		irq_work_queue(&event->pending);
	}

	return ret;
}
7986
/*
 * Process an overflow of @event with throttling enabled.
 * Returns the result of __perf_event_overflow(): non-zero when the event
 * was throttled or hit its event limit.
 */
int perf_event_overflow(struct perf_event *event,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	return __perf_event_overflow(event, 1, data, regs);
}
7993
7994 /*
7995  * Generic software event infrastructure
7996  */
7997
/*
 * Per-CPU state of the software event hash table.
 */
struct swevent_htable {
	/* RCU-protected bucket array; NULL while not allocated */
	struct swevent_hlist		*swevent_hlist;
	/* held while allocating/releasing swevent_hlist and changing the refcount */
	struct mutex			hlist_mutex;
	/* users of this CPU's hlist; the hlist is released when it hits zero */
	int				hlist_refcount;

	/* Recursion avoidance in each context */
	int				recursion[PERF_NR_CONTEXTS];
};

/* One table instance per CPU. */
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
8008
8009 /*
8010  * We directly increment event->count and keep a second value in
8011  * event->hw.period_left to count intervals. This period event
8012  * is kept in the range [-sample_period, 0] so that we can use the
8013  * sign as trigger.
8014  */
8015
/*
 * Re-arm the period counter of @event and report how many whole periods
 * have elapsed.
 *
 * The previous hwc->last_period is used as the period for this
 * computation, and hwc->sample_period is latched into last_period for the
 * next round. If period_left is still negative, no period has completed
 * and 0 is returned. Otherwise period_left is moved back below zero by a
 * whole number of periods and that number is returned.
 */
u64 perf_swevent_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 period = hwc->last_period;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;

again:
	old = val = local64_read(&hwc->period_left);
	if (val < 0)
		return 0;

	/* number of periods needed to bring period_left below zero again */
	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	/* period_left changed under us: recompute from the fresh value */
	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;

	return nr;
}
8038
8039 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
8040                                     struct perf_sample_data *data,
8041                                     struct pt_regs *regs)
8042 {
8043         struct hw_perf_event *hwc = &event->hw;
8044         int throttle = 0;
8045
8046         if (!overflow)
8047                 overflow = perf_swevent_set_period(event);
8048
8049         if (hwc->interrupts == MAX_INTERRUPTS)
8050                 return;
8051
8052         for (; overflow; overflow--) {
8053                 if (__perf_event_overflow(event, throttle,
8054                                             data, regs)) {
8055                         /*
8056                          * We inhibit the overflow from happening when
8057                          * hwc->interrupts == MAX_INTERRUPTS.
8058                          */
8059                         break;
8060                 }
8061                 throttle = 1;
8062         }
8063 }
8064
8065 static void perf_swevent_event(struct perf_event *event, u64 nr,
8066                                struct perf_sample_data *data,
8067                                struct pt_regs *regs)
8068 {
8069         struct hw_perf_event *hwc = &event->hw;
8070
8071         local64_add(nr, &event->count);
8072
8073         if (!regs)
8074                 return;
8075
8076         if (!is_sampling_event(event))
8077                 return;
8078
8079         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8080                 data->period = nr;
8081                 return perf_swevent_overflow(event, 1, data, regs);
8082         } else
8083                 data->period = event->hw.last_period;
8084
8085         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8086                 return perf_swevent_overflow(event, 1, data, regs);
8087
8088         if (local64_add_negative(nr, &hwc->period_left))
8089                 return;
8090
8091         perf_swevent_overflow(event, 0, data, regs);
8092 }
8093
8094 static int perf_exclude_event(struct perf_event *event,
8095                               struct pt_regs *regs)
8096 {
8097         if (event->hw.state & PERF_HES_STOPPED)
8098                 return 1;
8099
8100         if (regs) {
8101                 if (event->attr.exclude_user && user_mode(regs))
8102                         return 1;
8103
8104                 if (event->attr.exclude_kernel && !user_mode(regs))
8105                         return 1;
8106         }
8107
8108         return 0;
8109 }
8110
8111 static int perf_swevent_match(struct perf_event *event,
8112                                 enum perf_type_id type,
8113                                 u32 event_id,
8114                                 struct perf_sample_data *data,
8115                                 struct pt_regs *regs)
8116 {
8117         if (event->attr.type != type)
8118                 return 0;
8119
8120         if (event->attr.config != event_id)
8121                 return 0;
8122
8123         if (perf_exclude_event(event, regs))
8124                 return 0;
8125
8126         return 1;
8127 }
8128
8129 static inline u64 swevent_hash(u64 type, u32 event_id)
8130 {
8131         u64 val = event_id | (type << 32);
8132
8133         return hash_64(val, SWEVENT_HLIST_BITS);
8134 }
8135
8136 static inline struct hlist_head *
8137 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
8138 {
8139         u64 hash = swevent_hash(type, event_id);
8140
8141         return &hlist->heads[hash];
8142 }
8143
8144 /* For the read side: events when they trigger */
8145 static inline struct hlist_head *
8146 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
8147 {
8148         struct swevent_hlist *hlist;
8149
8150         hlist = rcu_dereference(swhash->swevent_hlist);
8151         if (!hlist)
8152                 return NULL;
8153
8154         return __find_swevent_head(hlist, type, event_id);
8155 }
8156
8157 /* For the event head insertion and removal in the hlist */
8158 static inline struct hlist_head *
8159 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
8160 {
8161         struct swevent_hlist *hlist;
8162         u32 event_id = event->attr.config;
8163         u64 type = event->attr.type;
8164
8165         /*
8166          * Event scheduling is always serialized against hlist allocation
8167          * and release. Which makes the protected version suitable here.
8168          * The context lock guarantees that.
8169          */
8170         hlist = rcu_dereference_protected(swhash->swevent_hlist,
8171                                           lockdep_is_held(&event->ctx->lock));
8172         if (!hlist)
8173                 return NULL;
8174
8175         return __find_swevent_head(hlist, type, event_id);
8176 }
8177
8178 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
8179                                     u64 nr,
8180                                     struct perf_sample_data *data,
8181                                     struct pt_regs *regs)
8182 {
8183         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8184         struct perf_event *event;
8185         struct hlist_head *head;
8186
8187         rcu_read_lock();
8188         head = find_swevent_head_rcu(swhash, type, event_id);
8189         if (!head)
8190                 goto end;
8191
8192         hlist_for_each_entry_rcu(event, head, hlist_entry) {
8193                 if (perf_swevent_match(event, type, event_id, data, regs))
8194                         perf_swevent_event(event, nr, data, regs);
8195         }
8196 end:
8197         rcu_read_unlock();
8198 }
8199
/*
 * Per-CPU array of four pt_regs slots.
 * NOTE(review): presumably one slot per recursion context; confirm
 * against the users of __perf_regs.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
8201
8202 int perf_swevent_get_recursion_context(void)
8203 {
8204         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8205
8206         return get_recursion_context(swhash->recursion);
8207 }
8208 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
8209
8210 void perf_swevent_put_recursion_context(int rctx)
8211 {
8212         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8213
8214         put_recursion_context(swhash->recursion, rctx);
8215 }
8216
/*
 * Deliver @nr occurrences of PERF_TYPE_SOFTWARE event @event_id, with
 * @addr recorded in the sample data.
 *
 * Takes no recursion context and does not touch preemption itself; see
 * __perf_sw_event() for the variant that does both. A NULL @regs
 * triggers a one-time warning and the event is dropped.
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
	struct perf_sample_data data;

	if (WARN_ON_ONCE(!regs))
		return;

	perf_sample_data_init(&data, addr, 0);
	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
}
8227
8228 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8229 {
8230         int rctx;
8231
8232         preempt_disable_notrace();
8233         rctx = perf_swevent_get_recursion_context();
8234         if (unlikely(rctx < 0))
8235                 goto fail;
8236
8237         ___perf_sw_event(event_id, nr, regs, addr);
8238
8239         perf_swevent_put_recursion_context(rctx);
8240 fail:
8241         preempt_enable_notrace();
8242 }
8243
/*
 * ->read() callback shared by the software-style PMUs in this file.
 * Intentionally empty: perf_swevent_event() adds to event->count directly,
 * so there is nothing to fold in here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
8247
8248 static int perf_swevent_add(struct perf_event *event, int flags)
8249 {
8250         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8251         struct hw_perf_event *hwc = &event->hw;
8252         struct hlist_head *head;
8253
8254         if (is_sampling_event(event)) {
8255                 hwc->last_period = hwc->sample_period;
8256                 perf_swevent_set_period(event);
8257         }
8258
8259         hwc->state = !(flags & PERF_EF_START);
8260
8261         head = find_swevent_head(swhash, event);
8262         if (WARN_ON_ONCE(!head))
8263                 return -EINVAL;
8264
8265         hlist_add_head_rcu(&event->hlist_entry, head);
8266         perf_event_update_userpage(event);
8267
8268         return 0;
8269 }
8270
/*
 * ->del() callback: unlink @event from its hash bucket using the RCU-safe
 * list removal. @flags is not used.
 */
static void perf_swevent_del(struct perf_event *event, int flags)
{
	hlist_del_rcu(&event->hlist_entry);
}
8275
/*
 * ->start() callback: clear hw.state so the event is no longer marked
 * PERF_HES_STOPPED. @flags is not used.
 */
static void perf_swevent_start(struct perf_event *event, int flags)
{
	event->hw.state = 0;
}
8280
/*
 * ->stop() callback: mark the event PERF_HES_STOPPED, which makes the
 * match helpers in this file skip it. @flags is not used.
 */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED;
}
8285
/*
 * Deref the hlist from the update side.
 *
 * The lockdep expression documents that callers hold swhash->hlist_mutex.
 * May return NULL when no hlist is installed.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
	return rcu_dereference_protected(swhash->swevent_hlist,
					 lockdep_is_held(&swhash->hlist_mutex));
}
8293
8294 static void swevent_hlist_release(struct swevent_htable *swhash)
8295 {
8296         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
8297
8298         if (!hlist)
8299                 return;
8300
8301         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
8302         kfree_rcu(hlist, rcu_head);
8303 }
8304
8305 static void swevent_hlist_put_cpu(int cpu)
8306 {
8307         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8308
8309         mutex_lock(&swhash->hlist_mutex);
8310
8311         if (!--swhash->hlist_refcount)
8312                 swevent_hlist_release(swhash);
8313
8314         mutex_unlock(&swhash->hlist_mutex);
8315 }
8316
/*
 * Drop one hlist reference on every possible CPU; the counterpart of
 * swevent_hlist_get().
 */
static void swevent_hlist_put(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		swevent_hlist_put_cpu(cpu);
}
8324
8325 static int swevent_hlist_get_cpu(int cpu)
8326 {
8327         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8328         int err = 0;
8329
8330         mutex_lock(&swhash->hlist_mutex);
8331         if (!swevent_hlist_deref(swhash) &&
8332             cpumask_test_cpu(cpu, perf_online_mask)) {
8333                 struct swevent_hlist *hlist;
8334
8335                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
8336                 if (!hlist) {
8337                         err = -ENOMEM;
8338                         goto exit;
8339                 }
8340                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8341         }
8342         swhash->hlist_refcount++;
8343 exit:
8344         mutex_unlock(&swhash->hlist_mutex);
8345
8346         return err;
8347 }
8348
8349 static int swevent_hlist_get(void)
8350 {
8351         int err, cpu, failed_cpu;
8352
8353         mutex_lock(&pmus_lock);
8354         for_each_possible_cpu(cpu) {
8355                 err = swevent_hlist_get_cpu(cpu);
8356                 if (err) {
8357                         failed_cpu = cpu;
8358                         goto fail;
8359                 }
8360         }
8361         mutex_unlock(&pmus_lock);
8362         return 0;
8363 fail:
8364         for_each_possible_cpu(cpu) {
8365                 if (cpu == failed_cpu)
8366                         break;
8367                 swevent_hlist_put_cpu(cpu);
8368         }
8369         mutex_unlock(&pmus_lock);
8370         return err;
8371 }
8372
/*
 * One static key per software event id. perf_swevent_init() increments
 * the key of a newly created top-level event and sw_perf_event_destroy()
 * decrements it again.
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
8374
/*
 * ->destroy() callback for software events: undo what perf_swevent_init()
 * did for a top-level event. It drops the static key of the event id and
 * the per-CPU hlist references. Warns if called for an inherited event.
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;

	WARN_ON(event->parent);

	static_key_slow_dec(&perf_swevent_enabled[event_id]);
	swevent_hlist_put();
}
8384
8385 static int perf_swevent_init(struct perf_event *event)
8386 {
8387         u64 event_id = event->attr.config;
8388
8389         if (event->attr.type != PERF_TYPE_SOFTWARE)
8390                 return -ENOENT;
8391
8392         /*
8393          * no branch sampling for software events
8394          */
8395         if (has_branch_stack(event))
8396                 return -EOPNOTSUPP;
8397
8398         switch (event_id) {
8399         case PERF_COUNT_SW_CPU_CLOCK:
8400         case PERF_COUNT_SW_TASK_CLOCK:
8401                 return -ENOENT;
8402
8403         default:
8404                 break;
8405         }
8406
8407         if (event_id >= PERF_COUNT_SW_MAX)
8408                 return -ENOENT;
8409
8410         if (!event->parent) {
8411                 int err;
8412
8413                 err = swevent_hlist_get();
8414                 if (err)
8415                         return err;
8416
8417                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
8418                 event->destroy = sw_perf_event_destroy;
8419         }
8420
8421         return 0;
8422 }
8423
/*
 * PMU descriptor for PERF_TYPE_SOFTWARE events (see perf_swevent_init()).
 * Events live in the software task context.
 */
static struct pmu perf_swevent = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= perf_swevent_init,
	.add		= perf_swevent_add,
	.del		= perf_swevent_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};
8436
8437 #ifdef CONFIG_EVENT_TRACING
8438
8439 static int perf_tp_filter_match(struct perf_event *event,
8440                                 struct perf_sample_data *data)
8441 {
8442         void *record = data->raw->frag.data;
8443
8444         /* only top level events have filters set */
8445         if (event->parent)
8446                 event = event->parent;
8447
8448         if (likely(!event->filter) || filter_match_preds(event->filter, record))
8449                 return 1;
8450         return 0;
8451 }
8452
8453 static int perf_tp_event_match(struct perf_event *event,
8454                                 struct perf_sample_data *data,
8455                                 struct pt_regs *regs)
8456 {
8457         if (event->hw.state & PERF_HES_STOPPED)
8458                 return 0;
8459         /*
8460          * All tracepoints are from kernel-space.
8461          */
8462         if (event->attr.exclude_kernel)
8463                 return 0;
8464
8465         if (!perf_tp_filter_match(event, data))
8466                 return 0;
8467
8468         return 1;
8469 }
8470
8471 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8472                                struct trace_event_call *call, u64 count,
8473                                struct pt_regs *regs, struct hlist_head *head,
8474                                struct task_struct *task)
8475 {
8476         if (bpf_prog_array_valid(call)) {
8477                 *(struct pt_regs **)raw_data = regs;
8478                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
8479                         perf_swevent_put_recursion_context(rctx);
8480                         return;
8481                 }
8482         }
8483         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8484                       rctx, task);
8485 }
8486 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8487
/*
 * Deliver a tracepoint record to perf.
 *
 * @event_type: tracepoint type, passed to perf_trace_buf_update().
 * @count:      number of occurrences to account.
 * @record:     raw tracepoint record; exposed to samples as raw data.
 * @entry_size: size of @record in bytes.
 * @regs:       register state for the samples.
 * @head:       list of events attached to this tracepoint.
 * @rctx:       recursion context held by the caller; released here.
 * @task:       optional target task; when set and different from current,
 *              the matching tracepoint events in that task's software
 *              context are fed as well.
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
		   struct pt_regs *regs, struct hlist_head *head, int rctx,
		   struct task_struct *task)
{
	struct perf_sample_data data;
	struct perf_event *event;

	struct perf_raw_record raw = {
		.frag = {
			.size = entry_size,
			.data = record,
		},
	};

	perf_sample_data_init(&data, 0, 0);
	data.raw = &raw;

	perf_trace_buf_update(record, event_type);

	hlist_for_each_entry_rcu(event, head, hlist_entry) {
		if (perf_tp_event_match(event, &data, regs))
			perf_swevent_event(event, count, &data, regs);
	}

	/*
	 * If we got specified a target task, also iterate its context and
	 * deliver this event there too.
	 */
	if (task && task != current) {
		struct perf_event_context *ctx;
		struct trace_entry *entry = record;

		rcu_read_lock();
		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
		if (!ctx)
			goto unlock;

		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
			/* only events bound to the CPU we are running on */
			if (event->cpu != smp_processor_id())
				continue;
			if (event->attr.type != PERF_TYPE_TRACEPOINT)
				continue;
			/* attr.config must equal this record's tracepoint type */
			if (event->attr.config != entry->type)
				continue;
			if (perf_tp_event_match(event, &data, regs))
				perf_swevent_event(event, count, &data, regs);
		}
unlock:
		rcu_read_unlock();
	}

	perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
8542
/*
 * ->destroy() callback for tracepoint events; installed by
 * perf_tp_event_init() after a successful perf_trace_init().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
	perf_trace_destroy(event);
}
8547
8548 static int perf_tp_event_init(struct perf_event *event)
8549 {
8550         int err;
8551
8552         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8553                 return -ENOENT;
8554
8555         /*
8556          * no branch sampling for tracepoint events
8557          */
8558         if (has_branch_stack(event))
8559                 return -EOPNOTSUPP;
8560
8561         err = perf_trace_init(event);
8562         if (err)
8563                 return err;
8564
8565         event->destroy = tp_perf_event_destroy;
8566
8567         return 0;
8568 }
8569
/*
 * PMU descriptor for PERF_TYPE_TRACEPOINT events. It shares the
 * start/stop/read callbacks with the software PMU and uses the
 * perf_trace_* helpers for add/del.
 */
static struct pmu perf_tracepoint = {
	.task_ctx_nr	= perf_sw_context,

	.event_init	= perf_tp_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};
8580
8581 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8582 /*
8583  * Flags in config, used by dynamic PMU kprobe and uprobe
8584  * The flags should match following PMU_FORMAT_ATTR().
8585  *
8586  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
8587  *                               if not set, create kprobe/uprobe
8588  *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
 * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config used as the offset
8594  * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift left
8595  */
enum perf_probe_config {
	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
	/* width of the reference counter offset field in attr.config */
	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
	/* the offset occupies the top bits of the 64-bit config word */
	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* sysfs format attribute: bit 0 of config selects a return probe */
PMU_FORMAT_ATTR(retprobe, "config:0");
8603 #endif
8604
8605 #ifdef CONFIG_KPROBE_EVENTS
/* Format attributes of the kprobe PMU: only the retprobe bit. */
static struct attribute *kprobe_attrs[] = {
	&format_attr_retprobe.attr,
	NULL,
};

/* The "format" attribute group of the kprobe PMU. */
static struct attribute_group kprobe_format_group = {
	.name = "format",
	.attrs = kprobe_attrs,
};

/* NULL-terminated group list wired into perf_kprobe.attr_groups. */
static const struct attribute_group *kprobe_attr_groups[] = {
	&kprobe_format_group,
	NULL,
};
8620
/* Forward declaration: the PMU descriptor below refers to it. */
static int perf_kprobe_event_init(struct perf_event *event);
/*
 * Dynamic PMU for kprobes/kretprobes. It is registered under the name
 * "kprobe" with a dynamically assigned type (see perf_tp_register()) and
 * reuses the tracepoint add/del and software start/stop/read callbacks.
 */
static struct pmu perf_kprobe = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= perf_kprobe_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
	.attr_groups	= kprobe_attr_groups,
};
8632
8633 static int perf_kprobe_event_init(struct perf_event *event)
8634 {
8635         int err;
8636         bool is_retprobe;
8637
8638         if (event->attr.type != perf_kprobe.type)
8639                 return -ENOENT;
8640
8641         if (!capable(CAP_SYS_ADMIN))
8642                 return -EACCES;
8643
8644         /*
8645          * no branch sampling for probe events
8646          */
8647         if (has_branch_stack(event))
8648                 return -EOPNOTSUPP;
8649
8650         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8651         err = perf_kprobe_init(event, is_retprobe);
8652         if (err)
8653                 return err;
8654
8655         event->destroy = perf_kprobe_destroy;
8656
8657         return 0;
8658 }
8659 #endif /* CONFIG_KPROBE_EVENTS */
8660
8661 #ifdef CONFIG_UPROBE_EVENTS
/* sysfs format attribute: bits 32-63 of config carry the ref counter offset */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes of the uprobe PMU: retprobe bit and ref counter offset. */
static struct attribute *uprobe_attrs[] = {
	&format_attr_retprobe.attr,
	&format_attr_ref_ctr_offset.attr,
	NULL,
};

/* The "format" attribute group of the uprobe PMU. */
static struct attribute_group uprobe_format_group = {
	.name = "format",
	.attrs = uprobe_attrs,
};

/* NULL-terminated group list wired into perf_uprobe.attr_groups. */
static const struct attribute_group *uprobe_attr_groups[] = {
	&uprobe_format_group,
	NULL,
};
8679
/* Forward declaration: the PMU descriptor below refers to it. */
static int perf_uprobe_event_init(struct perf_event *event);
/*
 * Dynamic PMU for uprobes/uretprobes. It is registered under the name
 * "uprobe" with a dynamically assigned type (see perf_tp_register()) and
 * reuses the tracepoint add/del and software start/stop/read callbacks.
 */
static struct pmu perf_uprobe = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= perf_uprobe_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
	.attr_groups	= uprobe_attr_groups,
};
8691
8692 static int perf_uprobe_event_init(struct perf_event *event)
8693 {
8694         int err;
8695         unsigned long ref_ctr_offset;
8696         bool is_retprobe;
8697
8698         if (event->attr.type != perf_uprobe.type)
8699                 return -ENOENT;
8700
8701         if (!capable(CAP_SYS_ADMIN))
8702                 return -EACCES;
8703
8704         /*
8705          * no branch sampling for probe events
8706          */
8707         if (has_branch_stack(event))
8708                 return -EOPNOTSUPP;
8709
8710         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8711         ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
8712         err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
8713         if (err)
8714                 return err;
8715
8716         event->destroy = perf_uprobe_destroy;
8717
8718         return 0;
8719 }
8720 #endif /* CONFIG_UPROBE_EVENTS */
8721
/*
 * Register the tracepoint PMU under its fixed type and, when configured
 * in, the kprobe and uprobe PMUs. Those two pass -1 as the type.
 * NOTE(review): -1 presumably requests a dynamically allocated type id;
 * confirm against perf_pmu_register(). Return values are ignored.
 */
static inline void perf_tp_register(void)
{
	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
	perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
	perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}
8732
/* Release the tracing filter attached to @event (CONFIG_EVENT_TRACING). */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
8737
8738 #ifdef CONFIG_BPF_SYSCALL
/*
 * Overflow handler installed by perf_event_set_bpf_handler().
 *
 * Runs event->prog on the sample. The original overflow handler is called
 * only when the program returns non-zero. If the per-CPU bpf_prog_active
 * counter shows another user already active, the program is not run and
 * the sample is dropped, because ret stays 0.
 */
static void bpf_overflow_handler(struct perf_event *event,
				 struct perf_sample_data *data,
				 struct pt_regs *regs)
{
	struct bpf_perf_event_data_kern ctx = {
		.data = data,
		.event = event,
	};
	int ret = 0;

	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
	preempt_disable();
	/* anything other than 1 means we are not the only active user */
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
		goto out;
	rcu_read_lock();
	ret = BPF_PROG_RUN(event->prog, &ctx);
	rcu_read_unlock();
out:
	/* balances the increment above on both paths */
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
	if (!ret)
		return;

	event->orig_overflow_handler(event, data, regs);
}
8764
/*
 * Attach the BPF_PROG_TYPE_PERF_EVENT program behind @prog_fd to @event
 * by interposing bpf_overflow_handler() in front of the current overflow
 * handler.
 *
 * Returns 0 on success, -EINVAL if the event has an overflow handler
 * context, -EEXIST if a program is already attached, or the error from
 * bpf_prog_get_type(). On success the program reference is kept in
 * event->prog.
 */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
	struct bpf_prog *prog;

	if (event->overflow_handler_context)
		/* hw breakpoint or kernel counter */
		return -EINVAL;

	if (event->prog)
		return -EEXIST;

	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	event->prog = prog;
	/* remember the handler being replaced so it can be chained and restored */
	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
	return 0;
}
8785
8786 static void perf_event_free_bpf_handler(struct perf_event *event)
8787 {
8788         struct bpf_prog *prog = event->prog;
8789
8790         if (!prog)
8791                 return;
8792
8793         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8794         event->prog = NULL;
8795         bpf_prog_put(prog);
8796 }
8797 #else
/* !CONFIG_BPF_SYSCALL stub: attaching a BPF overflow handler is unsupported. */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
	return -EOPNOTSUPP;
}
/* !CONFIG_BPF_SYSCALL stub: there is never a handler to free. */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
8805 #endif
8806
/*
 * returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open()
 *
 * The decision is made by comparing event->pmu against the PMU
 * descriptors of this file; the kprobe and uprobe checks only exist when
 * the corresponding config option is enabled.
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
	if (event->pmu == &perf_tracepoint)
		return true;
#ifdef CONFIG_KPROBE_EVENTS
	if (event->pmu == &perf_kprobe)
		return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
	if (event->pmu == &perf_uprobe)
		return true;
#endif
	return false;
}
8825
8826 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8827 {
8828         bool is_kprobe, is_tracepoint, is_syscall_tp;
8829         struct bpf_prog *prog;
8830         int ret;
8831
8832         if (!perf_event_is_tracing(event))
8833                 return perf_event_set_bpf_handler(event, prog_fd);
8834
8835         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8836         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8837         is_syscall_tp = is_syscall_trace_event(event->tp_event);
8838         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8839                 /* bpf programs can only be attached to u/kprobe or tracepoint */
8840                 return -EINVAL;
8841
8842         prog = bpf_prog_get(prog_fd);
8843         if (IS_ERR(prog))
8844                 return PTR_ERR(prog);
8845
8846         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8847             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8848             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8849                 /* valid fd, but invalid bpf program type */
8850                 bpf_prog_put(prog);
8851                 return -EINVAL;
8852         }
8853
8854         /* Kprobe override only works for kprobes, not uprobes. */
8855         if (prog->kprobe_override &&
8856             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8857                 bpf_prog_put(prog);
8858                 return -EINVAL;
8859         }
8860
8861         if (is_tracepoint || is_syscall_tp) {
8862                 int off = trace_event_get_offsets(event->tp_event);
8863
8864                 if (prog->aux->max_ctx_offset > off) {
8865                         bpf_prog_put(prog);
8866                         return -EACCES;
8867                 }
8868         }
8869
8870         ret = perf_event_attach_bpf_prog(event, prog);
8871         if (ret)
8872                 bpf_prog_put(prog);
8873         return ret;
8874 }
8875
/*
 * Undo perf_event_set_bpf_prog(): tracing events detach their program,
 * all other events release their BPF overflow handler.
 */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}
8884
8885 #else
8886
/* !CONFIG_EVENT_TRACING: there are no tracepoint PMUs to register. */
static inline void perf_tp_register(void)
{
}
8890
/* !CONFIG_EVENT_TRACING: no tracepoint filter exists, nothing to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}
8894
8895 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8896 {
8897         return -ENOENT;
8898 }
8899
/* !CONFIG_EVENT_TRACING: no program can be attached, nothing to release. */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
}
8903 #endif /* CONFIG_EVENT_TRACING */
8904
8905 #ifdef CONFIG_HAVE_HW_BREAKPOINT
8906 void perf_bp_event(struct perf_event *bp, void *data)
8907 {
8908         struct perf_sample_data sample;
8909         struct pt_regs *regs = data;
8910
8911         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8912
8913         if (!bp->hw.state && !perf_exclude_event(bp, regs))
8914                 perf_swevent_event(bp, 1, &sample, regs);
8915 }
8916 #endif
8917
8918 /*
8919  * Allocate a new address filter
8920  */
8921 static struct perf_addr_filter *
8922 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8923 {
8924         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8925         struct perf_addr_filter *filter;
8926
8927         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8928         if (!filter)
8929                 return NULL;
8930
8931         INIT_LIST_HEAD(&filter->entry);
8932         list_add_tail(&filter->entry, filters);
8933
8934         return filter;
8935 }
8936
8937 static void free_filters_list(struct list_head *filters)
8938 {
8939         struct perf_addr_filter *filter, *iter;
8940
8941         list_for_each_entry_safe(filter, iter, filters, entry) {
8942                 path_put(&filter->path);
8943                 list_del(&filter->entry);
8944                 kfree(filter);
8945         }
8946 }
8947
8948 /*
8949  * Free existing address filters and optionally install new ones
8950  */
8951 static void perf_addr_filters_splice(struct perf_event *event,
8952                                      struct list_head *head)
8953 {
8954         unsigned long flags;
8955         LIST_HEAD(list);
8956
8957         if (!has_addr_filter(event))
8958                 return;
8959
8960         /* don't bother with children, they don't have their own filters */
8961         if (event->parent)
8962                 return;
8963
8964         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8965
8966         list_splice_init(&event->addr_filters.list, &list);
8967         if (head)
8968                 list_splice(head, &event->addr_filters.list);
8969
8970         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8971
8972         free_filters_list(&list);
8973 }
8974
8975 /*
8976  * Scan through mm's vmas and see if one of them matches the
8977  * @filter; if so, adjust filter's address range.
8978  * Called with mm::mmap_sem down for reading.
8979  */
8980 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8981                                             struct mm_struct *mm)
8982 {
8983         struct vm_area_struct *vma;
8984
8985         for (vma = mm->mmap; vma; vma = vma->vm_next) {
8986                 struct file *file = vma->vm_file;
8987                 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8988                 unsigned long vma_size = vma->vm_end - vma->vm_start;
8989
8990                 if (!file)
8991                         continue;
8992
8993                 if (!perf_addr_filter_match(filter, file, off, vma_size))
8994                         continue;
8995
8996                 return vma->vm_start;
8997         }
8998
8999         return 0;
9000 }
9001
9002 /*
9003  * Update event's address range filters based on the
9004  * task's existing mappings, if any.
9005  */
9006 static void perf_event_addr_filters_apply(struct perf_event *event)
9007 {
9008         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9009         struct task_struct *task = READ_ONCE(event->ctx->task);
9010         struct perf_addr_filter *filter;
9011         struct mm_struct *mm = NULL;
9012         unsigned int count = 0;
9013         unsigned long flags;
9014
9015         /*
9016          * We may observe TASK_TOMBSTONE, which means that the event tear-down
9017          * will stop on the parent's child_mutex that our caller is also holding
9018          */
9019         if (task == TASK_TOMBSTONE)
9020                 return;
9021
9022         if (!ifh->nr_file_filters)
9023                 return;
9024
9025         mm = get_task_mm(event->ctx->task);
9026         if (!mm)
9027                 goto restart;
9028
9029         down_read(&mm->mmap_sem);
9030
9031         raw_spin_lock_irqsave(&ifh->lock, flags);
9032         list_for_each_entry(filter, &ifh->list, entry) {
9033                 event->addr_filters_offs[count] = 0;
9034
9035                 /*
9036                  * Adjust base offset if the filter is associated to a binary
9037                  * that needs to be mapped:
9038                  */
9039                 if (filter->path.dentry)
9040                         event->addr_filters_offs[count] =
9041                                 perf_addr_filter_apply(filter, mm);
9042
9043                 count++;
9044         }
9045
9046         event->addr_filters_gen++;
9047         raw_spin_unlock_irqrestore(&ifh->lock, flags);
9048
9049         up_read(&mm->mmap_sem);
9050
9051         mmput(mm);
9052
9053 restart:
9054         perf_event_stop(event, 1);
9055 }
9056
9057 /*
9058  * Address range filtering: limiting the data to certain
9059  * instruction address ranges. Filters are ioctl()ed to us from
9060  * userspace as ascii strings.
9061  *
9062  * Filter string format:
9063  *
9064  * ACTION RANGE_SPEC
9065  * where ACTION is one of the
9066  *  * "filter": limit the trace to this region
9067  *  * "start": start tracing from this address
9068  *  * "stop": stop tracing at this address/region;
9069  * RANGE_SPEC is
9070  *  * for kernel addresses: <start address>[/<size>]
9071  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
9072  *
9073  * if <size> is not specified or is zero, the range is treated as a single
9074  * address; not valid for ACTION=="filter".
9075  */
/*
 * Token ids for the address filter parser. The three action ids start
 * at zero because the parser uses them as array indices.
 */
enum {
        IF_ACT_NONE = -1,       /* terminator of the token table */

        /* ACTION keywords */
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,

        /* RANGE_SPEC forms */
        IF_SRC_FILE,            /* <addr>/<size>@<file> */
        IF_SRC_KERNEL,          /* <addr>/<size> */
        IF_SRC_FILEADDR,        /* <addr>@<file> */
        IF_SRC_KERNELADDR,      /* <addr> */
};
9086
/* States of the address filter parser. */
enum {
        IF_STATE_ACTION = 0,    /* expecting an ACTION keyword */
        IF_STATE_SOURCE,        /* ACTION seen, expecting a RANGE_SPEC */
        IF_STATE_END,           /* one complete filter definition parsed */
};
9092
9093 static const match_table_t if_tokens = {
9094         { IF_ACT_FILTER,        "filter" },
9095         { IF_ACT_START,         "start" },
9096         { IF_ACT_STOP,          "stop" },
9097         { IF_SRC_FILE,          "%u/%u@%s" },
9098         { IF_SRC_KERNEL,        "%u/%u" },
9099         { IF_SRC_FILEADDR,      "%u@%s" },
9100         { IF_SRC_KERNELADDR,    "%u" },
9101         { IF_ACT_NONE,          NULL },
9102 };
9103
9104 /*
9105  * Address filter string parser
9106  */
9107 static int
9108 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
9109                              struct list_head *filters)
9110 {
9111         struct perf_addr_filter *filter = NULL;
9112         char *start, *orig, *filename = NULL;
9113         substring_t args[MAX_OPT_ARGS];
9114         int state = IF_STATE_ACTION, token;
9115         unsigned int kernel = 0;
9116         int ret = -EINVAL;
9117
9118         orig = fstr = kstrdup(fstr, GFP_KERNEL);
9119         if (!fstr)
9120                 return -ENOMEM;
9121
9122         while ((start = strsep(&fstr, " ,\n")) != NULL) {
9123                 static const enum perf_addr_filter_action_t actions[] = {
9124                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
9125                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
9126                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
9127                 };
9128                 ret = -EINVAL;
9129
9130                 if (!*start)
9131                         continue;
9132
9133                 /* filter definition begins */
9134                 if (state == IF_STATE_ACTION) {
9135                         filter = perf_addr_filter_new(event, filters);
9136                         if (!filter)
9137                                 goto fail;
9138                 }
9139
9140                 token = match_token(start, if_tokens, args);
9141                 switch (token) {
9142                 case IF_ACT_FILTER:
9143                 case IF_ACT_START:
9144                 case IF_ACT_STOP:
9145                         if (state != IF_STATE_ACTION)
9146                                 goto fail;
9147
9148                         filter->action = actions[token];
9149                         state = IF_STATE_SOURCE;
9150                         break;
9151
9152                 case IF_SRC_KERNELADDR:
9153                 case IF_SRC_KERNEL:
9154                         kernel = 1;
9155
9156                 case IF_SRC_FILEADDR:
9157                 case IF_SRC_FILE:
9158                         if (state != IF_STATE_SOURCE)
9159                                 goto fail;
9160
9161                         *args[0].to = 0;
9162                         ret = kstrtoul(args[0].from, 0, &filter->offset);
9163                         if (ret)
9164                                 goto fail;
9165
9166                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
9167                                 *args[1].to = 0;
9168                                 ret = kstrtoul(args[1].from, 0, &filter->size);
9169                                 if (ret)
9170                                         goto fail;
9171                         }
9172
9173                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
9174                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
9175
9176                                 filename = match_strdup(&args[fpos]);
9177                                 if (!filename) {
9178                                         ret = -ENOMEM;
9179                                         goto fail;
9180                                 }
9181                         }
9182
9183                         state = IF_STATE_END;
9184                         break;
9185
9186                 default:
9187                         goto fail;
9188                 }
9189
9190                 /*
9191                  * Filter definition is fully parsed, validate and install it.
9192                  * Make sure that it doesn't contradict itself or the event's
9193                  * attribute.
9194                  */
9195                 if (state == IF_STATE_END) {
9196                         ret = -EINVAL;
9197                         if (kernel && event->attr.exclude_kernel)
9198                                 goto fail;
9199
9200                         /*
9201                          * ACTION "filter" must have a non-zero length region
9202                          * specified.
9203                          */
9204                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
9205                             !filter->size)
9206                                 goto fail;
9207
9208                         if (!kernel) {
9209                                 if (!filename)
9210                                         goto fail;
9211
9212                                 /*
9213                                  * For now, we only support file-based filters
9214                                  * in per-task events; doing so for CPU-wide
9215                                  * events requires additional context switching
9216                                  * trickery, since same object code will be
9217                                  * mapped at different virtual addresses in
9218                                  * different processes.
9219                                  */
9220                                 ret = -EOPNOTSUPP;
9221                                 if (!event->ctx->task)
9222                                         goto fail_free_name;
9223
9224                                 /* look up the path and grab its inode */
9225                                 ret = kern_path(filename, LOOKUP_FOLLOW,
9226                                                 &filter->path);
9227                                 if (ret)
9228                                         goto fail_free_name;
9229
9230                                 kfree(filename);
9231                                 filename = NULL;
9232
9233                                 ret = -EINVAL;
9234                                 if (!filter->path.dentry ||
9235                                     !S_ISREG(d_inode(filter->path.dentry)
9236                                              ->i_mode))
9237                                         goto fail;
9238
9239                                 event->addr_filters.nr_file_filters++;
9240                         }
9241
9242                         /* ready to consume more filters */
9243                         state = IF_STATE_ACTION;
9244                         filter = NULL;
9245                 }
9246         }
9247
9248         if (state != IF_STATE_ACTION)
9249                 goto fail;
9250
9251         kfree(orig);
9252
9253         return 0;
9254
9255 fail_free_name:
9256         kfree(filename);
9257 fail:
9258         free_filters_list(filters);
9259         kfree(orig);
9260
9261         return ret;
9262 }
9263
9264 static int
9265 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
9266 {
9267         LIST_HEAD(filters);
9268         int ret;
9269
9270         /*
9271          * Since this is called in perf_ioctl() path, we're already holding
9272          * ctx::mutex.
9273          */
9274         lockdep_assert_held(&event->ctx->mutex);
9275
9276         if (WARN_ON_ONCE(event->parent))
9277                 return -EINVAL;
9278
9279         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
9280         if (ret)
9281                 goto fail_clear_files;
9282
9283         ret = event->pmu->addr_filters_validate(&filters);
9284         if (ret)
9285                 goto fail_free_filters;
9286
9287         /* remove existing filters, if any */
9288         perf_addr_filters_splice(event, &filters);
9289
9290         /* install new filters */
9291         perf_event_for_each_child(event, perf_event_addr_filters_apply);
9292
9293         return ret;
9294
9295 fail_free_filters:
9296         free_filters_list(&filters);
9297
9298 fail_clear_files:
9299         event->addr_filters.nr_file_filters = 0;
9300
9301         return ret;
9302 }
9303
9304 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
9305 {
9306         int ret = -EINVAL;
9307         char *filter_str;
9308
9309         filter_str = strndup_user(arg, PAGE_SIZE);
9310         if (IS_ERR(filter_str))
9311                 return PTR_ERR(filter_str);
9312
9313 #ifdef CONFIG_EVENT_TRACING
9314         if (perf_event_is_tracing(event)) {
9315                 struct perf_event_context *ctx = event->ctx;
9316
9317                 /*
9318                  * Beware, here be dragons!!
9319                  *
9320                  * the tracepoint muck will deadlock against ctx->mutex, but
9321                  * the tracepoint stuff does not actually need it. So
9322                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
9323                  * already have a reference on ctx.
9324                  *
9325                  * This can result in event getting moved to a different ctx,
9326                  * but that does not affect the tracepoint state.
9327                  */
9328                 mutex_unlock(&ctx->mutex);
9329                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9330                 mutex_lock(&ctx->mutex);
9331         } else
9332 #endif
9333         if (has_addr_filter(event))
9334                 ret = perf_event_set_addr_filter(event, filter_str);
9335
9336         kfree(filter_str);
9337         return ret;
9338 }
9339
9340 /*
9341  * hrtimer based swevent callback
9342  */
9343
9344 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
9345 {
9346         enum hrtimer_restart ret = HRTIMER_RESTART;
9347         struct perf_sample_data data;
9348         struct pt_regs *regs;
9349         struct perf_event *event;
9350         u64 period;
9351
9352         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
9353
9354         if (event->state != PERF_EVENT_STATE_ACTIVE)
9355                 return HRTIMER_NORESTART;
9356
9357         event->pmu->read(event);
9358
9359         perf_sample_data_init(&data, 0, event->hw.last_period);
9360         regs = get_irq_regs();
9361
9362         if (regs && !perf_exclude_event(event, regs)) {
9363                 if (!(event->attr.exclude_idle && is_idle_task(current)))
9364                         if (__perf_event_overflow(event, 1, &data, regs))
9365                                 ret = HRTIMER_NORESTART;
9366         }
9367
9368         period = max_t(u64, 10000, event->hw.sample_period);
9369         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
9370
9371         return ret;
9372 }
9373
9374 static void perf_swevent_start_hrtimer(struct perf_event *event)
9375 {
9376         struct hw_perf_event *hwc = &event->hw;
9377         s64 period;
9378
9379         if (!is_sampling_event(event))
9380                 return;
9381
9382         period = local64_read(&hwc->period_left);
9383         if (period) {
9384                 if (period < 0)
9385                         period = 10000;
9386
9387                 local64_set(&hwc->period_left, 0);
9388         } else {
9389                 period = max_t(u64, 10000, hwc->sample_period);
9390         }
9391         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9392                       HRTIMER_MODE_REL_PINNED);
9393 }
9394
9395 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
9396 {
9397         struct hw_perf_event *hwc = &event->hw;
9398
9399         if (is_sampling_event(event)) {
9400                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
9401                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
9402
9403                 hrtimer_cancel(&hwc->hrtimer);
9404         }
9405 }
9406
9407 static void perf_swevent_init_hrtimer(struct perf_event *event)
9408 {
9409         struct hw_perf_event *hwc = &event->hw;
9410
9411         if (!is_sampling_event(event))
9412                 return;
9413
9414         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
9415         hwc->hrtimer.function = perf_swevent_hrtimer;
9416
9417         /*
9418          * Since hrtimers have a fixed rate, we can do a static freq->period
9419          * mapping and avoid the whole period adjust feedback stuff.
9420          */
9421         if (event->attr.freq) {
9422                 long freq = event->attr.sample_freq;
9423
9424                 event->attr.sample_period = NSEC_PER_SEC / freq;
9425                 hwc->sample_period = event->attr.sample_period;
9426                 local64_set(&hwc->period_left, hwc->sample_period);
9427                 hwc->last_period = hwc->sample_period;
9428                 event->attr.freq = 0;
9429         }
9430 }
9431
9432 /*
9433  * Software event: cpu wall time clock
9434  */
9435
9436 static void cpu_clock_event_update(struct perf_event *event)
9437 {
9438         s64 prev;
9439         u64 now;
9440
9441         now = local_clock();
9442         prev = local64_xchg(&event->hw.prev_count, now);
9443         local64_add(now - prev, &event->count);
9444 }
9445
9446 static void cpu_clock_event_start(struct perf_event *event, int flags)
9447 {
9448         local64_set(&event->hw.prev_count, local_clock());
9449         perf_swevent_start_hrtimer(event);
9450 }
9451
/* Cancel the sampling timer, then fold the elapsed time into the count. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_event_update(event);
}
9457
9458 static int cpu_clock_event_add(struct perf_event *event, int flags)
9459 {
9460         if (flags & PERF_EF_START)
9461                 cpu_clock_event_start(event, flags);
9462         perf_event_update_userpage(event);
9463
9464         return 0;
9465 }
9466
/* pmu::del callback: removing the event simply stops it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
}
9471
/* pmu::read callback: bring the count up to the current local_clock(). */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
}
9476
9477 static int cpu_clock_event_init(struct perf_event *event)
9478 {
9479         if (event->attr.type != PERF_TYPE_SOFTWARE)
9480                 return -ENOENT;
9481
9482         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
9483                 return -ENOENT;
9484
9485         /*
9486          * no branch sampling for software events
9487          */
9488         if (has_branch_stack(event))
9489                 return -EOPNOTSUPP;
9490
9491         perf_swevent_init_hrtimer(event);
9492
9493         return 0;
9494 }
9495
9496 static struct pmu perf_cpu_clock = {
9497         .task_ctx_nr    = perf_sw_context,
9498
9499         .capabilities   = PERF_PMU_CAP_NO_NMI,
9500
9501         .event_init     = cpu_clock_event_init,
9502         .add            = cpu_clock_event_add,
9503         .del            = cpu_clock_event_del,
9504         .start          = cpu_clock_event_start,
9505         .stop           = cpu_clock_event_stop,
9506         .read           = cpu_clock_event_read,
9507 };
9508
9509 /*
9510  * Software event: task time clock
9511  */
9512
9513 static void task_clock_event_update(struct perf_event *event, u64 now)
9514 {
9515         u64 prev;
9516         s64 delta;
9517
9518         prev = local64_xchg(&event->hw.prev_count, now);
9519         delta = now - prev;
9520         local64_add(delta, &event->count);
9521 }
9522
9523 static void task_clock_event_start(struct perf_event *event, int flags)
9524 {
9525         local64_set(&event->hw.prev_count, event->ctx->time);
9526         perf_swevent_start_hrtimer(event);
9527 }
9528
9529 static void task_clock_event_stop(struct perf_event *event, int flags)
9530 {
9531         perf_swevent_cancel_hrtimer(event);
9532         task_clock_event_update(event, event->ctx->time);
9533 }
9534
9535 static int task_clock_event_add(struct perf_event *event, int flags)
9536 {
9537         if (flags & PERF_EF_START)
9538                 task_clock_event_start(event, flags);
9539         perf_event_update_userpage(event);
9540
9541         return 0;
9542 }
9543
9544 static void task_clock_event_del(struct perf_event *event, int flags)
9545 {
9546         task_clock_event_stop(event, PERF_EF_UPDATE);
9547 }
9548
9549 static void task_clock_event_read(struct perf_event *event)
9550 {
9551         u64 now = perf_clock();
9552         u64 delta = now - event->ctx->timestamp;
9553         u64 time = event->ctx->time + delta;
9554
9555         task_clock_event_update(event, time);
9556 }
9557
9558 static int task_clock_event_init(struct perf_event *event)
9559 {
9560         if (event->attr.type != PERF_TYPE_SOFTWARE)
9561                 return -ENOENT;
9562
9563         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
9564                 return -ENOENT;
9565
9566         /*
9567          * no branch sampling for software events
9568          */
9569         if (has_branch_stack(event))
9570                 return -EOPNOTSUPP;
9571
9572         perf_swevent_init_hrtimer(event);
9573
9574         return 0;
9575 }
9576
9577 static struct pmu perf_task_clock = {
9578         .task_ctx_nr    = perf_sw_context,
9579
9580         .capabilities   = PERF_PMU_CAP_NO_NMI,
9581
9582         .event_init     = task_clock_event_init,
9583         .add            = task_clock_event_add,
9584         .del            = task_clock_event_del,
9585         .start          = task_clock_event_start,
9586         .stop           = task_clock_event_stop,
9587         .read           = task_clock_event_read,
9588 };
9589
/* No-op callback for PMU hooks that take only the pmu and return nothing. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}
9593
/* No-op callback with the (pmu, flags) signature of a start_txn hook. */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
}
9597
/* No-op callback for PMU hooks that return a status: always succeeds. */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        return 0;
}
9602
9603 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9604
9605 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9606 {
9607         __this_cpu_write(nop_txn_flags, flags);
9608
9609         if (flags & ~PERF_PMU_TXN_ADD)
9610                 return;
9611
9612         perf_pmu_disable(pmu);
9613 }
9614
9615 static int perf_pmu_commit_txn(struct pmu *pmu)
9616 {
9617         unsigned int flags = __this_cpu_read(nop_txn_flags);
9618
9619         __this_cpu_write(nop_txn_flags, 0);
9620
9621         if (flags & ~PERF_PMU_TXN_ADD)
9622                 return 0;
9623
9624         perf_pmu_enable(pmu);
9625         return 0;
9626 }
9627
9628 static void perf_pmu_cancel_txn(struct pmu *pmu)
9629 {
9630         unsigned int flags =  __this_cpu_read(nop_txn_flags);
9631
9632         __this_cpu_write(nop_txn_flags, 0);
9633
9634         if (flags & ~PERF_PMU_TXN_ADD)
9635                 return;
9636
9637         perf_pmu_enable(pmu);
9638 }
9639
/* Default event index callback: every event reports index 0. */
static int perf_event_idx_default(struct perf_event *event)
{
        return 0;
}
9644
9645 /*
9646  * Ensures all contexts with the same task_ctx_nr have the same
9647  * pmu_cpu_context too.
9648  */
9649 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9650 {
9651         struct pmu *pmu;
9652
9653         if (ctxn < 0)
9654                 return NULL;
9655
9656         list_for_each_entry(pmu, &pmus, entry) {
9657                 if (pmu->task_ctx_nr == ctxn)
9658                         return pmu->pmu_cpu_context;
9659         }
9660
9661         return NULL;
9662 }
9663
9664 static void free_pmu_context(struct pmu *pmu)
9665 {
9666         /*
9667          * Static contexts such as perf_sw_context have a global lifetime
9668          * and may be shared between different PMUs. Avoid freeing them
9669          * when a single PMU is going away.
9670          */
9671         if (pmu->task_ctx_nr > perf_invalid_context)
9672                 return;
9673
9674         free_percpu(pmu->pmu_cpu_context);
9675 }
9676
9677 /*
9678  * Let userspace know that this PMU supports address range filtering:
9679  */
9680 static ssize_t nr_addr_filters_show(struct device *dev,
9681                                     struct device_attribute *attr,
9682                                     char *page)
9683 {
9684         struct pmu *pmu = dev_get_drvdata(dev);
9685
9686         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9687 }
9688 DEVICE_ATTR_RO(nr_addr_filters);
9689
9690 static struct idr pmu_idr;
9691
9692 static ssize_t
9693 type_show(struct device *dev, struct device_attribute *attr, char *page)
9694 {
9695         struct pmu *pmu = dev_get_drvdata(dev);
9696
9697         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9698 }
9699 static DEVICE_ATTR_RO(type);
9700
9701 static ssize_t
9702 perf_event_mux_interval_ms_show(struct device *dev,
9703                                 struct device_attribute *attr,
9704                                 char *page)
9705 {
9706         struct pmu *pmu = dev_get_drvdata(dev);
9707
9708         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9709 }
9710
9711 static DEFINE_MUTEX(mux_interval_mutex);
9712
9713 static ssize_t
9714 perf_event_mux_interval_ms_store(struct device *dev,
9715                                  struct device_attribute *attr,
9716                                  const char *buf, size_t count)
9717 {
9718         struct pmu *pmu = dev_get_drvdata(dev);
9719         int timer, cpu, ret;
9720
9721         ret = kstrtoint(buf, 0, &timer);
9722         if (ret)
9723                 return ret;
9724
9725         if (timer < 1)
9726                 return -EINVAL;
9727
9728         /* same value, noting to do */
9729         if (timer == pmu->hrtimer_interval_ms)
9730                 return count;
9731
9732         mutex_lock(&mux_interval_mutex);
9733         pmu->hrtimer_interval_ms = timer;
9734
9735         /* update all cpuctx for this PMU */
9736         cpus_read_lock();
9737         for_each_online_cpu(cpu) {
9738                 struct perf_cpu_context *cpuctx;
9739                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9740                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9741
9742                 cpu_function_call(cpu,
9743                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9744         }
9745         cpus_read_unlock();
9746         mutex_unlock(&mux_interval_mutex);
9747
9748         return count;
9749 }
9750 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
9751
9752 static struct attribute *pmu_dev_attrs[] = {
9753         &dev_attr_type.attr,
9754         &dev_attr_perf_event_mux_interval_ms.attr,
9755         NULL,
9756 };
9757 ATTRIBUTE_GROUPS(pmu_dev);
9758
9759 static int pmu_bus_running;
9760 static struct bus_type pmu_bus = {
9761         .name           = "event_source",
9762         .dev_groups     = pmu_dev_groups,
9763 };
9764
/* Device release callback: frees the struct device allocated by pmu_dev_alloc(). */
static void pmu_dev_release(struct device *dev)
{
        kfree(dev);
}
9769
9770 static int pmu_dev_alloc(struct pmu *pmu)
9771 {
9772         int ret = -ENOMEM;
9773
9774         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9775         if (!pmu->dev)
9776                 goto out;
9777
9778         pmu->dev->groups = pmu->attr_groups;
9779         device_initialize(pmu->dev);
9780         ret = dev_set_name(pmu->dev, "%s", pmu->name);
9781         if (ret)
9782                 goto free_dev;
9783
9784         dev_set_drvdata(pmu->dev, pmu);
9785         pmu->dev->bus = &pmu_bus;
9786         pmu->dev->release = pmu_dev_release;
9787         ret = device_add(pmu->dev);
9788         if (ret)
9789                 goto free_dev;
9790
9791         /* For PMUs with address filters, throw in an extra attribute: */
9792         if (pmu->nr_addr_filters)
9793                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9794
9795         if (ret)
9796                 goto del_dev;
9797
9798 out:
9799         return ret;
9800
9801 del_dev:
9802         device_del(pmu->dev);
9803
9804 free_dev:
9805         put_device(pmu->dev);
9806         goto out;
9807 }
9808
/*
 * Lockdep classes that perf_pmu_register() assigns to the mutex and the
 * lock of every per-CPU context it initializes.
 */
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
9811
9812 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9813 {
9814         int cpu, ret;
9815
9816         mutex_lock(&pmus_lock);
9817         ret = -ENOMEM;
9818         pmu->pmu_disable_count = alloc_percpu(int);
9819         if (!pmu->pmu_disable_count)
9820                 goto unlock;
9821
9822         pmu->type = -1;
9823         if (!name)
9824                 goto skip_type;
9825         pmu->name = name;
9826
9827         if (type < 0) {
9828                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9829                 if (type < 0) {
9830                         ret = type;
9831                         goto free_pdc;
9832                 }
9833         }
9834         pmu->type = type;
9835
9836         if (pmu_bus_running) {
9837                 ret = pmu_dev_alloc(pmu);
9838                 if (ret)
9839                         goto free_idr;
9840         }
9841
9842 skip_type:
9843         if (pmu->task_ctx_nr == perf_hw_context) {
9844                 static int hw_context_taken = 0;
9845
9846                 /*
9847                  * Other than systems with heterogeneous CPUs, it never makes
9848                  * sense for two PMUs to share perf_hw_context. PMUs which are
9849                  * uncore must use perf_invalid_context.
9850                  */
9851                 if (WARN_ON_ONCE(hw_context_taken &&
9852                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9853                         pmu->task_ctx_nr = perf_invalid_context;
9854
9855                 hw_context_taken = 1;
9856         }
9857
9858         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9859         if (pmu->pmu_cpu_context)
9860                 goto got_cpu_context;
9861
9862         ret = -ENOMEM;
9863         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9864         if (!pmu->pmu_cpu_context)
9865                 goto free_dev;
9866
9867         for_each_possible_cpu(cpu) {
9868                 struct perf_cpu_context *cpuctx;
9869
9870                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9871                 __perf_event_init_context(&cpuctx->ctx);
9872                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9873                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9874                 cpuctx->ctx.pmu = pmu;
9875                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9876
9877                 __perf_mux_hrtimer_init(cpuctx, cpu);
9878         }
9879
9880 got_cpu_context:
9881         if (!pmu->start_txn) {
9882                 if (pmu->pmu_enable) {
9883                         /*
9884                          * If we have pmu_enable/pmu_disable calls, install
9885                          * transaction stubs that use that to try and batch
9886                          * hardware accesses.
9887                          */
9888                         pmu->start_txn  = perf_pmu_start_txn;
9889                         pmu->commit_txn = perf_pmu_commit_txn;
9890                         pmu->cancel_txn = perf_pmu_cancel_txn;
9891                 } else {
9892                         pmu->start_txn  = perf_pmu_nop_txn;
9893                         pmu->commit_txn = perf_pmu_nop_int;
9894                         pmu->cancel_txn = perf_pmu_nop_void;
9895                 }
9896         }
9897
9898         if (!pmu->pmu_enable) {
9899                 pmu->pmu_enable  = perf_pmu_nop_void;
9900                 pmu->pmu_disable = perf_pmu_nop_void;
9901         }
9902
9903         if (!pmu->event_idx)
9904                 pmu->event_idx = perf_event_idx_default;
9905
9906         list_add_rcu(&pmu->entry, &pmus);
9907         atomic_set(&pmu->exclusive_cnt, 0);
9908         ret = 0;
9909 unlock:
9910         mutex_unlock(&pmus_lock);
9911
9912         return ret;
9913
9914 free_dev:
9915         device_del(pmu->dev);
9916         put_device(pmu->dev);
9917
9918 free_idr:
9919         if (pmu->type >= PERF_TYPE_MAX)
9920                 idr_remove(&pmu_idr, pmu->type);
9921
9922 free_pdc:
9923         free_percpu(pmu->pmu_disable_count);
9924         goto unlock;
9925 }
9926 EXPORT_SYMBOL_GPL(perf_pmu_register);
9927
9928 void perf_pmu_unregister(struct pmu *pmu)
9929 {
9930         mutex_lock(&pmus_lock);
9931         list_del_rcu(&pmu->entry);
9932
9933         /*
9934          * We dereference the pmu list under both SRCU and regular RCU, so
9935          * synchronize against both of those.
9936          */
9937         synchronize_srcu(&pmus_srcu);
9938         synchronize_rcu();
9939
9940         free_percpu(pmu->pmu_disable_count);
9941         if (pmu->type >= PERF_TYPE_MAX)
9942                 idr_remove(&pmu_idr, pmu->type);
9943         if (pmu_bus_running) {
9944                 if (pmu->nr_addr_filters)
9945                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9946                 device_del(pmu->dev);
9947                 put_device(pmu->dev);
9948         }
9949         free_pmu_context(pmu);
9950         mutex_unlock(&pmus_lock);
9951 }
9952 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9953
9954 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9955 {
9956         struct perf_event_context *ctx = NULL;
9957         int ret;
9958
9959         if (!try_module_get(pmu->module))
9960                 return -ENODEV;
9961
9962         /*
9963          * A number of pmu->event_init() methods iterate the sibling_list to,
9964          * for example, validate if the group fits on the PMU. Therefore,
9965          * if this is a sibling event, acquire the ctx->mutex to protect
9966          * the sibling_list.
9967          */
9968         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
9969                 /*
9970                  * This ctx->mutex can nest when we're called through
9971                  * inheritance. See the perf_event_ctx_lock_nested() comment.
9972                  */
9973                 ctx = perf_event_ctx_lock_nested(event->group_leader,
9974                                                  SINGLE_DEPTH_NESTING);
9975                 BUG_ON(!ctx);
9976         }
9977
9978         event->pmu = pmu;
9979         ret = pmu->event_init(event);
9980
9981         if (ctx)
9982                 perf_event_ctx_unlock(event->group_leader, ctx);
9983
9984         if (!ret) {
9985                 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
9986                                 event_has_any_exclude_flag(event)) {
9987                         if (event->destroy)
9988                                 event->destroy(event);
9989                         ret = -EINVAL;
9990                 }
9991         }
9992
9993         if (ret)
9994                 module_put(pmu->module);
9995
9996         return ret;
9997 }
9998
9999 static struct pmu *perf_init_event(struct perf_event *event)
10000 {
10001         struct pmu *pmu;
10002         int idx;
10003         int ret;
10004
10005         idx = srcu_read_lock(&pmus_srcu);
10006
10007         /* Try parent's PMU first: */
10008         if (event->parent && event->parent->pmu) {
10009                 pmu = event->parent->pmu;
10010                 ret = perf_try_init_event(pmu, event);
10011                 if (!ret)
10012                         goto unlock;
10013         }
10014
10015         rcu_read_lock();
10016         pmu = idr_find(&pmu_idr, event->attr.type);
10017         rcu_read_unlock();
10018         if (pmu) {
10019                 ret = perf_try_init_event(pmu, event);
10020                 if (ret)
10021                         pmu = ERR_PTR(ret);
10022                 goto unlock;
10023         }
10024
10025         list_for_each_entry_rcu(pmu, &pmus, entry) {
10026                 ret = perf_try_init_event(pmu, event);
10027                 if (!ret)
10028                         goto unlock;
10029
10030                 if (ret != -ENOENT) {
10031                         pmu = ERR_PTR(ret);
10032                         goto unlock;
10033                 }
10034         }
10035         pmu = ERR_PTR(-ENOENT);
10036 unlock:
10037         srcu_read_unlock(&pmus_srcu, idx);
10038
10039         return pmu;
10040 }
10041
10042 static void attach_sb_event(struct perf_event *event)
10043 {
10044         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
10045
10046         raw_spin_lock(&pel->lock);
10047         list_add_rcu(&event->sb_list, &pel->list);
10048         raw_spin_unlock(&pel->lock);
10049 }
10050
/*
 * All !task (and therefore per-cpu) events that want side-band records
 * are kept on a dedicated per-cpu list.
 *
 * That way delivering a side-band record does not require scanning every
 * PMU's per-cpu context to find the interested events.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}
10063
10064 static void account_event_cpu(struct perf_event *event, int cpu)
10065 {
10066         if (event->parent)
10067                 return;
10068
10069         if (is_cgroup_event(event))
10070                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
10071 }
10072
/*
 * Freq events need the tick to stay alive (see perf_event_task_tick), so
 * the first freq event registers the perf tick dependency. Compiles to
 * nothing without CONFIG_NO_HZ_FULL.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        int nr_now;

        /* Hold nr_freq_lock so we cannot race with a concurrent unaccount. */
        spin_lock(&nr_freq_lock);
        nr_now = atomic_inc_return(&nr_freq_events);
        if (nr_now == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}
10084
10085 static void account_freq_event(void)
10086 {
10087         if (tick_nohz_full_enabled())
10088                 account_freq_event_nohz();
10089         else
10090                 atomic_inc(&nr_freq_events);
10091 }
10092
10093
10094 static void account_event(struct perf_event *event)
10095 {
10096         bool inc = false;
10097
10098         if (event->parent)
10099                 return;
10100
10101         if (event->attach_state & PERF_ATTACH_TASK)
10102                 inc = true;
10103         if (event->attr.mmap || event->attr.mmap_data)
10104                 atomic_inc(&nr_mmap_events);
10105         if (event->attr.comm)
10106                 atomic_inc(&nr_comm_events);
10107         if (event->attr.namespaces)
10108                 atomic_inc(&nr_namespaces_events);
10109         if (event->attr.task)
10110                 atomic_inc(&nr_task_events);
10111         if (event->attr.freq)
10112                 account_freq_event();
10113         if (event->attr.context_switch) {
10114                 atomic_inc(&nr_switch_events);
10115                 inc = true;
10116         }
10117         if (has_branch_stack(event))
10118                 inc = true;
10119         if (is_cgroup_event(event))
10120                 inc = true;
10121         if (event->attr.ksymbol)
10122                 atomic_inc(&nr_ksymbol_events);
10123         if (event->attr.bpf_event)
10124                 atomic_inc(&nr_bpf_events);
10125
10126         if (inc) {
10127                 /*
10128                  * We need the mutex here because static_branch_enable()
10129                  * must complete *before* the perf_sched_count increment
10130                  * becomes visible.
10131                  */
10132                 if (atomic_inc_not_zero(&perf_sched_count))
10133                         goto enabled;
10134
10135                 mutex_lock(&perf_sched_mutex);
10136                 if (!atomic_read(&perf_sched_count)) {
10137                         static_branch_enable(&perf_sched_events);
10138                         /*
10139                          * Guarantee that all CPUs observe they key change and
10140                          * call the perf scheduling hooks before proceeding to
10141                          * install events that need them.
10142                          */
10143                         synchronize_rcu();
10144                 }
10145                 /*
10146                  * Now that we have waited for the sync_sched(), allow further
10147                  * increments to by-pass the mutex.
10148                  */
10149                 atomic_inc(&perf_sched_count);
10150                 mutex_unlock(&perf_sched_mutex);
10151         }
10152 enabled:
10153
10154         account_event_cpu(event, event->cpu);
10155
10156         account_pmu_sb_event(event);
10157 }
10158
10159 /*
10160  * Allocate and initialize an event structure
10161  */
10162 static struct perf_event *
10163 perf_event_alloc(struct perf_event_attr *attr, int cpu,
10164                  struct task_struct *task,
10165                  struct perf_event *group_leader,
10166                  struct perf_event *parent_event,
10167                  perf_overflow_handler_t overflow_handler,
10168                  void *context, int cgroup_fd)
10169 {
10170         struct pmu *pmu;
10171         struct perf_event *event;
10172         struct hw_perf_event *hwc;
10173         long err = -EINVAL;
10174
10175         if ((unsigned)cpu >= nr_cpu_ids) {
10176                 if (!task || cpu != -1)
10177                         return ERR_PTR(-EINVAL);
10178         }
10179
10180         event = kzalloc(sizeof(*event), GFP_KERNEL);
10181         if (!event)
10182                 return ERR_PTR(-ENOMEM);
10183
10184         /*
10185          * Single events are their own group leaders, with an
10186          * empty sibling list:
10187          */
10188         if (!group_leader)
10189                 group_leader = event;
10190
10191         mutex_init(&event->child_mutex);
10192         INIT_LIST_HEAD(&event->child_list);
10193
10194         INIT_LIST_HEAD(&event->event_entry);
10195         INIT_LIST_HEAD(&event->sibling_list);
10196         INIT_LIST_HEAD(&event->active_list);
10197         init_event_group(event);
10198         INIT_LIST_HEAD(&event->rb_entry);
10199         INIT_LIST_HEAD(&event->active_entry);
10200         INIT_LIST_HEAD(&event->addr_filters.list);
10201         INIT_HLIST_NODE(&event->hlist_entry);
10202
10203
10204         init_waitqueue_head(&event->waitq);
10205         init_irq_work(&event->pending, perf_pending_event);
10206
10207         mutex_init(&event->mmap_mutex);
10208         raw_spin_lock_init(&event->addr_filters.lock);
10209
10210         atomic_long_set(&event->refcount, 1);
10211         event->cpu              = cpu;
10212         event->attr             = *attr;
10213         event->group_leader     = group_leader;
10214         event->pmu              = NULL;
10215         event->oncpu            = -1;
10216
10217         event->parent           = parent_event;
10218
10219         event->ns               = get_pid_ns(task_active_pid_ns(current));
10220         event->id               = atomic64_inc_return(&perf_event_id);
10221
10222         event->state            = PERF_EVENT_STATE_INACTIVE;
10223
10224         if (task) {
10225                 event->attach_state = PERF_ATTACH_TASK;
10226                 /*
10227                  * XXX pmu::event_init needs to know what task to account to
10228                  * and we cannot use the ctx information because we need the
10229                  * pmu before we get a ctx.
10230                  */
10231                 get_task_struct(task);
10232                 event->hw.target = task;
10233         }
10234
10235         event->clock = &local_clock;
10236         if (parent_event)
10237                 event->clock = parent_event->clock;
10238
10239         if (!overflow_handler && parent_event) {
10240                 overflow_handler = parent_event->overflow_handler;
10241                 context = parent_event->overflow_handler_context;
10242 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
10243                 if (overflow_handler == bpf_overflow_handler) {
10244                         struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
10245
10246                         if (IS_ERR(prog)) {
10247                                 err = PTR_ERR(prog);
10248                                 goto err_ns;
10249                         }
10250                         event->prog = prog;
10251                         event->orig_overflow_handler =
10252                                 parent_event->orig_overflow_handler;
10253                 }
10254 #endif
10255         }
10256
10257         if (overflow_handler) {
10258                 event->overflow_handler = overflow_handler;
10259                 event->overflow_handler_context = context;
10260         } else if (is_write_backward(event)){
10261                 event->overflow_handler = perf_event_output_backward;
10262                 event->overflow_handler_context = NULL;
10263         } else {
10264                 event->overflow_handler = perf_event_output_forward;
10265                 event->overflow_handler_context = NULL;
10266         }
10267
10268         perf_event__state_init(event);
10269
10270         pmu = NULL;
10271
10272         hwc = &event->hw;
10273         hwc->sample_period = attr->sample_period;
10274         if (attr->freq && attr->sample_freq)
10275                 hwc->sample_period = 1;
10276         hwc->last_period = hwc->sample_period;
10277
10278         local64_set(&hwc->period_left, hwc->sample_period);
10279
10280         /*
10281          * We currently do not support PERF_SAMPLE_READ on inherited events.
10282          * See perf_output_read().
10283          */
10284         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
10285                 goto err_ns;
10286
10287         if (!has_branch_stack(event))
10288                 event->attr.branch_sample_type = 0;
10289
10290         if (cgroup_fd != -1) {
10291                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10292                 if (err)
10293                         goto err_ns;
10294         }
10295
10296         pmu = perf_init_event(event);
10297         if (IS_ERR(pmu)) {
10298                 err = PTR_ERR(pmu);
10299                 goto err_ns;
10300         }
10301
10302         err = exclusive_event_init(event);
10303         if (err)
10304                 goto err_pmu;
10305
10306         if (has_addr_filter(event)) {
10307                 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
10308                                                    sizeof(unsigned long),
10309                                                    GFP_KERNEL);
10310                 if (!event->addr_filters_offs) {
10311                         err = -ENOMEM;
10312                         goto err_per_task;
10313                 }
10314
10315                 /* force hw sync on the address filters */
10316                 event->addr_filters_gen = 1;
10317         }
10318
10319         if (!event->parent) {
10320                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
10321                         err = get_callchain_buffers(attr->sample_max_stack);
10322                         if (err)
10323                                 goto err_addr_filters;
10324                 }
10325         }
10326
10327         /* symmetric to unaccount_event() in _free_event() */
10328         account_event(event);
10329
10330         return event;
10331
10332 err_addr_filters:
10333         kfree(event->addr_filters_offs);
10334
10335 err_per_task:
10336         exclusive_event_destroy(event);
10337
10338 err_pmu:
10339         if (event->destroy)
10340                 event->destroy(event);
10341         module_put(pmu->module);
10342 err_ns:
10343         if (is_cgroup_event(event))
10344                 perf_detach_cgroup(event);
10345         if (event->ns)
10346                 put_pid_ns(event->ns);
10347         if (event->hw.target)
10348                 put_task_struct(event->hw.target);
10349         kfree(event);
10350
10351         return ERR_PTR(err);
10352 }
10353
/*
 * Copy a perf_event_attr from user space into @attr and validate it.
 *
 * @uattr: user pointer to the attribute structure
 * @attr:  kernel destination; zeroed first, so fields beyond the size
 *         supplied by user space read as 0
 *
 * The user-supplied size field selects how many bytes are copied. A size
 * of 0 is treated as PERF_ATTR_SIZE_VER0. A structure larger than the
 * kernel's is accepted only if all the extra bytes are zero.
 *
 * Returns 0 on success, -EFAULT on a faulting user access, -E2BIG for an
 * unacceptable size (after writing sizeof(*attr) back to uattr->size),
 * -EINVAL for reserved/unknown bits or a bad user stack size, -EACCES when
 * privileged branch sampling is requested without permission, -ENOSYS when
 * user stack dumps are not supported, or the error from perf_reg_validate().
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
                return -EFAULT;

        /*
         * zero the full structure, so that a short copy will be nice.
         */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        if (size > PAGE_SIZE)   /* silly large */
                goto err_size;

        if (!size)              /* abi compat */
                size = PERF_ATTR_SIZE_VER0;

        if (size < PERF_ATTR_SIZE_VER0)
                goto err_size;

        /*
         * If we're handed a bigger struct than we know of,
         * ensure all the unknown bits are 0 - i.e. new
         * user-space does not rely on any kernel feature
         * extensions we don't know about yet.
         */
        if (size > sizeof(*attr)) {
                unsigned char __user *addr;
                unsigned char __user *end;
                unsigned char val;

                addr = (void __user *)uattr + sizeof(*attr);
                end  = (void __user *)uattr + size;

                for (; addr < end; addr++) {
                        ret = get_user(val, addr);
                        if (ret)
                                return ret;
                        if (val)
                                goto err_size;
                }
                /* Only the part the kernel knows about is copied below. */
                size = sizeof(*attr);
        }

        ret = copy_from_user(attr, uattr, size);
        if (ret)
                return -EFAULT;

        /*
         * The copy above fetched the size field from user space a second
         * time; overwrite it with the value that was actually validated.
         */
        attr->size = size;

        if (attr->__reserved_1)
                return -EINVAL;

        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                return -EINVAL;

        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 mask = attr->branch_sample_type;

                /* only using defined bits */
                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
                        return -EINVAL;

                /* at least one branch bit must be set */
                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;

                /* propagate priv level, when not set for branch */
                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                mask |= PERF_SAMPLE_BRANCH_KERNEL;

                        if (!attr->exclude_user)
                                mask |= PERF_SAMPLE_BRANCH_USER;

                        if (!attr->exclude_hv)
                                mask |= PERF_SAMPLE_BRANCH_HV;
                        /*
                         * adjust user setting (for HW filter setup)
                         */
                        attr->branch_sample_type = mask;
                }
                /* privileged levels capture (kernel, hv): check permissions */
                if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
                    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                        return -EACCES;
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * We have __u32 type for the size, but so far
                 * we can only use __u16 as maximum due to the
                 * __u16 sample size limit.
                 */
                if (attr->sample_stack_user >= USHRT_MAX)
                        return -EINVAL;
                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        /* An unset stack depth falls back to the sysctl value. */
        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);
out:
        return ret;

err_size:
        /*
         * Write back the structure size this kernel uses; a failure of
         * this write is not reported, the result stays -E2BIG.
         */
        put_user(sizeof(*attr), &uattr->size);
        ret = -E2BIG;
        goto out;
}
10488
10489 static int
10490 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
10491 {
10492         struct ring_buffer *rb = NULL;
10493         int ret = -EINVAL;
10494
10495         if (!output_event)
10496                 goto set;
10497
10498         /* don't allow circular references */
10499         if (event == output_event)
10500                 goto out;
10501
10502         /*
10503          * Don't allow cross-cpu buffers
10504          */
10505         if (output_event->cpu != event->cpu)
10506                 goto out;
10507
10508         /*
10509          * If its not a per-cpu rb, it must be the same task.
10510          */
10511         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
10512                 goto out;
10513
10514         /*
10515          * Mixing clocks in the same buffer is trouble you don't need.
10516          */
10517         if (output_event->clock != event->clock)
10518                 goto out;
10519
10520         /*
10521          * Either writing ring buffer from beginning or from end.
10522          * Mixing is not allowed.
10523          */
10524         if (is_write_backward(output_event) != is_write_backward(event))
10525                 goto out;
10526
10527         /*
10528          * If both events generate aux data, they must be on the same PMU
10529          */
10530         if (has_aux(event) && has_aux(output_event) &&
10531             event->pmu != output_event->pmu)
10532                 goto out;
10533
10534 set:
10535         mutex_lock(&event->mmap_mutex);
10536         /* Can't redirect output if we've got an active mmap() */
10537         if (atomic_read(&event->mmap_count))
10538                 goto unlock;
10539
10540         if (output_event) {
10541                 /* get the rb we want to redirect to */
10542                 rb = ring_buffer_get(output_event);
10543                 if (!rb)
10544                         goto unlock;
10545         }
10546
10547         ring_buffer_attach(event, rb);
10548
10549         ret = 0;
10550 unlock:
10551         mutex_unlock(&event->mmap_mutex);
10552
10553 out:
10554         return ret;
10555 }
10556
10557 static void mutex_lock_double(struct mutex *a, struct mutex *b)
10558 {
10559         if (b < a)
10560                 swap(a, b);
10561
10562         mutex_lock(a);
10563         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
10564 }
10565
10566 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10567 {
10568         bool nmi_safe = false;
10569
10570         switch (clk_id) {
10571         case CLOCK_MONOTONIC:
10572                 event->clock = &ktime_get_mono_fast_ns;
10573                 nmi_safe = true;
10574                 break;
10575
10576         case CLOCK_MONOTONIC_RAW:
10577                 event->clock = &ktime_get_raw_fast_ns;
10578                 nmi_safe = true;
10579                 break;
10580
10581         case CLOCK_REALTIME:
10582                 event->clock = &ktime_get_real_ns;
10583                 break;
10584
10585         case CLOCK_BOOTTIME:
10586                 event->clock = &ktime_get_boot_ns;
10587                 break;
10588
10589         case CLOCK_TAI:
10590                 event->clock = &ktime_get_tai_ns;
10591                 break;
10592
10593         default:
10594                 return -EINVAL;
10595         }
10596
10597         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
10598                 return -EINVAL;
10599
10600         return 0;
10601 }
10602
10603 /*
10604  * Variation on perf_event_ctx_lock_nested(), except we take two context
10605  * mutexes.
10606  */
10607 static struct perf_event_context *
10608 __perf_event_ctx_lock_double(struct perf_event *group_leader,
10609                              struct perf_event_context *ctx)
10610 {
10611         struct perf_event_context *gctx;
10612
10613 again:
10614         rcu_read_lock();
10615         gctx = READ_ONCE(group_leader->ctx);
10616         if (!refcount_inc_not_zero(&gctx->refcount)) {
10617                 rcu_read_unlock();
10618                 goto again;
10619         }
10620         rcu_read_unlock();
10621
10622         mutex_lock_double(&gctx->mutex, &ctx->mutex);
10623
10624         if (group_leader->ctx != gctx) {
10625                 mutex_unlock(&ctx->mutex);
10626                 mutex_unlock(&gctx->mutex);
10627                 put_ctx(gctx);
10628                 goto again;
10629         }
10630
10631         return gctx;
10632 }
10633
10634 /**
10635  * sys_perf_event_open - open a performance event, associate it to a task/cpu
10636  *
10637  * @attr_uptr:  event_id type attributes for monitoring/sampling
10638  * @pid:                target pid
10639  * @cpu:                target cpu
 * @group_fd:           group leader event fd
 * @flags:              PERF_FLAG_* bits modifying how the event is opened
 */
10642 SYSCALL_DEFINE5(perf_event_open,
10643                 struct perf_event_attr __user *, attr_uptr,
10644                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10645 {
10646         struct perf_event *group_leader = NULL, *output_event = NULL;
10647         struct perf_event *event, *sibling;
10648         struct perf_event_attr attr;
10649         struct perf_event_context *ctx, *uninitialized_var(gctx);
10650         struct file *event_file = NULL;
10651         struct fd group = {NULL, 0};
10652         struct task_struct *task = NULL;
10653         struct pmu *pmu;
10654         int event_fd;
10655         int move_group = 0;
10656         int err;
10657         int f_flags = O_RDWR;
10658         int cgroup_fd = -1;
10659
10660         /* for future expandability... */
10661         if (flags & ~PERF_FLAG_ALL)
10662                 return -EINVAL;
10663
10664         err = perf_copy_attr(attr_uptr, &attr);
10665         if (err)
10666                 return err;
10667
10668         if (!attr.exclude_kernel) {
10669                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10670                         return -EACCES;
10671         }
10672
10673         if (attr.namespaces) {
10674                 if (!capable(CAP_SYS_ADMIN))
10675                         return -EACCES;
10676         }
10677
10678         if (attr.freq) {
10679                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
10680                         return -EINVAL;
10681         } else {
10682                 if (attr.sample_period & (1ULL << 63))
10683                         return -EINVAL;
10684         }
10685
10686         /* Only privileged users can get physical addresses */
10687         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
10688             perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10689                 return -EACCES;
10690
10691         /*
10692          * In cgroup mode, the pid argument is used to pass the fd
10693          * opened to the cgroup directory in cgroupfs. The cpu argument
10694          * designates the cpu on which to monitor threads from that
10695          * cgroup.
10696          */
10697         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
10698                 return -EINVAL;
10699
10700         if (flags & PERF_FLAG_FD_CLOEXEC)
10701                 f_flags |= O_CLOEXEC;
10702
10703         event_fd = get_unused_fd_flags(f_flags);
10704         if (event_fd < 0)
10705                 return event_fd;
10706
10707         if (group_fd != -1) {
10708                 err = perf_fget_light(group_fd, &group);
10709                 if (err)
10710                         goto err_fd;
10711                 group_leader = group.file->private_data;
10712                 if (flags & PERF_FLAG_FD_OUTPUT)
10713                         output_event = group_leader;
10714                 if (flags & PERF_FLAG_FD_NO_GROUP)
10715                         group_leader = NULL;
10716         }
10717
10718         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10719                 task = find_lively_task_by_vpid(pid);
10720                 if (IS_ERR(task)) {
10721                         err = PTR_ERR(task);
10722                         goto err_group_fd;
10723                 }
10724         }
10725
10726         if (task && group_leader &&
10727             group_leader->attr.inherit != attr.inherit) {
10728                 err = -EINVAL;
10729                 goto err_task;
10730         }
10731
10732         if (task) {
10733                 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10734                 if (err)
10735                         goto err_task;
10736
10737                 /*
10738                  * Reuse ptrace permission checks for now.
10739                  *
10740                  * We must hold cred_guard_mutex across this and any potential
10741                  * perf_install_in_context() call for this new event to
10742                  * serialize against exec() altering our credentials (and the
10743                  * perf_event_exit_task() that could imply).
10744                  */
10745                 err = -EACCES;
10746                 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10747                         goto err_cred;
10748         }
10749
10750         if (flags & PERF_FLAG_PID_CGROUP)
10751                 cgroup_fd = pid;
10752
10753         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10754                                  NULL, NULL, cgroup_fd);
10755         if (IS_ERR(event)) {
10756                 err = PTR_ERR(event);
10757                 goto err_cred;
10758         }
10759
10760         if (is_sampling_event(event)) {
10761                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10762                         err = -EOPNOTSUPP;
10763                         goto err_alloc;
10764                 }
10765         }
10766
10767         /*
10768          * Special case software events and allow them to be part of
10769          * any hardware group.
10770          */
10771         pmu = event->pmu;
10772
10773         if (attr.use_clockid) {
10774                 err = perf_event_set_clock(event, attr.clockid);
10775                 if (err)
10776                         goto err_alloc;
10777         }
10778
10779         if (pmu->task_ctx_nr == perf_sw_context)
10780                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10781
10782         if (group_leader) {
10783                 if (is_software_event(event) &&
10784                     !in_software_context(group_leader)) {
10785                         /*
10786                          * If the event is a sw event, but the group_leader
10787                          * is on hw context.
10788                          *
10789                          * Allow the addition of software events to hw
10790                          * groups, this is safe because software events
10791                          * never fail to schedule.
10792                          */
10793                         pmu = group_leader->ctx->pmu;
10794                 } else if (!is_software_event(event) &&
10795                            is_software_event(group_leader) &&
10796                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10797                         /*
10798                          * In case the group is a pure software group, and we
10799                          * try to add a hardware event, move the whole group to
10800                          * the hardware context.
10801                          */
10802                         move_group = 1;
10803                 }
10804         }
10805
10806         /*
10807          * Get the target context (task or percpu):
10808          */
10809         ctx = find_get_context(pmu, task, event);
10810         if (IS_ERR(ctx)) {
10811                 err = PTR_ERR(ctx);
10812                 goto err_alloc;
10813         }
10814
10815         if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10816                 err = -EBUSY;
10817                 goto err_context;
10818         }
10819
10820         /*
10821          * Look up the group leader (we will attach this event to it):
10822          */
10823         if (group_leader) {
10824                 err = -EINVAL;
10825
10826                 /*
10827                  * Do not allow a recursive hierarchy (this new sibling
10828                  * becoming part of another group-sibling):
10829                  */
10830                 if (group_leader->group_leader != group_leader)
10831                         goto err_context;
10832
10833                 /* All events in a group should have the same clock */
10834                 if (group_leader->clock != event->clock)
10835                         goto err_context;
10836
10837                 /*
10838                  * Make sure we're both events for the same CPU;
10839                  * grouping events for different CPUs is broken; since
10840                  * you can never concurrently schedule them anyhow.
10841                  */
10842                 if (group_leader->cpu != event->cpu)
10843                         goto err_context;
10844
10845                 /*
10846                  * Make sure we're both on the same task, or both
10847                  * per-CPU events.
10848                  */
10849                 if (group_leader->ctx->task != ctx->task)
10850                         goto err_context;
10851
10852                 /*
10853                  * Do not allow to attach to a group in a different task
10854                  * or CPU context. If we're moving SW events, we'll fix
10855                  * this up later, so allow that.
10856                  */
10857                 if (!move_group && group_leader->ctx != ctx)
10858                         goto err_context;
10859
10860                 /*
10861                  * Only a group leader can be exclusive or pinned
10862                  */
10863                 if (attr.exclusive || attr.pinned)
10864                         goto err_context;
10865         }
10866
10867         if (output_event) {
10868                 err = perf_event_set_output(event, output_event);
10869                 if (err)
10870                         goto err_context;
10871         }
10872
10873         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10874                                         f_flags);
10875         if (IS_ERR(event_file)) {
10876                 err = PTR_ERR(event_file);
10877                 event_file = NULL;
10878                 goto err_context;
10879         }
10880
10881         if (move_group) {
10882                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10883
10884                 if (gctx->task == TASK_TOMBSTONE) {
10885                         err = -ESRCH;
10886                         goto err_locked;
10887                 }
10888
10889                 /*
10890                  * Check if we raced against another sys_perf_event_open() call
10891                  * moving the software group underneath us.
10892                  */
10893                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10894                         /*
10895                          * If someone moved the group out from under us, check
10896                          * if this new event wound up on the same ctx, if so
10897                          * its the regular !move_group case, otherwise fail.
10898                          */
10899                         if (gctx != ctx) {
10900                                 err = -EINVAL;
10901                                 goto err_locked;
10902                         } else {
10903                                 perf_event_ctx_unlock(group_leader, gctx);
10904                                 move_group = 0;
10905                         }
10906                 }
10907         } else {
10908                 mutex_lock(&ctx->mutex);
10909         }
10910
10911         if (ctx->task == TASK_TOMBSTONE) {
10912                 err = -ESRCH;
10913                 goto err_locked;
10914         }
10915
10916         if (!perf_event_validate_size(event)) {
10917                 err = -E2BIG;
10918                 goto err_locked;
10919         }
10920
10921         if (!task) {
10922                 /*
10923                  * Check if the @cpu we're creating an event for is online.
10924                  *
10925                  * We use the perf_cpu_context::ctx::mutex to serialize against
10926                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10927                  */
10928                 struct perf_cpu_context *cpuctx =
10929                         container_of(ctx, struct perf_cpu_context, ctx);
10930
10931                 if (!cpuctx->online) {
10932                         err = -ENODEV;
10933                         goto err_locked;
10934                 }
10935         }
10936
10937
10938         /*
10939          * Must be under the same ctx::mutex as perf_install_in_context(),
10940          * because we need to serialize with concurrent event creation.
10941          */
10942         if (!exclusive_event_installable(event, ctx)) {
10943                 /* exclusive and group stuff are assumed mutually exclusive */
10944                 WARN_ON_ONCE(move_group);
10945
10946                 err = -EBUSY;
10947                 goto err_locked;
10948         }
10949
10950         WARN_ON_ONCE(ctx->parent_ctx);
10951
10952         /*
10953          * This is the point on no return; we cannot fail hereafter. This is
10954          * where we start modifying current state.
10955          */
10956
10957         if (move_group) {
10958                 /*
10959                  * See perf_event_ctx_lock() for comments on the details
10960                  * of swizzling perf_event::ctx.
10961                  */
10962                 perf_remove_from_context(group_leader, 0);
10963                 put_ctx(gctx);
10964
10965                 for_each_sibling_event(sibling, group_leader) {
10966                         perf_remove_from_context(sibling, 0);
10967                         put_ctx(gctx);
10968                 }
10969
10970                 /*
10971                  * Wait for everybody to stop referencing the events through
10972                  * the old lists, before installing it on new lists.
10973                  */
10974                 synchronize_rcu();
10975
10976                 /*
10977                  * Install the group siblings before the group leader.
10978                  *
10979                  * Because a group leader will try and install the entire group
10980                  * (through the sibling list, which is still in-tact), we can
10981                  * end up with siblings installed in the wrong context.
10982                  *
10983                  * By installing siblings first we NO-OP because they're not
10984                  * reachable through the group lists.
10985                  */
10986                 for_each_sibling_event(sibling, group_leader) {
10987                         perf_event__state_init(sibling);
10988                         perf_install_in_context(ctx, sibling, sibling->cpu);
10989                         get_ctx(ctx);
10990                 }
10991
10992                 /*
10993                  * Removing from the context ends up with disabled
10994                  * event. What we want here is event in the initial
10995                  * startup state, ready to be add into new context.
10996                  */
10997                 perf_event__state_init(group_leader);
10998                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10999                 get_ctx(ctx);
11000         }
11001
11002         /*
11003          * Precalculate sample_data sizes; do while holding ctx::mutex such
11004          * that we're serialized against further additions and before
11005          * perf_install_in_context() which is the point the event is active and
11006          * can use these values.
11007          */
11008         perf_event__header_size(event);
11009         perf_event__id_header_size(event);
11010
11011         event->owner = current;
11012
11013         perf_install_in_context(ctx, event, event->cpu);
11014         perf_unpin_context(ctx);
11015
11016         if (move_group)
11017                 perf_event_ctx_unlock(group_leader, gctx);
11018         mutex_unlock(&ctx->mutex);
11019
11020         if (task) {
11021                 mutex_unlock(&task->signal->cred_guard_mutex);
11022                 put_task_struct(task);
11023         }
11024
11025         mutex_lock(&current->perf_event_mutex);
11026         list_add_tail(&event->owner_entry, &current->perf_event_list);
11027         mutex_unlock(&current->perf_event_mutex);
11028
11029         /*
11030          * Drop the reference on the group_event after placing the
11031          * new event on the sibling_list. This ensures destruction
11032          * of the group leader will find the pointer to itself in
11033          * perf_group_detach().
11034          */
11035         fdput(group);
11036         fd_install(event_fd, event_file);
11037         return event_fd;
11038
11039 err_locked:
11040         if (move_group)
11041                 perf_event_ctx_unlock(group_leader, gctx);
11042         mutex_unlock(&ctx->mutex);
11043 /* err_file: */
11044         fput(event_file);
11045 err_context:
11046         perf_unpin_context(ctx);
11047         put_ctx(ctx);
11048 err_alloc:
11049         /*
11050          * If event_file is set, the fput() above will have called ->release()
11051          * and that will take care of freeing the event.
11052          */
11053         if (!event_file)
11054                 free_event(event);
11055 err_cred:
11056         if (task)
11057                 mutex_unlock(&task->signal->cred_guard_mutex);
11058 err_task:
11059         if (task)
11060                 put_task_struct(task);
11061 err_group_fd:
11062         fdput(group);
11063 err_fd:
11064         put_unused_fd(event_fd);
11065         return err;
11066 }
11067
11068 /**
11069  * perf_event_create_kernel_counter
11070  *
11071  * @attr: attributes of the counter to create
11072  * @cpu: cpu in which the counter is bound
11073  * @task: task to profile (NULL for percpu)
       * @overflow_handler: overflow callback passed on to perf_event_alloc()
       * @context: opaque pointer passed on to perf_event_alloc() with the handler
       *
       * The event is created without a group leader or cgroup, marked as
       * kernel-owned (owner == TASK_TOMBSTONE) and installed in the task or
       * per-CPU context returned by find_get_context().
       *
       * Returns the installed event, or an ERR_PTR() encoded negative errno:
       * -ESRCH if the context's task is already dead, -ENODEV if the per-CPU
       * context is offline, -EBUSY if an exclusive event cannot be installed,
       * or whatever perf_event_alloc() / find_get_context() reported.
11074  */
11075 struct perf_event *
11076 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
11077                                  struct task_struct *task,
11078                                  perf_overflow_handler_t overflow_handler,
11079                                  void *context)
11080 {
11081         struct perf_event_context *ctx;
11082         struct perf_event *event;
11083         int err;
11084
11085         /*
11086          * Allocate the event first; its target context is looked up below.
11087          */
11088
11089         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
11090                                  overflow_handler, context, -1);
11091         if (IS_ERR(event)) {
11092                 err = PTR_ERR(event);
11093                 goto err;
11094         }
11095
11096         /* Mark owner so we could distinguish it from user events. */
11097         event->owner = TASK_TOMBSTONE;
11098
              /*
               * Get the target context (task or percpu):
               */
11099         ctx = find_get_context(event->pmu, task, event);
11100         if (IS_ERR(ctx)) {
11101                 err = PTR_ERR(ctx);
11102                 goto err_free;
11103         }
11104
11105         WARN_ON_ONCE(ctx->parent_ctx);
11106         mutex_lock(&ctx->mutex);
11107         if (ctx->task == TASK_TOMBSTONE) {
11108                 err = -ESRCH;
11109                 goto err_unlock;
11110         }
11111
11112         if (!task) {
11113                 /*
11114                  * Check if the @cpu we're creating an event for is online.
11115                  *
11116                  * We use the perf_cpu_context::ctx::mutex to serialize against
11117                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11118                  */
11119                 struct perf_cpu_context *cpuctx =
11120                         container_of(ctx, struct perf_cpu_context, ctx);
11121                 if (!cpuctx->online) {
11122                         err = -ENODEV;
11123                         goto err_unlock;
11124                 }
11125         }
11126
11127         if (!exclusive_event_installable(event, ctx)) {
11128                 err = -EBUSY;
11129                 goto err_unlock;
11130         }
11131
              /*
               * Install on event->cpu, exactly as sys_perf_event_open() does,
               * rather than on the raw @cpu argument: the context above was
               * looked up from the event, so the install target has to follow
               * the event as well.
               * NOTE(review): upstream made this change because PMU drivers may
               * rewrite event->cpu while initialising the event.
               */
11132         perf_install_in_context(ctx, event, event->cpu);
11133         perf_unpin_context(ctx);
11134         mutex_unlock(&ctx->mutex);
11135
11136         return event;
11137
      /* Unwind in reverse order of acquisition; labels fall through. */
11138 err_unlock:
11139         mutex_unlock(&ctx->mutex);
11140         perf_unpin_context(ctx);
11141         put_ctx(ctx);
11142 err_free:
11143         free_event(event);
11144 err:
11145         return ERR_PTR(err);
11146 }
11147 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
11148
      /*
       * perf_pmu_migrate_context - move all events of @pmu from the per-CPU
       * context of @src_cpu to the per-CPU context of @dst_cpu.
       *
       * With both context mutexes held, every event on the source context's
       * event_list is removed (dropping its source-context reference) and
       * collected on a private list. After an RCU grace period the events are
       * installed on the destination context, group siblings first and group
       * leaders second, each taking a destination-context reference.
       */
11149 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
11150 {
11151         struct perf_event_context *src_ctx;
11152         struct perf_event_context *dst_ctx;
11153         struct perf_event *event, *tmp;
              /* Private list holding the events while they are on neither context. */
11154         LIST_HEAD(events);
11155
11156         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
11157         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
11158
11159         /*
11160          * See perf_event_ctx_lock() for comments on the details
11161          * of swizzling perf_event::ctx.
11162          */
11163         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
11164         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
11165                                  event_entry) {
11166                 perf_remove_from_context(event, 0);
11167                 unaccount_event_cpu(event, src_cpu);
11168                 put_ctx(src_ctx);
11169                 list_add(&event->migrate_entry, &events);
11170         }
11171
11172         /*
11173          * Wait for the events to quiesce before re-instating them.
11174          */
11175         synchronize_rcu();
11176
11177         /*
11178          * Re-instate events in 2 passes.
11179          *
11180          * Skip over group leaders and only install siblings on this first
11181          * pass, siblings will not get enabled without a leader, however a
11182          * leader will enable its siblings, even if those are still on the old
11183          * context.
11184          */
11185         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11186                 if (event->group_leader == event)
11187                         continue;
11188
11189                 list_del(&event->migrate_entry);
                      /* Events at or above OFF are reset to INACTIVE before install. */
11190                 if (event->state >= PERF_EVENT_STATE_OFF)
11191                         event->state = PERF_EVENT_STATE_INACTIVE;
11192                 account_event_cpu(event, dst_cpu);
11193                 perf_install_in_context(dst_ctx, event, dst_cpu);
11194                 get_ctx(dst_ctx);
11195         }
11196
11197         /*
11198          * Once all the siblings are setup properly, install the group leaders
11199          * to make it go.
11200          */
11201         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11202                 list_del(&event->migrate_entry);
11203                 if (event->state >= PERF_EVENT_STATE_OFF)
11204                         event->state = PERF_EVENT_STATE_INACTIVE;
11205                 account_event_cpu(event, dst_cpu);
11206                 perf_install_in_context(dst_ctx, event, dst_cpu);
11207                 get_ctx(dst_ctx);
11208         }
11209         mutex_unlock(&dst_ctx->mutex);
11210         mutex_unlock(&src_ctx->mutex);
11211 }
11212 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
11213
      /*
       * Fold the final numbers of @child_event into its parent event: the event
       * count and the total enabled/running times are added atomically to the
       * parent's child_* accumulators.
       *
       * If attr.inherit_stat is set, perf_event_read_event() is called for
       * @child_event and @child before the count is sampled.
       */
11214 static void sync_child_event(struct perf_event *child_event,
11215                                struct task_struct *child)
11216 {
11217         struct perf_event *parent_event = child_event->parent;
11218         u64 child_val;
11219
11220         if (child_event->attr.inherit_stat)
11221                 perf_event_read_event(child_event, child);
11222
11223         child_val = perf_event_count(child_event);
11224
11225         /*
11226          * Add back the child's count to the parent's count:
11227          */
11228         atomic64_add(child_val, &parent_event->child_count);
11229         atomic64_add(child_event->total_time_enabled,
11230                      &parent_event->child_total_time_enabled);
11231         atomic64_add(child_event->total_time_running,
11232                      &parent_event->child_total_time_running);
11233 }
11234
      /*
       * Take @child_event off @child_ctx as part of tearing down @child's
       * context and mark it PERF_EVENT_STATE_EXIT.
       *
       * Events without a parent (the originals) are only woken up and otherwise
       * left alone. Inherited events have their counts folded into the parent,
       * are unlinked from the parent's child_list and freed, and the parent
       * reference they held is dropped.
       */
11235 static void
11236 perf_event_exit_event(struct perf_event *child_event,
11237                       struct perf_event_context *child_ctx,
11238                       struct task_struct *child)
11239 {
11240         struct perf_event *parent_event = child_event->parent;
11241
11242         /*
11243          * Do not destroy the 'original' grouping; because of the context
11244          * switch optimization the original events could've ended up in a
11245          * random child task.
11246          *
11247          * If we were to destroy the original group, all group related
11248          * operations would cease to function properly after this random
11249          * child dies.
11250          *
11251          * Do destroy all inherited groups, we don't care about those
11252          * and being thorough is better.
11253          */
11254         raw_spin_lock_irq(&child_ctx->lock);
11255         WARN_ON_ONCE(child_ctx->is_active);
11256
              /* Only inherited events (those with a parent) leave their group. */
11257         if (parent_event)
11258                 perf_group_detach(child_event);
11259         list_del_event(child_event, child_ctx);
11260         perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
11261         raw_spin_unlock_irq(&child_ctx->lock);
11262
11263         /*
11264          * Parent events are governed by their filedesc, retain them.
11265          */
11266         if (!parent_event) {
11267                 perf_event_wakeup(child_event);
11268                 return;
11269         }
11270         /*
11271          * Child events can be cleaned up.
11272          */
11273
11274         sync_child_event(child_event, child);
11275
11276         /*
11277          * Remove this event from the parent's list
11278          */
11279         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
11280         mutex_lock(&parent_event->child_mutex);
11281         list_del_init(&child_event->child_list);
11282         mutex_unlock(&parent_event->child_mutex);
11283
11284         /*
11285          * Kick perf_poll() for is_event_hup().
11286          */
11287         perf_event_wakeup(parent_event);
11288         free_event(child_event);
11289         put_event(parent_event);
11290 }
11291
      /*
       * Tear down the perf context in slot @ctxn of the exiting task @child,
       * which must be current.
       *
       * Under ctx::mutex the context is scheduled out, detached from the task and
       * marked dead (ctx->task = TASK_TOMBSTONE); then every event still on its
       * event_list is passed to perf_event_exit_event(). Does nothing if the
       * task has no context in that slot.
       */
11292 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
11293 {
11294         struct perf_event_context *child_ctx, *clone_ctx = NULL;
11295         struct perf_event *child_event, *next;
11296
11297         WARN_ON_ONCE(child != current);
11298
11299         child_ctx = perf_pin_task_context(child, ctxn);
11300         if (!child_ctx)
11301                 return;
11302
11303         /*
11304          * In order to reduce the amount of tricky in ctx tear-down, we hold
11305          * ctx::mutex over the entire thing. This serializes against almost
11306          * everything that wants to access the ctx.
11307          *
11308          * The exception is sys_perf_event_open() /
11309          * perf_event_create_kernel_counter() which does find_get_context()
11310          * without ctx::mutex (it cannot because of the move_group double mutex
11311          * lock thing). See the comments in perf_install_in_context().
11312          */
11313         mutex_lock(&child_ctx->mutex);
11314
11315         /*
11316          * In a single ctx::lock section, de-schedule the events and detach the
11317          * context from the task such that we cannot ever get it scheduled back
11318          * in.
11319          */
11320         raw_spin_lock_irq(&child_ctx->lock);
11321         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
11322
11323         /*
11324          * Now that the context is inactive, destroy the task <-> ctx relation
11325          * and mark the context dead.
11326          */
11327         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
11328         put_ctx(child_ctx); /* cannot be last */
11329         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
11330         put_task_struct(current); /* cannot be last */
11331
11332         clone_ctx = unclone_ctx(child_ctx);
11333         raw_spin_unlock_irq(&child_ctx->lock);
11334
              /* Drop the reference handed back by unclone_ctx() outside ctx::lock. */
11335         if (clone_ctx)
11336                 put_ctx(clone_ctx);
11337
11338         /*
11339          * Report the task dead after unscheduling the events so that we
11340          * won't get any samples after PERF_RECORD_EXIT. We can however still
11341          * get a few PERF_RECORD_READ events.
11342          */
11343         perf_event_task(child, child_ctx, 0);
11344
              /* _safe iteration: perf_event_exit_event() unlinks each event. */
11345         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
11346                 perf_event_exit_event(child_event, child_ctx, child);
11347
11348         mutex_unlock(&child_ctx->mutex);
11349
11350         put_ctx(child_ctx);
11351 }
11352
11353 /*
11354  * When a child task exits, feed back event values to parent events.
11355  *
11356  * Can be called with cred_guard_mutex held when called from
11357  * install_exec_creds().
       *
       * Three steps: disown every event on @child's perf_event_list (unlink
       * it and clear event->owner), tear down each of @child's task contexts,
       * then call perf_event_task() once more with a NULL context.
11358  */
11359 void perf_event_exit_task(struct task_struct *child)
11360 {
11361         struct perf_event *event, *tmp;
11362         int ctxn;
11363
11364         mutex_lock(&child->perf_event_mutex);
11365         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
11366                                  owner_entry) {
11367                 list_del_init(&event->owner_entry);
11368
11369                 /*
11370                  * Ensure the list deletion is visible before we clear
11371                  * the owner, closes a race against perf_release() where
11372                  * we need to serialize on the owner->perf_event_mutex.
11373                  */
11374                 smp_store_release(&event->owner, NULL);
11375         }
11376         mutex_unlock(&child->perf_event_mutex);
11377
11378         for_each_task_context_nr(ctxn)
11379                 perf_event_exit_task_context(child, ctxn);
11380
11381         /*
11382          * The perf_event_exit_task_context calls perf_event_task
11383          * with child's task_ctx, which generates EXIT events for
11384          * child contexts and sets child->perf_event_ctxp[] to NULL.
11385          * At this point we need to send EXIT events to cpu contexts.
11386          */
11387         perf_event_task(child, NULL, 0);
11388 }
11389
      /*
       * Free one inherited @event that lives on @ctx.
       *
       * The event is unlinked from its parent's child_list, the parent reference
       * it held is dropped, and it is detached from its group and from @ctx
       * before being freed. Warns and does nothing for an event that has no
       * parent.
       */
11390 static void perf_free_event(struct perf_event *event,
11391                             struct perf_event_context *ctx)
11392 {
11393         struct perf_event *parent = event->parent;
11394
11395         if (WARN_ON_ONCE(!parent))
11396                 return;
11397
11398         mutex_lock(&parent->child_mutex);
11399         list_del_init(&event->child_list);
11400         mutex_unlock(&parent->child_mutex);
11401
11402         put_event(parent);
11403
11404         raw_spin_lock_irq(&ctx->lock);
11405         perf_group_detach(event);
11406         list_del_event(event, ctx);
11407         raw_spin_unlock_irq(&ctx->lock);
11408         free_event(event);
11409 }
11410
11411 /*
11412  * Free an unexposed, unused context as created by inheritance by
11413  * perf_event_init_task below, used by fork() in case of failure.
11414  *
11415  * Not all locks are strictly required, but take them anyway to be nice and
11416  * help out with the lockdep assertions.
       *
       * For every context slot of @task that is populated: the slot is
       * cleared, the context is marked dead, each event on it is released
       * through perf_free_event(), and one context reference is dropped.
11417  */
11418 void perf_event_free_task(struct task_struct *task)
11419 {
11420         struct perf_event_context *ctx;
11421         struct perf_event *event, *tmp;
11422         int ctxn;
11423
11424         for_each_task_context_nr(ctxn) {
11425                 ctx = task->perf_event_ctxp[ctxn];
11426                 if (!ctx)
11427                         continue;
11428
11429                 mutex_lock(&ctx->mutex);
11430                 raw_spin_lock_irq(&ctx->lock);
11431                 /*
11432                  * Destroy the task <-> ctx relation and mark the context dead.
11433                  *
11434                  * This is important because even though the task hasn't been
11435                  * exposed yet the context has been (through child_list).
11436                  */
11437                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
11438                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
11439                 put_task_struct(task); /* cannot be last */
11440                 raw_spin_unlock_irq(&ctx->lock);
11441
                      /* _safe iteration: perf_free_event() unlinks and frees each event. */
11442                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
11443                         perf_free_event(event, ctx);
11444
11445                 mutex_unlock(&ctx->mutex);
11446                 put_ctx(ctx);
11447         }
11448 }
11449
      /*
       * Sanity check: warn if any perf_event_ctxp[] slot of @task is still set.
       * Nothing is freed or modified here.
       */
11450 void perf_event_delayed_put(struct task_struct *task)
11451 {
11452         int ctxn;
11453
11454         for_each_task_context_nr(ctxn)
11455                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
11456 }
11457
      /*
       * perf_event_get - look up @fd and make sure it is a perf event file.
       *
       * Returns the file with the reference taken by fget_raw() on success; the
       * caller is then responsible for dropping it. Returns ERR_PTR(-EBADF) when
       * @fd does not resolve to a file or the file is not backed by perf_fops.
       */
11458 struct file *perf_event_get(unsigned int fd)
11459 {
11460         struct file *filp = fget_raw(fd);
11461
11462         if (!filp)
11463                 return ERR_PTR(-EBADF);
11464
11465         if (filp->f_op == &perf_fops)
11466                 return filp;
11467
11468         /* Not a perf event file: give back the reference taken above. */
11469         fput(filp);
11470
11471         return ERR_PTR(-EBADF);
11472 }
11473
      /*
       * perf_get_event - fetch the perf_event stored in a perf event file.
       *
       * Returns @file's private_data when @file is backed by perf_fops, and
       * ERR_PTR(-EINVAL) for any other kind of file. No reference is taken.
       */
11474 const struct perf_event *perf_get_event(struct file *file)
11475 {
11476         if (file->f_op == &perf_fops)
11477                 return file->private_data;
11478
11479         return ERR_PTR(-EINVAL);
11480 }
11481
      /*
       * perf_event_attrs - expose the attribute block of @event.
       *
       * Returns a pointer to the attr embedded in @event, or ERR_PTR(-EINVAL)
       * when @event is NULL.
       */
11482 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
11483 {
11484         if (event)
11485                 return &event->attr;
11486
11487         return ERR_PTR(-EINVAL);
11488 }
11489
11490 /*
11491  * Inherit an event from parent task to child task.
11492  *
11493  * Returns:
11494  *  - valid pointer on success
11495  *  - NULL for orphaned events
11496  *  - IS_ERR() on error
11497  */
11498 static struct perf_event *
11499 inherit_event(struct perf_event *parent_event,
11500               struct task_struct *parent,
11501               struct perf_event_context *parent_ctx,
11502               struct task_struct *child,
11503               struct perf_event *group_leader,
11504               struct perf_event_context *child_ctx)
11505 {
11506         enum perf_event_state parent_state = parent_event->state;
11507         struct perf_event *child_event;
11508         unsigned long flags;
11509
11510         /*
11511          * Instead of creating recursive hierarchies of events,
11512          * we link inherited events back to the original parent,
11513          * which has a filp for sure, which we use as the reference
11514          * count:
11515          */
11516         if (parent_event->parent)
11517                 parent_event = parent_event->parent;
11518
11519         child_event = perf_event_alloc(&parent_event->attr,
11520                                            parent_event->cpu,
11521                                            child,
11522                                            group_leader, parent_event,
11523                                            NULL, NULL, -1);
11524         if (IS_ERR(child_event))
11525                 return child_event;
11526
11527
11528         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
11529             !child_ctx->task_ctx_data) {
11530                 struct pmu *pmu = child_event->pmu;
11531
11532                 child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11533                                                    GFP_KERNEL);
11534                 if (!child_ctx->task_ctx_data) {
11535                         free_event(child_event);
11536                         return NULL;
11537                 }
11538         }
11539
11540         /*
11541          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
11542          * must be under the same lock in order to serialize against
11543          * perf_event_release_kernel(), such that either we must observe
11544          * is_orphaned_event() or they will observe us on the child_list.
11545          */
11546         mutex_lock(&parent_event->child_mutex);
11547         if (is_orphaned_event(parent_event) ||
11548             !atomic_long_inc_not_zero(&parent_event->refcount)) {
11549                 mutex_unlock(&parent_event->child_mutex);
11550                 /* task_ctx_data is freed with child_ctx */
11551                 free_event(child_event);
11552                 return NULL;
11553         }
11554
11555         get_ctx(child_ctx);
11556
11557         /*
11558          * Make the child state follow the state of the parent event,
11559          * not its attr.disabled bit.  We hold the parent's mutex,
11560          * so we won't race with perf_event_{en, dis}able_family.
11561          */
11562         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
11563                 child_event->state = PERF_EVENT_STATE_INACTIVE;
11564         else
11565                 child_event->state = PERF_EVENT_STATE_OFF;
11566
11567         if (parent_event->attr.freq) {
11568                 u64 sample_period = parent_event->hw.sample_period;
11569                 struct hw_perf_event *hwc = &child_event->hw;
11570
11571                 hwc->sample_period = sample_period;
11572                 hwc->last_period   = sample_period;
11573
11574                 local64_set(&hwc->period_left, sample_period);
11575         }
11576
11577         child_event->ctx = child_ctx;
11578         child_event->overflow_handler = parent_event->overflow_handler;
11579         child_event->overflow_handler_context
11580                 = parent_event->overflow_handler_context;
11581
11582         /*
11583          * Precalculate sample_data sizes
11584          */
11585         perf_event__header_size(child_event);
11586         perf_event__id_header_size(child_event);
11587
11588         /*
11589          * Link it up in the child's context:
11590          */
11591         raw_spin_lock_irqsave(&child_ctx->lock, flags);
11592         add_event_to_ctx(child_event, child_ctx);
11593         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
11594
11595         /*
11596          * Link this into the parent event's child list
11597          */
11598         list_add_tail(&child_event->child_list, &parent_event->child_list);
11599         mutex_unlock(&parent_event->child_mutex);
11600
11601         return child_event;
11602 }
11603
11604 /*
11605  * Inherits an event group.
11606  *
11607  * This will quietly suppress orphaned events; !inherit_event() is not an error.
11608  * This matches with perf_event_release_kernel() removing all child events.
11609  *
11610  * Returns:
11611  *  - 0 on success
11612  *  - <0 on error
11613  */
11614 static int inherit_group(struct perf_event *parent_event,
11615               struct task_struct *parent,
11616               struct perf_event_context *parent_ctx,
11617               struct task_struct *child,
11618               struct perf_event_context *child_ctx)
11619 {
11620         struct perf_event *leader;
11621         struct perf_event *sub;
11622         struct perf_event *child_ctr;
11623
11624         leader = inherit_event(parent_event, parent, parent_ctx,
11625                                  child, NULL, child_ctx);
11626         if (IS_ERR(leader))
11627                 return PTR_ERR(leader);
11628         /*
11629          * @leader can be NULL here because of is_orphaned_event(). In this
11630          * case inherit_event() will create individual events, similar to what
11631          * perf_group_detach() would do anyway.
11632          */
11633         for_each_sibling_event(sub, parent_event) {
11634                 child_ctr = inherit_event(sub, parent, parent_ctx,
11635                                             child, leader, child_ctx);
11636                 if (IS_ERR(child_ctr))
11637                         return PTR_ERR(child_ctr);
11638         }
11639         return 0;
11640 }
11641
11642 /*
11643  * Creates the child task context and tries to inherit the event-group.
11644  *
11645  * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
11646  * inherited_all set when we 'fail' to inherit an orphaned event; this is
11647  * consistent with perf_event_release_kernel() removing all child events.
11648  *
11649  * Returns:
11650  *  - 0 on success
11651  *  - <0 on error
11652  */
11653 static int
11654 inherit_task_group(struct perf_event *event, struct task_struct *parent,
11655                    struct perf_event_context *parent_ctx,
11656                    struct task_struct *child, int ctxn,
11657                    int *inherited_all)
11658 {
11659         int ret;
11660         struct perf_event_context *child_ctx;
11661
11662         if (!event->attr.inherit) {
11663                 *inherited_all = 0;
11664                 return 0;
11665         }
11666
11667         child_ctx = child->perf_event_ctxp[ctxn];
11668         if (!child_ctx) {
11669                 /*
11670                  * This is executed from the parent task context, so
11671                  * inherit events that have been marked for cloning.
11672                  * First allocate and initialize a context for the
11673                  * child.
11674                  */
11675                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11676                 if (!child_ctx)
11677                         return -ENOMEM;
11678
11679                 child->perf_event_ctxp[ctxn] = child_ctx;
11680         }
11681
11682         ret = inherit_group(event, parent, parent_ctx,
11683                             child, child_ctx);
11684
11685         if (ret)
11686                 *inherited_all = 0;
11687
11688         return ret;
11689 }
11690
11691 /*
11692  * Initialize the perf_event context in task_struct
11693  */
11694 static int perf_event_init_context(struct task_struct *child, int ctxn)
11695 {
11696         struct perf_event_context *child_ctx, *parent_ctx;
11697         struct perf_event_context *cloned_ctx;
11698         struct perf_event *event;
11699         struct task_struct *parent = current;
11700         int inherited_all = 1;
11701         unsigned long flags;
11702         int ret = 0;
11703
11704         if (likely(!parent->perf_event_ctxp[ctxn]))
11705                 return 0;
11706
11707         /*
11708          * If the parent's context is a clone, pin it so it won't get
11709          * swapped under us.
11710          */
11711         parent_ctx = perf_pin_task_context(parent, ctxn);
11712         if (!parent_ctx)
11713                 return 0;
11714
11715         /*
11716          * No need to check if parent_ctx != NULL here; since we saw
11717          * it non-NULL earlier, the only reason for it to become NULL
11718          * is if we exit, and since we're currently in the middle of
11719          * a fork we can't be exiting at the same time.
11720          */
11721
11722         /*
11723          * Lock the parent list. No need to lock the child - not PID
11724          * hashed yet and not running, so nobody can access it.
11725          */
11726         mutex_lock(&parent_ctx->mutex);
11727
11728         /*
11729          * We dont have to disable NMIs - we are only looking at
11730          * the list, not manipulating it:
11731          */
11732         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
11733                 ret = inherit_task_group(event, parent, parent_ctx,
11734                                          child, ctxn, &inherited_all);
11735                 if (ret)
11736                         goto out_unlock;
11737         }
11738
11739         /*
11740          * We can't hold ctx->lock when iterating the ->flexible_group list due
11741          * to allocations, but we need to prevent rotation because
11742          * rotate_ctx() will change the list from interrupt context.
11743          */
11744         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11745         parent_ctx->rotate_disable = 1;
11746         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11747
11748         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
11749                 ret = inherit_task_group(event, parent, parent_ctx,
11750                                          child, ctxn, &inherited_all);
11751                 if (ret)
11752                         goto out_unlock;
11753         }
11754
11755         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11756         parent_ctx->rotate_disable = 0;
11757
11758         child_ctx = child->perf_event_ctxp[ctxn];
11759
11760         if (child_ctx && inherited_all) {
11761                 /*
11762                  * Mark the child context as a clone of the parent
11763                  * context, or of whatever the parent is a clone of.
11764                  *
11765                  * Note that if the parent is a clone, the holding of
11766                  * parent_ctx->lock avoids it from being uncloned.
11767                  */
11768                 cloned_ctx = parent_ctx->parent_ctx;
11769                 if (cloned_ctx) {
11770                         child_ctx->parent_ctx = cloned_ctx;
11771                         child_ctx->parent_gen = parent_ctx->parent_gen;
11772                 } else {
11773                         child_ctx->parent_ctx = parent_ctx;
11774                         child_ctx->parent_gen = parent_ctx->generation;
11775                 }
11776                 get_ctx(child_ctx->parent_ctx);
11777         }
11778
11779         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11780 out_unlock:
11781         mutex_unlock(&parent_ctx->mutex);
11782
11783         perf_unpin_context(parent_ctx);
11784         put_ctx(parent_ctx);
11785
11786         return ret;
11787 }
11788
11789 /*
11790  * Initialize the perf_event context in task_struct
11791  */
11792 int perf_event_init_task(struct task_struct *child)
11793 {
11794         int ctxn, ret;
11795
11796         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11797         mutex_init(&child->perf_event_mutex);
11798         INIT_LIST_HEAD(&child->perf_event_list);
11799
11800         for_each_task_context_nr(ctxn) {
11801                 ret = perf_event_init_context(child, ctxn);
11802                 if (ret) {
11803                         perf_event_free_task(child);
11804                         return ret;
11805                 }
11806         }
11807
11808         return 0;
11809 }
11810
11811 static void __init perf_event_init_all_cpus(void)
11812 {
11813         struct swevent_htable *swhash;
11814         int cpu;
11815
11816         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11817
11818         for_each_possible_cpu(cpu) {
11819                 swhash = &per_cpu(swevent_htable, cpu);
11820                 mutex_init(&swhash->hlist_mutex);
11821                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11822
11823                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11824                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11825
11826 #ifdef CONFIG_CGROUP_PERF
11827                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11828 #endif
11829                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11830         }
11831 }
11832
11833 void perf_swevent_init_cpu(unsigned int cpu)
11834 {
11835         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11836
11837         mutex_lock(&swhash->hlist_mutex);
11838         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11839                 struct swevent_hlist *hlist;
11840
11841                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11842                 WARN_ON(!hlist);
11843                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11844         }
11845         mutex_unlock(&swhash->hlist_mutex);
11846 }
11847
11848 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11849 static void __perf_event_exit_context(void *__info)
11850 {
11851         struct perf_event_context *ctx = __info;
11852         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11853         struct perf_event *event;
11854
11855         raw_spin_lock(&ctx->lock);
11856         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
11857         list_for_each_entry(event, &ctx->event_list, event_entry)
11858                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11859         raw_spin_unlock(&ctx->lock);
11860 }
11861
11862 static void perf_event_exit_cpu_context(int cpu)
11863 {
11864         struct perf_cpu_context *cpuctx;
11865         struct perf_event_context *ctx;
11866         struct pmu *pmu;
11867
11868         mutex_lock(&pmus_lock);
11869         list_for_each_entry(pmu, &pmus, entry) {
11870                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11871                 ctx = &cpuctx->ctx;
11872
11873                 mutex_lock(&ctx->mutex);
11874                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11875                 cpuctx->online = 0;
11876                 mutex_unlock(&ctx->mutex);
11877         }
11878         cpumask_clear_cpu(cpu, perf_online_mask);
11879         mutex_unlock(&pmus_lock);
11880 }
11881 #else
11882
/* Neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC_CORE: nothing to tear down. */
static void perf_event_exit_cpu_context(int cpu)
{
}
11884
11885 #endif
11886
11887 int perf_event_init_cpu(unsigned int cpu)
11888 {
11889         struct perf_cpu_context *cpuctx;
11890         struct perf_event_context *ctx;
11891         struct pmu *pmu;
11892
11893         perf_swevent_init_cpu(cpu);
11894
11895         mutex_lock(&pmus_lock);
11896         cpumask_set_cpu(cpu, perf_online_mask);
11897         list_for_each_entry(pmu, &pmus, entry) {
11898                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11899                 ctx = &cpuctx->ctx;
11900
11901                 mutex_lock(&ctx->mutex);
11902                 cpuctx->online = 1;
11903                 mutex_unlock(&ctx->mutex);
11904         }
11905         mutex_unlock(&pmus_lock);
11906
11907         return 0;
11908 }
11909
/*
 * Take perf down on @cpu by tearing down its CPU contexts.
 * Always returns 0.
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);

        return 0;
}
11915
11916 static int
11917 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11918 {
11919         int cpu;
11920
11921         for_each_online_cpu(cpu)
11922                 perf_event_exit_cpu(cpu);
11923
11924         return NOTIFY_OK;
11925 }
11926
11927 /*
11928  * Run the perf reboot notifier at the very last possible moment so that
11929  * the generic watchdog code runs as long as possible.
11930  */
11931 static struct notifier_block perf_reboot_notifier = {
11932         .notifier_call = perf_reboot,
11933         .priority = INT_MIN,
11934 };
11935
11936 void __init perf_event_init(void)
11937 {
11938         int ret;
11939
11940         idr_init(&pmu_idr);
11941
11942         perf_event_init_all_cpus();
11943         init_srcu_struct(&pmus_srcu);
11944         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11945         perf_pmu_register(&perf_cpu_clock, NULL, -1);
11946         perf_pmu_register(&perf_task_clock, NULL, -1);
11947         perf_tp_register();
11948         perf_event_init_cpu(smp_processor_id());
11949         register_reboot_notifier(&perf_reboot_notifier);
11950
11951         ret = init_hw_breakpoint();
11952         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11953
11954         /*
11955          * Build time assertion that we keep the data_head at the intended
11956          * location.  IOW, validation we got the __reserved[] size right.
11957          */
11958         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11959                      != 1024);
11960 }
11961
11962 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11963                               char *page)
11964 {
11965         struct perf_pmu_events_attr *pmu_attr =
11966                 container_of(attr, struct perf_pmu_events_attr, attr);
11967
11968         if (pmu_attr->event_str)
11969                 return sprintf(page, "%s\n", pmu_attr->event_str);
11970
11971         return 0;
11972 }
11973 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11974
11975 static int __init perf_event_sysfs_init(void)
11976 {
11977         struct pmu *pmu;
11978         int ret;
11979
11980         mutex_lock(&pmus_lock);
11981
11982         ret = bus_register(&pmu_bus);
11983         if (ret)
11984                 goto unlock;
11985
11986         list_for_each_entry(pmu, &pmus, entry) {
11987                 if (!pmu->name || pmu->type < 0)
11988                         continue;
11989
11990                 ret = pmu_dev_alloc(pmu);
11991                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11992         }
11993         pmu_bus_running = 1;
11994         ret = 0;
11995
11996 unlock:
11997         mutex_unlock(&pmus_lock);
11998
11999         return ret;
12000 }
12001 device_initcall(perf_event_sysfs_init);
12002
12003 #ifdef CONFIG_CGROUP_PERF
12004 static struct cgroup_subsys_state *
12005 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
12006 {
12007         struct perf_cgroup *jc;
12008
12009         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
12010         if (!jc)
12011                 return ERR_PTR(-ENOMEM);
12012
12013         jc->info = alloc_percpu(struct perf_cgroup_info);
12014         if (!jc->info) {
12015                 kfree(jc);
12016                 return ERR_PTR(-ENOMEM);
12017         }
12018
12019         return &jc->css;
12020 }
12021
12022 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
12023 {
12024         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
12025
12026         free_percpu(jc->info);
12027         kfree(jc);
12028 }
12029
12030 static int __perf_cgroup_move(void *info)
12031 {
12032         struct task_struct *task = info;
12033         rcu_read_lock();
12034         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
12035         rcu_read_unlock();
12036         return 0;
12037 }
12038
12039 static void perf_cgroup_attach(struct cgroup_taskset *tset)
12040 {
12041         struct task_struct *task;
12042         struct cgroup_subsys_state *css;
12043
12044         cgroup_taskset_for_each(task, css, tset)
12045                 task_function_call(task, __perf_cgroup_move, task);
12046 }
12047
12048 struct cgroup_subsys perf_event_cgrp_subsys = {
12049         .css_alloc      = perf_cgroup_css_alloc,
12050         .css_free       = perf_cgroup_css_free,
12051         .attach         = perf_cgroup_attach,
12052         /*
12053          * Implicitly enable on dfl hierarchy so that perf events can
12054          * always be filtered by cgroup2 path as long as perf_event
12055          * controller is not mounted on a legacy hierarchy.
12056          */
12057         .implicit_on_dfl = true,
12058         .threaded       = true,
12059 };
12060 #endif /* CONFIG_CGROUP_PERF */