Merge branches 'pm-core', 'pm-qos', 'pm-domains' and 'pm-opp'

[linux.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index e235bb991bdd8fd0d1c7bef06c2f3b3dcaa4ddaf..77a932b54a64fbeb2640b35c1cc4c096994bf1d7 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -355,6 +355,8 @@ enum event_type_t {
         EVENT_FLEXIBLE = 0x1,
         EVENT_PINNED = 0x2,
         EVENT_TIME = 0x4,
+       /* see ctx_resched() for details */
+       EVENT_CPU = 0x8,
         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
  };
  
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
         info->timestamp = ctx->timestamp;
  }
  
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
  #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
  #define PERF_CGROUP_SWIN       0x2 /* cgroup switch in events based on task */
  
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
  static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
         struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
+       struct list_head *list;
         unsigned long flags;
  
         /*
-        * disable interrupts to avoid geting nr_cgroup
-        * changes via __perf_event_disable(). Also
-        * avoids preemption.
+        * Disable interrupts and preemption so that this CPU's
+        * cgrp_cpuctx_list cannot change under us.
          */
         local_irq_save(flags);
  
-       /*
-        * we reschedule only in the presence of cgroup
-        * constrained events.
-        */
+       list = this_cpu_ptr(&cgrp_cpuctx_list);
+       list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
  
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       continue; /* ensure we process each cpuctx once */
-
-               /*
-                * perf_cgroup_events says at least one
-                * context on this CPU has cgroup events.
-                *
-                * ctx->nr_cgroups reports the number of cgroup
-                * events for a context.
-                */
-               if (cpuctx->ctx.nr_cgroups > 0) {
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-                       perf_pmu_disable(cpuctx->ctx.pmu);
+               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+               perf_pmu_disable(cpuctx->ctx.pmu);
  
-                       if (mode & PERF_CGROUP_SWOUT) {
-                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-                               /*
-                                * must not be done before ctxswout due
-                                * to event_filter_match() in event_sched_out()
-                                */
-                               cpuctx->cgrp = NULL;
-                       }
+               if (mode & PERF_CGROUP_SWOUT) {
+                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+                       /*
+                        * must not be done before ctxswout due
+                        * to event_filter_match() in event_sched_out()
+                        */
+                       cpuctx->cgrp = NULL;
+               }
  
-                       if (mode & PERF_CGROUP_SWIN) {
-                               WARN_ON_ONCE(cpuctx->cgrp);
-                               /*
-                                * set cgrp before ctxsw in to allow
-                                * event_filter_match() to not have to pass
-                                * task around
-                                * we pass the cpuctx->ctx to perf_cgroup_from_task()
-                                * because cgorup events are only per-cpu
-                                */
-                               cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-                       }
-                       perf_pmu_enable(cpuctx->ctx.pmu);
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               if (mode & PERF_CGROUP_SWIN) {
+                       WARN_ON_ONCE(cpuctx->cgrp);
+                       /*
+                        * set cgrp before ctxsw in to allow
+                        * event_filter_match() to not have to pass
+                        * task around
+                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
+                        * because cgroup events are only per-cpu
+                        */
+                       cpuctx->cgrp = perf_cgroup_from_task(task,
+                                                            &cpuctx->ctx);
+                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                 }
+               perf_pmu_enable(cpuctx->ctx.pmu);
+               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
         }
  
         local_irq_restore(flags);
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
                          struct perf_event_context *ctx, bool add)
  {
         struct perf_cpu_context *cpuctx;
+       struct list_head *cpuctx_entry;
  
         if (!is_cgroup_event(event))
                 return;
@@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event,
          * this will always be called from the right CPU.
          */
         cpuctx = __get_cpu_context(ctx);
-
-       /*
-        * cpuctx->cgrp is NULL until a cgroup event is sched in or
-        * ctx->nr_cgroup == 0 .
-        */
-       if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-               cpuctx->cgrp = event->cgrp;
-       else if (!add)
+       cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+       /* cpuctx->cgrp is NULL unless a cgroup event is active on this CPU. */
+       if (add) {
+               list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+               if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+                       cpuctx->cgrp = event->cgrp;
+       } else {
+               list_del(cpuctx_entry);
                 cpuctx->cgrp = NULL;
+       }
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@ -1453,6 +1444,26 @@ static void update_group_times(struct perf_event *leader)
                 update_event_times(event);
  }
  
+/*
+ * Classify @event for ctx_resched(): EVENT_PINNED or EVENT_FLEXIBLE depending
+ * on attr.pinned, with EVENT_CPU OR-ed in when the event's context has no
+ * task attached. The caller must hold the context's lock.
+ */
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       enum event_type_t event_type = EVENT_FLEXIBLE;
+
+       lockdep_assert_held(&ctx->lock);
+
+       if (event->attr.pinned)
+               event_type = EVENT_PINNED;
+       if (!ctx->task)
+               event_type |= EVENT_CPU;
+
+       return event_type;
+}
+
  static struct list_head *
  ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
  {
@@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
              struct task_struct *task);
  
  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                              struct perf_event_context *ctx)
+                              struct perf_event_context *ctx,
+                              enum event_type_t event_type)
  {
         if (!cpuctx->task_ctx)
                 return;
@@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                 return;
  
-       ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+       ctx_sched_out(ctx, cpuctx, event_type);
  }
  
  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2249,13 +2255,57 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
  }
  
+/*
+ * We want to maintain the following priority of scheduling:
+ *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ *  - task pinned (EVENT_PINNED)
+ *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ *  - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
  static void ctx_resched(struct perf_cpu_context *cpuctx,
-                       struct perf_event_context *task_ctx)
+                       struct perf_event_context *task_ctx,
+                       enum event_type_t event_type)
  {
+       enum event_type_t ctx_event_type;
+       bool cpu_event = !!(event_type & EVENT_CPU);
+
+       /*
+        * If pinned groups are involved, flexible groups also need to be
+        * scheduled out.
+        */
+       if (event_type & EVENT_PINNED)
+               event_type |= EVENT_FLEXIBLE;
+
+       /*
+        * Derive the cpu ctx mask only after the widening above, so that a
+        * pinned CPU event also schedules out the cpu ctx's flexible groups.
+        */
+       ctx_event_type = event_type & EVENT_ALL;
+
         perf_pmu_disable(cpuctx->ctx.pmu);
         if (task_ctx)
-               task_ctx_sched_out(cpuctx, task_ctx);
-       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+               task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+       /*
+        * Decide which cpu ctx groups to schedule out based on the types
+        * of events that caused rescheduling:
+        *  - EVENT_CPU: schedule out corresponding groups;
+        *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+        *  - otherwise, do nothing more.
+        */
+       if (cpu_event)
+               cpu_ctx_sched_out(cpuctx, ctx_event_type);
+       else if (ctx_event_type & EVENT_PINNED)
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
         perf_event_sched_in(cpuctx, task_ctx, current);
         perf_pmu_enable(cpuctx->ctx.pmu);
  }
@@ -2302,7 +2346,7 @@ static int  __perf_install_in_context(void *info)
         if (reprogram) {
                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                 add_event_to_ctx(event, ctx);
-               ctx_resched(cpuctx, task_ctx);
+               ctx_resched(cpuctx, task_ctx, get_event_type(event));
         } else {
                 add_event_to_ctx(event, ctx);
         }
@@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
         if (ctx->task)
                 WARN_ON_ONCE(task_ctx != ctx);
  
-       ctx_resched(cpuctx, task_ctx);
+       ctx_resched(cpuctx, task_ctx, get_event_type(event));
  }
  
  /*
@@ -2896,7 +2940,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
  
         if (do_switch) {
                 raw_spin_lock(&ctx->lock);
-               task_ctx_sched_out(cpuctx, ctx);
+               task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
                 raw_spin_unlock(&ctx->lock);
         }
  }
@@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
                 return;
  
         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-               pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+               pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
  
                 if (WARN_ON_ONCE(!pmu->sched_task))
                         continue;
@@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
          * We want to keep the following priority order:
          * cpu pinned (that don't need to move), task pinned,
          * cpu flexible, task flexible.
+        *
+        * However, if task's ctx is not carrying any pinned
+        * events, no need to flip the cpuctx's events around.
          */
-       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (!list_empty(&ctx->pinned_groups))
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
         perf_event_sched_in(cpuctx, ctx, task);
         perf_pmu_enable(ctx->pmu);
         perf_ctx_unlock(cpuctx, ctx);
@@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
  static void perf_event_enable_on_exec(int ctxn)
  {
         struct perf_event_context *ctx, *clone_ctx = NULL;
+       enum event_type_t event_type = 0;
         struct perf_cpu_context *cpuctx;
         struct perf_event *event;
         unsigned long flags;
@@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
         cpuctx = __get_cpu_context(ctx);
         perf_ctx_lock(cpuctx, ctx);
         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-       list_for_each_entry(event, &ctx->event_list, event_entry)
+       list_for_each_entry(event, &ctx->event_list, event_entry) {
                 enabled |= event_enable_on_exec(event, ctx);
+               event_type |= get_event_type(event);
+       }
  
         /*
          * Unclone and reschedule this context if we enabled any event.
          */
         if (enabled) {
                 clone_ctx = unclone_ctx(ctx);
-               ctx_resched(cpuctx, ctx);
+               ctx_resched(cpuctx, ctx, event_type);
         }
         perf_ctx_unlock(cpuctx, ctx);
  
@@ -8044,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
         if (task == TASK_TOMBSTONE)
                 return;
  
+       if (!ifh->nr_file_filters)
+               return;
+
         mm = get_task_mm(event->ctx->task);
         if (!mm)
                 goto restart;
@@ -8214,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                  * attribute.
                  */
                 if (state == IF_STATE_END) {
+                       ret = -EINVAL;
                         if (kernel && event->attr.exclude_kernel)
                                 goto fail;
  
@@ -8221,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                 if (!filename)
                                         goto fail;
  
+                               /*
+                                * For now, we only support file-based filters
+                                * in per-task events; doing so for CPU-wide
+                                * events requires additional context switching
+                                * trickery, since same object code will be
+                                * mapped at different virtual addresses in
+                                * different processes.
+                                */
+                               ret = -EOPNOTSUPP;
+                               if (!event->ctx->task)
+                                       goto fail_free_name;
+
                                 /* look up the path and grab its inode */
                                 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
                                 if (ret)
@@ -8236,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                     !S_ISREG(filter->inode->i_mode))
                                         /* free_filters_list() will iput() */
                                         goto fail;
+
+                               event->addr_filters.nr_file_filters++;
                         }
  
                         /* ready to consume more filters */
@@ -8275,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
         if (WARN_ON_ONCE(event->parent))
                 return -EINVAL;
  
-       /*
-        * For now, we only support filtering in per-task events; doing so
-        * for CPU-wide events requires additional context switching trickery,
-        * since same object code will be mapped at different virtual
-        * addresses in different processes.
-        */
-       if (!event->ctx->task)
-               return -EOPNOTSUPP;
-
         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
         if (ret)
-               return ret;
+               goto fail_clear_files;
  
         ret = event->pmu->addr_filters_validate(&filters);
-       if (ret) {
-               free_filters_list(&filters);
-               return ret;
-       }
+       if (ret)
+               goto fail_free_filters;
  
         /* remove existing filters, if any */
         perf_addr_filters_splice(event, &filters);
@@ -8300,6 +8358,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
         /* install new filters */
         perf_event_for_each_child(event, perf_event_addr_filters_apply);
  
+       return ret;
+
+fail_free_filters:
+       free_filters_list(&filters);
+
+fail_clear_files:
+       event->addr_filters.nr_file_filters = 0;
+
         return ret;
  }
  
@@ -8652,37 +8718,23 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
         return NULL;
  }
  
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-               if (cpuctx->unique_pmu == old_pmu)
-                       cpuctx->unique_pmu = pmu;
-       }
-}
-
  static void free_pmu_context(struct pmu *pmu)
  {
         struct pmu *i;
  
         mutex_lock(&pmus_lock);
-       /*
-        * Like a real lame refcount.
-        */
+       /*
+        * Poor man's refcount: if any PMU on the pmus list still uses this
+        * per-cpu context, leave it allocated instead of freeing it from
+        * under that PMU.
+        */
         list_for_each_entry(i, &pmus, entry) {
-               if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-                       update_pmu_context(i, pmu);
-                       goto out;
-               }
+               if (i->pmu_cpu_context == pmu->pmu_cpu_context)
+                       goto out;
         }
  
         free_percpu(pmu->pmu_cpu_context);
  out:
         mutex_unlock(&pmus_lock);
  }
  
@@ -8886,8 +8925,6 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
                 cpuctx->ctx.pmu = pmu;
  
                 __perf_mux_hrtimer_init(cpuctx, cpu);
-
-               cpuctx->unique_pmu = pmu;
         }
  
  got_cpu_context:
@@ -9005,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
  
         idx = srcu_read_lock(&pmus_srcu);
  
+       /* Try parent's PMU first: */
+       if (event->parent && event->parent->pmu) {
+               pmu = event->parent->pmu;
+               ret = perf_try_init_event(pmu, event);
+               if (!ret)
+                       goto unlock;
+       }
+
         rcu_read_lock();
         pmu = idr_find(&pmu_idr, event->attr.type);
         rcu_read_unlock();
@@ -10265,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
          * in.
          */
         raw_spin_lock_irq(&child_ctx->lock);
-       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
  
         /*
          * Now that the context is inactive, destroy the task <-> ctx relation
@@ -10714,6 +10759,9 @@ static void __init perf_event_init_all_cpus(void)
                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
  
+#ifdef CONFIG_CGROUP_PERF
+               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
         }
  }