]> asedeno.scripts.mit.edu Git - linux.git/blobdiff - kernel/events/core.c
Merge branches 'pm-core', 'pm-qos', 'pm-domains' and 'pm-opp'
[linux.git] / kernel / events / core.c
index e235bb991bdd8fd0d1c7bef06c2f3b3dcaa4ddaf..77a932b54a64fbeb2640b35c1cc4c096994bf1d7 100644 (file)
@@ -355,6 +355,8 @@ enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_TIME = 0x4,
+       /* see ctx_resched() for details */
+       EVENT_CPU = 0x8,
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
        info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN       0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
        struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
+       struct list_head *list;
        unsigned long flags;
 
        /*
-        * disable interrupts to avoid geting nr_cgroup
-        * changes via __perf_event_disable(). Also
-        * avoids preemption.
+        * Disable interrupts and preemption to avoid this CPU's
+        * cgrp_cpuctx_entry to change under us.
         */
        local_irq_save(flags);
 
-       /*
-        * we reschedule only in the presence of cgroup
-        * constrained events.
-        */
+       list = this_cpu_ptr(&cgrp_cpuctx_list);
+       list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       continue; /* ensure we process each cpuctx once */
-
-               /*
-                * perf_cgroup_events says at least one
-                * context on this CPU has cgroup events.
-                *
-                * ctx->nr_cgroups reports the number of cgroup
-                * events for a context.
-                */
-               if (cpuctx->ctx.nr_cgroups > 0) {
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-                       perf_pmu_disable(cpuctx->ctx.pmu);
+               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+               perf_pmu_disable(cpuctx->ctx.pmu);
 
-                       if (mode & PERF_CGROUP_SWOUT) {
-                               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-                               /*
-                                * must not be done before ctxswout due
-                                * to event_filter_match() in event_sched_out()
-                                */
-                               cpuctx->cgrp = NULL;
-                       }
+               if (mode & PERF_CGROUP_SWOUT) {
+                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+                       /*
+                        * must not be done before ctxswout due
+                        * to event_filter_match() in event_sched_out()
+                        */
+                       cpuctx->cgrp = NULL;
+               }
 
-                       if (mode & PERF_CGROUP_SWIN) {
-                               WARN_ON_ONCE(cpuctx->cgrp);
-                               /*
-                                * set cgrp before ctxsw in to allow
-                                * event_filter_match() to not have to pass
-                                * task around
-                                * we pass the cpuctx->ctx to perf_cgroup_from_task()
-                                * because cgorup events are only per-cpu
-                                */
-                               cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-                               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-                       }
-                       perf_pmu_enable(cpuctx->ctx.pmu);
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               if (mode & PERF_CGROUP_SWIN) {
+                       WARN_ON_ONCE(cpuctx->cgrp);
+                       /*
+                        * set cgrp before ctxsw in to allow
+                        * event_filter_match() to not have to pass
+                        * task around
+                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
+                        * because cgorup events are only per-cpu
+                        */
+                       cpuctx->cgrp = perf_cgroup_from_task(task,
+                                                            &cpuctx->ctx);
+                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                }
+               perf_pmu_enable(cpuctx->ctx.pmu);
+               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }
 
        local_irq_restore(flags);
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
                         struct perf_event_context *ctx, bool add)
 {
        struct perf_cpu_context *cpuctx;
+       struct list_head *cpuctx_entry;
 
        if (!is_cgroup_event(event))
                return;
@@ -902,15 +892,16 @@ list_update_cgroup_event(struct perf_event *event,
         * this will always be called from the right CPU.
         */
        cpuctx = __get_cpu_context(ctx);
-
-       /*
-        * cpuctx->cgrp is NULL until a cgroup event is sched in or
-        * ctx->nr_cgroup == 0 .
-        */
-       if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-               cpuctx->cgrp = event->cgrp;
-       else if (!add)
+       cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+       /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+       if (add) {
+               list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+               if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+                       cpuctx->cgrp = event->cgrp;
+       } else {
+               list_del(cpuctx_entry);
                cpuctx->cgrp = NULL;
+       }
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader)
                update_event_times(event);
 }
 
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       enum event_type_t event_type;
+
+       lockdep_assert_held(&ctx->lock);
+
+       event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+       if (!ctx->task)
+               event_type |= EVENT_CPU;
+
+       return event_type;
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
             struct task_struct *task);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                              struct perf_event_context *ctx)
+                              struct perf_event_context *ctx,
+                              enum event_type_t event_type)
 {
        if (!cpuctx->task_ctx)
                return;
@@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
 
-       ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+       ctx_sched_out(ctx, cpuctx, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2249,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
+/*
+ * We want to maintain the following priority of scheduling:
+ *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ *  - task pinned (EVENT_PINNED)
+ *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ *  - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
-                       struct perf_event_context *task_ctx)
+                       struct perf_event_context *task_ctx,
+                       enum event_type_t event_type)
 {
+       enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+       bool cpu_event = !!(event_type & EVENT_CPU);
+
+       /*
+        * If pinned groups are involved, flexible groups also need to be
+        * scheduled out.
+        */
+       if (event_type & EVENT_PINNED)
+               event_type |= EVENT_FLEXIBLE;
+
        perf_pmu_disable(cpuctx->ctx.pmu);
        if (task_ctx)
-               task_ctx_sched_out(cpuctx, task_ctx);
-       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+               task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+       /*
+        * Decide which cpu ctx groups to schedule out based on the types
+        * of events that caused rescheduling:
+        *  - EVENT_CPU: schedule out corresponding groups;
+        *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+        *  - otherwise, do nothing more.
+        */
+       if (cpu_event)
+               cpu_ctx_sched_out(cpuctx, ctx_event_type);
+       else if (ctx_event_type & EVENT_PINNED)
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
        perf_event_sched_in(cpuctx, task_ctx, current);
        perf_pmu_enable(cpuctx->ctx.pmu);
 }
@@ -2302,7 +2346,7 @@ static int  __perf_install_in_context(void *info)
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
-               ctx_resched(cpuctx, task_ctx);
+               ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
                add_event_to_ctx(event, ctx);
        }
@@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
        if (ctx->task)
                WARN_ON_ONCE(task_ctx != ctx);
 
-       ctx_resched(cpuctx, task_ctx);
+       ctx_resched(cpuctx, task_ctx, get_event_type(event));
 }
 
 /*
@@ -2896,7 +2940,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
-               task_ctx_sched_out(cpuctx, ctx);
+               task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
                raw_spin_unlock(&ctx->lock);
        }
 }
@@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
                return;
 
        list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-               pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+               pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
 
                if (WARN_ON_ONCE(!pmu->sched_task))
                        continue;
@@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
+        *
+        * However, if task's ctx is not carrying any pinned
+        * events, no need to flip the cpuctx's events around.
         */
-       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (!list_empty(&ctx->pinned_groups))
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
        perf_ctx_unlock(cpuctx, ctx);
@@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
 static void perf_event_enable_on_exec(int ctxn)
 {
        struct perf_event_context *ctx, *clone_ctx = NULL;
+       enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
@@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
        cpuctx = __get_cpu_context(ctx);
        perf_ctx_lock(cpuctx, ctx);
        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-       list_for_each_entry(event, &ctx->event_list, event_entry)
+       list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
+               event_type |= get_event_type(event);
+       }
 
        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
-               ctx_resched(cpuctx, ctx);
+               ctx_resched(cpuctx, ctx, event_type);
        }
        perf_ctx_unlock(cpuctx, ctx);
 
@@ -8044,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        if (task == TASK_TOMBSTONE)
                return;
 
+       if (!ifh->nr_file_filters)
+               return;
+
        mm = get_task_mm(event->ctx->task);
        if (!mm)
                goto restart;
@@ -8214,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                 * attribute.
                 */
                if (state == IF_STATE_END) {
+                       ret = -EINVAL;
                        if (kernel && event->attr.exclude_kernel)
                                goto fail;
 
@@ -8221,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                if (!filename)
                                        goto fail;
 
+                               /*
+                                * For now, we only support file-based filters
+                                * in per-task events; doing so for CPU-wide
+                                * events requires additional context switching
+                                * trickery, since same object code will be
+                                * mapped at different virtual addresses in
+                                * different processes.
+                                */
+                               ret = -EOPNOTSUPP;
+                               if (!event->ctx->task)
+                                       goto fail_free_name;
+
                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW, &path);
                                if (ret)
@@ -8236,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                    !S_ISREG(filter->inode->i_mode))
                                        /* free_filters_list() will iput() */
                                        goto fail;
+
+                               event->addr_filters.nr_file_filters++;
                        }
 
                        /* ready to consume more filters */
@@ -8275,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;
 
-       /*
-        * For now, we only support filtering in per-task events; doing so
-        * for CPU-wide events requires additional context switching trickery,
-        * since same object code will be mapped at different virtual
-        * addresses in different processes.
-        */
-       if (!event->ctx->task)
-               return -EOPNOTSUPP;
-
        ret = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (ret)
-               return ret;
+               goto fail_clear_files;
 
        ret = event->pmu->addr_filters_validate(&filters);
-       if (ret) {
-               free_filters_list(&filters);
-               return ret;
-       }
+       if (ret)
+               goto fail_free_filters;
 
        /* remove existing filters, if any */
        perf_addr_filters_splice(event, &filters);
@@ -8300,6 +8358,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
        /* install new filters */
        perf_event_for_each_child(event, perf_event_addr_filters_apply);
 
+       return ret;
+
+fail_free_filters:
+       free_filters_list(&filters);
+
+fail_clear_files:
+       event->addr_filters.nr_file_filters = 0;
+
        return ret;
 }
 
@@ -8652,37 +8718,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
        return NULL;
 }
 
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-               if (cpuctx->unique_pmu == old_pmu)
-                       cpuctx->unique_pmu = pmu;
-       }
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-       struct pmu *i;
-
        mutex_lock(&pmus_lock);
-       /*
-        * Like a real lame refcount.
-        */
-       list_for_each_entry(i, &pmus, entry) {
-               if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-                       update_pmu_context(i, pmu);
-                       goto out;
-               }
-       }
-
        free_percpu(pmu->pmu_cpu_context);
-out:
        mutex_unlock(&pmus_lock);
 }
 
@@ -8886,8 +8925,6 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
                cpuctx->ctx.pmu = pmu;
 
                __perf_mux_hrtimer_init(cpuctx, cpu);
-
-               cpuctx->unique_pmu = pmu;
        }
 
 got_cpu_context:
@@ -9005,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
 
        idx = srcu_read_lock(&pmus_srcu);
 
+       /* Try parent's PMU first: */
+       if (event->parent && event->parent->pmu) {
+               pmu = event->parent->pmu;
+               ret = perf_try_init_event(pmu, event);
+               if (!ret)
+                       goto unlock;
+       }
+
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, event->attr.type);
        rcu_read_unlock();
@@ -10265,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
-       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
 
        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
@@ -10714,6 +10759,9 @@ static void __init perf_event_init_all_cpus(void)
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
 }