asedeno.scripts.mit.edu Git - linux.git/commitdiff
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:42:50 +0000 (09:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:42:50 +0000 (09:42 -0700)
Pull scheduler fixes from Ingo Molnar:
 "Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
  cputime fix and two cpuacct cleanups"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/cpuacct: Simplify the cpuacct code
  sched/cpuacct: Rename parameter in cpuusage_write() for readability
  sched/fair: Add comments to explain select_idle_sibling()
  sched/fair: Fix fairness issue on migration
  sched/cgroup: Fix/cleanup cgroup teardown/init
  sched/cputime: Fix steal time accounting vs. CPU hotplug

kernel/sched/core.c
kernel/sched/cpuacct.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --combined kernel/sched/core.c
index 44db0fffa8be333c47d6aae90c3219a335cd5c22,2a87bdde8d4eb9c73afa9649b658479379c3117e..d8465eeab8b3d7878866dc1aee7a87bfcb3f1b5e
@@@ -73,7 -73,6 +73,7 @@@
  #include <linux/init_task.h>
  #include <linux/context_tracking.h>
  #include <linux/compiler.h>
 +#include <linux/frame.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@@ -2690,7 -2689,7 +2690,7 @@@ asmlinkage __visible void schedule_tail
  /*
   * context_switch - switch to the new MM and the new thread's register state.
   */
 -static inline struct rq *
 +static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
  {
@@@ -3175,7 -3174,7 +3175,7 @@@ static void __sched notrace __schedule(
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;
  
 -                              to_wakeup = wq_worker_sleeping(prev, cpu);
 +                              to_wakeup = wq_worker_sleeping(prev);
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
  
        balance_callback(rq);
  }
 +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
@@@ -5371,6 -5369,7 +5371,7 @@@ migration_call(struct notifier_block *n
  
        case CPU_UP_PREPARE:
                rq->calc_load_update = calc_load_update;
+               account_reset_rq(rq);
                break;
  
        case CPU_ONLINE:
@@@ -7537,7 -7536,7 +7538,7 @@@ void set_curr_task(int cpu, struct task
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
- static void free_sched_group(struct task_group *tg)
+ static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
@@@ -7563,7 -7562,7 +7564,7 @@@ struct task_group *sched_create_group(s
        return tg;
  
  err:
-       free_sched_group(tg);
+       sched_free_group(tg);
        return ERR_PTR(-ENOMEM);
  }
  
@@@ -7583,17 -7582,16 +7584,16 @@@ void sched_online_group(struct task_gro
  }
  
  /* rcu callback to free various structures associated with a task group */
- static void free_sched_group_rcu(struct rcu_head *rhp)
+ static void sched_free_group_rcu(struct rcu_head *rhp)
  {
        /* now it should be safe to free those cfs_rqs */
-       free_sched_group(container_of(rhp, struct task_group, rcu));
+       sched_free_group(container_of(rhp, struct task_group, rcu));
  }
  
- /* Destroy runqueue etc associated with a task group */
  void sched_destroy_group(struct task_group *tg)
  {
        /* wait for possible concurrent references to cfs_rqs complete */
-       call_rcu(&tg->rcu, free_sched_group_rcu);
+       call_rcu(&tg->rcu, sched_free_group_rcu);
  }
  
  void sched_offline_group(struct task_group *tg)
@@@ -8052,31 -8050,26 +8052,26 @@@ cpu_cgroup_css_alloc(struct cgroup_subs
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
  
+       sched_online_group(tg, parent);
        return &tg->css;
  }
  
- static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
  {
        struct task_group *tg = css_tg(css);
-       struct task_group *parent = css_tg(css->parent);
  
-       if (parent)
-               sched_online_group(tg, parent);
-       return 0;
+       sched_offline_group(tg);
  }
  
  static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
  {
        struct task_group *tg = css_tg(css);
  
-       sched_destroy_group(tg);
- }
- static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
- {
-       struct task_group *tg = css_tg(css);
-       sched_offline_group(tg);
+       /*
+        * Relies on the RCU grace period between css_released() and this.
+        */
+       sched_free_group(tg);
  }
  
  static void cpu_cgroup_fork(struct task_struct *task)
@@@ -8436,14 -8429,13 +8431,13 @@@ static struct cftype cpu_files[] = 
  
  struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
+       .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
-       .css_online     = cpu_cgroup_css_online,
-       .css_offline    = cpu_cgroup_css_offline,
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .legacy_cftypes = cpu_files,
 -      .early_init     = 1,
 +      .early_init     = true,
  };
  
  #endif        /* CONFIG_CGROUP_SCHED */
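
The cgroup teardown/init change above drops the css_online()/css_offline() callbacks: the task group is onlined directly in cpu_cgroup_css_alloc(), taken offline in the new cpu_cgroup_css_released() callback, and freed synchronously in cpu_cgroup_css_free(), relying on the RCU grace period the cgroup core inserts between css_released() and css_free() rather than a private call_rcu(). A toy user-space model of that ordering (simplified names and printf stubs, clearly not the kernel code) might look like:

/* Toy model of the new task_group lifecycle ordering (not kernel code). */
#include <stdio.h>

static void sched_online_group(const char *tg)  { printf("online  %s\n", tg); }
static void sched_offline_group(const char *tg) { printf("offline %s\n", tg); }
static void sched_free_group(const char *tg)    { printf("free    %s\n", tg); }

int main(void)
{
	const char *tg = "task_group A";

	/* css_alloc(): create the group and online it right away */
	sched_online_group(tg);

	/* css_released(): unlink the group while RCU readers may still see it */
	sched_offline_group(tg);

	/* the cgroup core guarantees an RCU grace period before css_free() */
	printf("-- RCU grace period --\n");

	/* css_free(): safe to free directly, no private call_rcu() needed */
	sched_free_group(tg);
	return 0;
}
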
diff --combined kernel/sched/cpuacct.c
index 2ddaebf7469a371444987c1b01acda5107093650,434c2fa4135239030affab33730da215d197be3e..4a811203c04a462478c85ab31d20ec2492e0f4b9
@@@ -145,13 -145,16 +145,16 @@@ static u64 cpuusage_read(struct cgroup_
  }
  
  static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-                         u64 reset)
+                         u64 val)
  {
        struct cpuacct *ca = css_ca(css);
        int err = 0;
        int i;
  
-       if (reset) {
+       /*
+        * Only allow '0' here to do a reset.
+        */
+       if (val) {
                err = -EINVAL;
                goto out;
        }
@@@ -235,23 -238,10 +238,10 @@@ static struct cftype files[] = 
  void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
-       int cpu;
-       cpu = task_cpu(tsk);
  
        rcu_read_lock();
-       ca = task_ca(tsk);
-       while (true) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-               ca = parent_ca(ca);
-               if (!ca)
-                       break;
-       }
+       for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+               *this_cpu_ptr(ca->cpuusage) += cputime;
        rcu_read_unlock();
  }
  
   *
   * Note: it's the caller that updates the account of the root cgroup.
   */
- void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
  {
-       struct kernel_cpustat *kcpustat;
        struct cpuacct *ca;
  
        rcu_read_lock();
-       ca = task_ca(p);
-       while (ca != &root_cpuacct) {
-               kcpustat = this_cpu_ptr(ca->cpustat);
-               kcpustat->cpustat[index] += val;
-               ca = parent_ca(ca);
-       }
+       for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+               this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
        rcu_read_unlock();
  }
  
@@@ -279,5 -264,5 +264,5 @@@ struct cgroup_subsys cpuacct_cgrp_subsy
        .css_alloc      = cpuacct_css_alloc,
        .css_free       = cpuacct_css_free,
        .legacy_cftypes = files,
 -      .early_init     = 1,
 +      .early_init     = true,
  };
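
The cpuacct cleanup above collapses the open-coded hierarchy walks in cpuacct_charge() and cpuacct_account_field() into for-loops that add the delta to the local CPU's counter at every level from the task's group up the hierarchy. A rough user-space sketch of that charging pattern (the toy types and NR_CPUS value are assumptions, not the kernel's):

/* User-space sketch of the hierarchical per-CPU charge walk (toy types). */
#include <stdio.h>

#define NR_CPUS 4

struct acct_group {
	struct acct_group *parent;              /* NULL at the root */
	unsigned long long cpuusage[NR_CPUS];
};

static void charge(struct acct_group *ag, int cpu, unsigned long long cputime)
{
	for (; ag; ag = ag->parent)             /* mirrors task_ca()/parent_ca() */
		ag->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct acct_group root = { 0 };
	struct acct_group child = { .parent = &root };

	charge(&child, 1, 1000);                /* charge 1000 ns on CPU 1 */
	printf("root=%llu child=%llu\n", root.cpuusage[1], child.cpuusage[1]);
	return 0;
}
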
diff --combined kernel/sched/fair.c
index 46d64e4ccfde8ad99014761c93ef3ceca1e122b2,303d6392b38953a2f64f05e301a3de6be86788dd..0fe30e66aff1db44d58ec96cbee332a78257e4d3
@@@ -2856,8 -2856,7 +2856,8 @@@ static inline void update_load_avg(stru
  {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
 -      int cpu = cpu_of(rq_of(cfs_rq));
 +      struct rq *rq = rq_of(cfs_rq);
 +      int cpu = cpu_of(rq);
  
        /*
         * Track task load average for carrying it to new CPU after migrated, and
  
        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
 +
 +      if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
 +              unsigned long max = rq->cpu_capacity_orig;
 +
 +              /*
 +               * There are a few boundary cases this might miss but it should
 +               * get called often enough that that should (hopefully) not be
 +               * a real problem -- added to that it only calls on the local
 +               * CPU, so if we enqueue remotely we'll miss an update, but
 +               * the next tick/schedule should update.
 +               *
 +               * It will not get called when we go idle, because the idle
 +               * thread is a different class (!fair), nor will the utilization
 +               * number include things like RT tasks.
 +               *
 +               * As is, the util number is not freq-invariant (we'd have to
 +               * implement arch_scale_freq_capacity() for that).
 +               *
 +               * See cpu_util().
 +               */
 +              cpufreq_update_util(rq_clock(rq),
 +                                  min(cfs_rq->avg.util_avg, max), max);
 +      }
  }
  
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@@ -3181,17 -3157,25 +3181,25 @@@ static inline void check_schedstat_requ
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
+       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+       bool curr = cfs_rq->curr == se;
        /*
-        * Update the normalized vruntime before updating min_vruntime
-        * through calling update_curr().
+        * If we're the current task, we must renormalise before calling
+        * update_curr().
         */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+       if (renorm && curr)
                se->vruntime += cfs_rq->min_vruntime;
  
+       update_curr(cfs_rq);
        /*
-        * Update run-time statistics of the 'current'.
+        * Otherwise, renormalise after, such that we're placed at the current
+        * moment in time, instead of some random moment in the past.
         */
-       update_curr(cfs_rq);
+       if (renorm && !curr)
+               se->vruntime += cfs_rq->min_vruntime;
        enqueue_entity_load_avg(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
                update_stats_enqueue(cfs_rq, se);
                check_spread(cfs_rq, se);
        }
-       if (se != cfs_rq->curr)
+       if (!curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
  
@@@ -5071,7 -5055,19 +5079,19 @@@ static int select_idle_sibling(struct t
                return i;
  
        /*
-        * Otherwise, iterate the domains and find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an eligible idle cpu.
+        *
+        * A completely idle sched group at higher domains is more
+        * desirable than an idle group at a lower level, because lower
+        * domains have smaller groups and usually share hardware
+        * resources which causes tasks to contend on them, e.g. x86
+        * hyperthread siblings in the lowest domain (SMT) can contend
+        * on the shared cpu pipeline.
+        *
+        * However, while we prefer idle groups at higher domains
+        * finding an idle cpu at the lowest domain is still better than
+        * returning 'target', which we've already established, isn't
+        * idle.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
        for_each_lower_domain(sd) {
                                                tsk_cpus_allowed(p)))
                                goto next;
  
+                       /* Ensure the entire group is idle */
                        for_each_cpu(i, sched_group_cpus(sg)) {
                                if (i == target || !idle_cpu(i))
                                        goto next;
                        }
  
+                       /*
+                        * It doesn't matter which cpu we pick, the
+                        * whole group is idle.
+                        */
                        target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
                        goto done;
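
The enqueue_entity() change above is the migration-fairness fix: a woken or migrated entity carries a vruntime relative to its old queue, and the renormalization against min_vruntime now happens before update_curr() only when the entity is the current task, and after it otherwise, so the entity is placed at the queue's current point in time. A tiny numeric illustration (the values are hypothetical) of why the ordering matters:

/* Hypothetical numbers showing the effect of renormalizing too early. */
#include <stdio.h>

int main(void)
{
	unsigned long long rel_vruntime = 5;    /* entity's relative vruntime */
	unsigned long long min_before   = 1000; /* min_vruntime before update_curr() */
	unsigned long long min_after    = 1040; /* min_vruntime after update_curr() */

	printf("renorm before update_curr(): %llu\n", rel_vruntime + min_before);
	printf("renorm after  update_curr(): %llu\n", rel_vruntime + min_after);
	/* The smaller value places the entity in the past and lets it run
	 * ahead of its peers, which is the !curr case the patch fixes. */
	return 0;
}
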
diff --combined kernel/sched/sched.h
index 382848a24ed92beefdf2d49b6f72b16f0a279ade,e6d4a3fa3660beb506a1066f43e6873830514114..ec2e8d23527e6c92a4fe1b5ef45dfb9ac1e242a8
@@@ -1794,50 -1794,15 +1794,63 @@@ static inline u64 irq_time_read(int cpu
  #endif /* CONFIG_64BIT */
  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  
 +#ifdef CONFIG_CPU_FREQ
 +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 +
 +/**
 + * cpufreq_update_util - Take a note about CPU utilization changes.
 + * @time: Current time.
 + * @util: Current utilization.
 + * @max: Utilization ceiling.
 + *
 + * This function is called by the scheduler on every invocation of
 + * update_load_avg() on the CPU whose utilization is being updated.
 + *
 + * It can only be called from RCU-sched read-side critical sections.
 + */
 +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
 +{
 +       struct update_util_data *data;
 +
 +       data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
 +       if (data)
 +               data->func(data, time, util, max);
 +}
 +
 +/**
 + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
 + * @time: Current time.
 + *
 + * The way cpufreq is currently arranged requires it to evaluate the CPU
 + * performance state (frequency/voltage) on a regular basis to prevent it from
 + * being stuck in a completely inadequate performance level for too long.
 + * That is not guaranteed to happen if the updates are only triggered from CFS,
 + * though, because they may not be coming in if RT or deadline tasks are active
 + * all the time (or there are RT and DL tasks only).
 + *
 + * As a workaround for that issue, this function is called by the RT and DL
 + * sched classes to trigger extra cpufreq updates to prevent it from stalling,
 + * but that really is a band-aid.  Going forward it should be replaced with
 + * solutions targeted more specifically at RT and DL tasks.
 + */
 +static inline void cpufreq_trigger_update(u64 time)
 +{
 +      cpufreq_update_util(time, ULONG_MAX, 0);
 +}
 +#else
 +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 +static inline void cpufreq_trigger_update(u64 time) {}
 +#endif /* CONFIG_CPU_FREQ */
++
+ static inline void account_reset_rq(struct rq *rq)
+ {
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       rq->prev_irq_time = 0;
+ #endif
+ #ifdef CONFIG_PARAVIRT
+       rq->prev_steal_time = 0;
+ #endif
+ #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       rq->prev_steal_time_rq = 0;
+ #endif
+ }
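
The new cpufreq_update_util() helper in sched.h dereferences a per-CPU struct update_util_data pointer and invokes its func(data, time, util, max) callback from an RCU-sched read-side section. A governor-style consumer would install one such callback per CPU; the sketch below matches the callback signature used above, but the registration helper cpufreq_set_update_util_data() and the header layout are assumptions taken from the companion cpufreq-side change, not from this diff.

/* Sketch of a hook consumer; helper name and headers are assumptions. */
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>

struct my_gov_cpu {
	struct update_util_data update_util;
	u64 last_update;
};

static DEFINE_PER_CPU(struct my_gov_cpu, my_gov_cpu);

/* Invoked via cpufreq_update_util() from update_load_avg(), and via
 * cpufreq_trigger_update() with util == ULONG_MAX for RT/DL nudges. */
static void my_gov_update(struct update_util_data *data, u64 time,
			  unsigned long util, unsigned long max)
{
	struct my_gov_cpu *gc = container_of(data, struct my_gov_cpu, update_util);

	gc->last_update = time;
	/* ...decide here whether to request a frequency change... */
}

static void my_gov_start_cpu(int cpu)
{
	struct my_gov_cpu *gc = &per_cpu(my_gov_cpu, cpu);

	gc->update_util.func = my_gov_update;
	cpufreq_set_update_util_data(cpu, &gc->update_util);	/* assumed helper */
}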