asedeno.scripts.mit.edu Git - linux.git/commitdiff
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:42:50 +0000 (09:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:42:50 +0000 (09:42 -0700)
Pull scheduler fixes from Ingo Molnar:
 "Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
  cputime fix and two cpuacct cleanups"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/cpuacct: Simplify the cpuacct code
  sched/cpuacct: Rename parameter in cpuusage_write() for readability
  sched/fair: Add comments to explain select_idle_sibling()
  sched/fair: Fix fairness issue on migration
  sched/cgroup: Fix/cleanup cgroup teardown/init
  sched/cputime: Fix steal time accounting vs. CPU hotplug

kernel/sched/core.c
kernel/sched/cpuacct.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --combined kernel/sched/core.c
index 44db0fffa8be333c47d6aae90c3219a335cd5c22,2a87bdde8d4eb9c73afa9649b658479379c3117e..d8465eeab8b3d7878866dc1aee7a87bfcb3f1b5e
@@@ -73,7 -73,6 +73,7 @@@
  #include <linux/init_task.h>
  #include <linux/context_tracking.h>
  #include <linux/compiler.h>
 +#include <linux/frame.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@@ -2690,7 -2689,7 +2690,7 @@@ asmlinkage __visible void schedule_tail
  /*
   * context_switch - switch to the new MM and the new thread's register state.
   */
 -static inline struct rq *
 +static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
  {
@@@ -3175,7 -3174,7 +3175,7 @@@ static void __sched notrace __schedule(
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;
  
 -                              to_wakeup = wq_worker_sleeping(prev, cpu);
 +                              to_wakeup = wq_worker_sleeping(prev);
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
  
        balance_callback(rq);
  }
 +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
@@@ -5371,6 -5369,7 +5371,7 @@@ migration_call(struct notifier_block *n
  
        case CPU_UP_PREPARE:
                rq->calc_load_update = calc_load_update;
+               account_reset_rq(rq);
                break;
  
        case CPU_ONLINE:
@@@ -7537,7 -7536,7 +7538,7 @@@ void set_curr_task(int cpu, struct task
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
- static void free_sched_group(struct task_group *tg)
+ static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
@@@ -7563,7 -7562,7 +7564,7 @@@ struct task_group *sched_create_group(s
        return tg;
  
  err:
-       free_sched_group(tg);
+       sched_free_group(tg);
        return ERR_PTR(-ENOMEM);
  }
  
@@@ -7583,17 -7582,16 +7584,16 @@@ void sched_online_group(struct task_gro
  }
  
  /* rcu callback to free various structures associated with a task group */
- static void free_sched_group_rcu(struct rcu_head *rhp)
+ static void sched_free_group_rcu(struct rcu_head *rhp)
  {
        /* now it should be safe to free those cfs_rqs */
-       free_sched_group(container_of(rhp, struct task_group, rcu));
+       sched_free_group(container_of(rhp, struct task_group, rcu));
  }
  
- /* Destroy runqueue etc associated with a task group */
  void sched_destroy_group(struct task_group *tg)
  {
        /* wait for possible concurrent references to cfs_rqs complete */
-       call_rcu(&tg->rcu, free_sched_group_rcu);
+       call_rcu(&tg->rcu, sched_free_group_rcu);
  }
  
  void sched_offline_group(struct task_group *tg)
@@@ -8052,31 -8050,26 +8052,26 @@@ cpu_cgroup_css_alloc(struct cgroup_subs
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
  
+       sched_online_group(tg, parent);
        return &tg->css;
  }
  
- static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
  {
        struct task_group *tg = css_tg(css);
-       struct task_group *parent = css_tg(css->parent);
  
-       if (parent)
-               sched_online_group(tg, parent);
-       return 0;
+       sched_offline_group(tg);
  }
  
  static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
  {
        struct task_group *tg = css_tg(css);
  
-       sched_destroy_group(tg);
- }
- static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
- {
-       struct task_group *tg = css_tg(css);
-       sched_offline_group(tg);
+       /*
+        * Relies on the RCU grace period between css_released() and this.
+        */
+       sched_free_group(tg);
  }
  
  static void cpu_cgroup_fork(struct task_struct *task)
@@@ -8436,14 -8429,13 +8431,13 @@@ static struct cftype cpu_files[] = 
  
  struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
+       .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
-       .css_online     = cpu_cgroup_css_online,
-       .css_offline    = cpu_cgroup_css_offline,
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .legacy_cftypes = cpu_files,
 -      .early_init     = 1,
 +      .early_init     = true,
  };
  
  #endif        /* CONFIG_CGROUP_SCHED */
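
The cgroup teardown/init change above drops the css_online()/css_offline() callbacks: the task group is onlined directly in cpu_cgroup_css_alloc(), taken offline in the new cpu_cgroup_css_released() callback, and freed synchronously in cpu_cgroup_css_free(), relying on the RCU grace period the cgroup core inserts between css_released() and css_free() rather than a private call_rcu(). A toy user-space model of that ordering (simplified names and printf stubs, clearly not the kernel code) might look like:

/* Toy model of the new task_group lifecycle ordering (not kernel code). */
#include <stdio.h>

static void sched_online_group(const char *tg)  { printf("online  %s\n", tg); }
static void sched_offline_group(const char *tg) { printf("offline %s\n", tg); }
static void sched_free_group(const char *tg)    { printf("free    %s\n", tg); }

int main(void)
{
	const char *tg = "task_group A";

	/* css_alloc(): create the group and online it right away */
	sched_online_group(tg);

	/* css_released(): unlink the group while RCU readers may still see it */
	sched_offline_group(tg);

	/* the cgroup core guarantees an RCU grace period before css_free() */
	printf("-- RCU grace period --\n");

	/* css_free(): safe to free directly, no private call_rcu() needed */
	sched_free_group(tg);
	return 0;
}
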
diff --combined kernel/sched/cpuacct.c
index 2ddaebf7469a371444987c1b01acda5107093650,434c2fa4135239030affab33730da215d197be3e..4a811203c04a462478c85ab31d20ec2492e0f4b9
@@@ -145,13 -145,16 +145,16 @@@ static u64 cpuusage_read(struct cgroup_
  }
  
  static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-                         u64 reset)
+                         u64 val)
  {
        struct cpuacct *ca = css_ca(css);
        int err = 0;
        int i;
  
-       if (reset) {
+       /*
+        * Only allow '0' here to do a reset.
+        */
+       if (val) {
                err = -EINVAL;
                goto out;
        }
@@@ -235,23 -238,10 +238,10 @@@ static struct cftype files[] = 
  void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
-       int cpu;
-       cpu = task_cpu(tsk);
  
        rcu_read_lock();
-       ca = task_ca(tsk);
-       while (true) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-               ca = parent_ca(ca);
-               if (!ca)
-                       break;
-       }
+       for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+               *this_cpu_ptr(ca->cpuusage) += cputime;
        rcu_read_unlock();
  }
  
   *
   * Note: it's the caller that updates the account of the root cgroup.
   */
- void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
  {
-       struct kernel_cpustat *kcpustat;
        struct cpuacct *ca;
  
        rcu_read_lock();
-       ca = task_ca(p);
-       while (ca != &root_cpuacct) {
-               kcpustat = this_cpu_ptr(ca->cpustat);
-               kcpustat->cpustat[index] += val;
-               ca = parent_ca(ca);
-       }
+       for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+               this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
        rcu_read_unlock();
  }
  
@@@ -279,5 -264,5 +264,5 @@@ struct cgroup_subsys cpuacct_cgrp_subsy
        .css_alloc      = cpuacct_css_alloc,
        .css_free       = cpuacct_css_free,
        .legacy_cftypes = files,
 -      .early_init     = 1,
 +      .early_init     = true,
  };
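
The cpuacct cleanup above collapses the open-coded hierarchy walks in cpuacct_charge() and cpuacct_account_field() into for-loops that add the delta to the local CPU's counter at every level from the task's group up the hierarchy. A rough user-space sketch of that charging pattern (the toy types and NR_CPUS value are assumptions, not the kernel's):

/* User-space sketch of the hierarchical per-CPU charge walk (toy types). */
#include <stdio.h>

#define NR_CPUS 4

struct acct_group {
	struct acct_group *parent;              /* NULL at the root */
	unsigned long long cpuusage[NR_CPUS];
};

static void charge(struct acct_group *ag, int cpu, unsigned long long cputime)
{
	for (; ag; ag = ag->parent)             /* mirrors task_ca()/parent_ca() */
		ag->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct acct_group root = { 0 };
	struct acct_group child = { .parent = &root };

	charge(&child, 1, 1000);                /* charge 1000 ns on CPU 1 */
	printf("root=%llu child=%llu\n", root.cpuusage[1], child.cpuusage[1]);
	return 0;
}
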
diff --combined kernel/sched/fair.c
index 46d64e4ccfde8ad99014761c93ef3ceca1e122b2,303d6392b38953a2f64f05e301a3de6be86788dd..0fe30e66aff1db44d58ec96cbee332a78257e4d3
@@@ -2856,8 -2856,7 +2856,8 @@@ static inline void update_load_avg(stru
  {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
 -      int cpu = cpu_of(rq_of(cfs_rq));
 +      struct rq *rq = rq_of(cfs_rq);
 +      int cpu = cpu_of(rq);
  
        /*
         * Track task load average for carrying it to new CPU after migrated, and
  
        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
 +
 +      if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
 +              unsigned long max = rq->cpu_capacity_orig;
 +
 +              /*
 +               * There are a few boundary cases this might miss but it should
 +               * get called often enough that that should (hopefully) not be
 +               * a real problem -- added to that it only calls on the local
 +               * CPU, so if we enqueue remotely we'll miss an update, but
 +               * the next tick/schedule should update.
 +               *
 +               * It will not get called when we go idle, because the idle
 +               * thread is a different class (!fair), nor will the utilization
 +               * number include things like RT tasks.
 +               *
 +               * As is, the util number is not freq-invariant (we'd have to
 +               * implement arch_scale_freq_capacity() for that).
 +               *
 +               * See cpu_util().
 +               */
 +              cpufreq_update_util(rq_clock(rq),
 +                                  min(cfs_rq->avg.util_avg, max), max);
 +      }
  }
  
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@@ -3181,17 -3157,25 +3181,25 @@@ static inline void check_schedstat_requ
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
+       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+       bool curr = cfs_rq->curr == se;
        /*
-        * Update the normalized vruntime before updating min_vruntime
-        * through calling update_curr().
+        * If we're the current task, we must renormalise before calling
+        * update_curr().
         */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+       if (renorm && curr)
                se->vruntime += cfs_rq->min_vruntime;
  
+       update_curr(cfs_rq);
        /*
-        * Update run-time statistics of the 'current'.
+        * Otherwise, renormalise after, such that we're placed at the current
+        * moment in time, instead of some random moment in the past.
         */
-       update_curr(cfs_rq);
+       if (renorm && !curr)
+               se->vruntime += cfs_rq->min_vruntime;
        enqueue_entity_load_avg(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
                update_stats_enqueue(cfs_rq, se);
                check_spread(cfs_rq, se);
        }
-       if (se != cfs_rq->curr)
+       if (!curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
  
@@@ -5071,7 -5055,19 +5079,19 @@@ static int select_idle_sibling(struct t
                return i;
  
        /*
-        * Otherwise, iterate the domains and find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an eligible idle cpu.
+        *
+        * A completely idle sched group at higher domains is more
+        * desirable than an idle group at a lower level, because lower
+        * domains have smaller groups and usually share hardware
+        * resources which causes tasks to contend on them, e.g. x86
+        * hyperthread siblings in the lowest domain (SMT) can contend
+        * on the shared cpu pipeline.
+        *
+        * However, while we prefer idle groups at higher domains
+        * finding an idle cpu at the lowest domain is still better than
+        * returning 'target', which we've already established, isn't
+        * idle.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
        for_each_lower_domain(sd) {
                                                tsk_cpus_allowed(p)))
                                goto next;
  
+                       /* Ensure the entire group is idle */
                        for_each_cpu(i, sched_group_cpus(sg)) {
                                if (i == target || !idle_cpu(i))
                                        goto next;
                        }
  
+                       /*
+                        * It doesn't matter which cpu we pick, the
+                        * whole group is idle.
+                        */
                        target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
                        goto done;
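
The enqueue_entity() change above is the migration-fairness fix: a woken or migrated entity carries a vruntime relative to its old queue, and the renormalization against min_vruntime now happens before update_curr() only when the entity is the current task, and after it otherwise, so the entity is placed at the queue's current point in time. A tiny numeric illustration (the values are hypothetical) of why the ordering matters:

/* Hypothetical numbers showing the effect of renormalizing too early. */
#include <stdio.h>

int main(void)
{
	unsigned long long rel_vruntime = 5;    /* entity's relative vruntime */
	unsigned long long min_before   = 1000; /* min_vruntime before update_curr() */
	unsigned long long min_after    = 1040; /* min_vruntime after update_curr() */

	printf("renorm before update_curr(): %llu\n", rel_vruntime + min_before);
	printf("renorm after  update_curr(): %llu\n", rel_vruntime + min_after);
	/* The smaller value places the entity in the past and lets it run
	 * ahead of its peers, which is the !curr case the patch fixes. */
	return 0;
}
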
diff --combined kernel/sched/sched.h
index 382848a24ed92beefdf2d49b6f72b16f0a279ade,e6d4a3fa3660beb506a1066f43e6873830514114..ec2e8d23527e6c92a4fe1b5ef45dfb9ac1e242a8
@@@ -1794,50 -1794,15 +1794,63 @@@ static inline u64 irq_time_read(int cpu
  #endif /* CONFIG_64BIT */
  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  
 +#ifdef CONFIG_CPU_FREQ
 +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 +
 +/**
 + * cpufreq_update_util - Take a note about CPU utilization changes.
 + * @time: Current time.
 + * @util: Current utilization.
 + * @max: Utilization ceiling.
 + *
 + * This function is called by the scheduler on every invocation of
 + * update_load_avg() on the CPU whose utilization is being updated.
 + *
 + * It can only be called from RCU-sched read-side critical sections.
 + */
 +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
 +{
 +       struct update_util_data *data;
 +
 +       data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
 +       if (data)
 +               data->func(data, time, util, max);
 +}
 +
 +/**
 + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
 + * @time: Current time.
 + *
 + * The way cpufreq is currently arranged requires it to evaluate the CPU
 + * performance state (frequency/voltage) on a regular basis to prevent it from
 + * being stuck in a completely inadequate performance level for too long.
 + * That is not guaranteed to happen if the updates are only triggered from CFS,
 + * though, because they may not be coming in if RT or deadline tasks are active
 + * all the time (or there are RT and DL tasks only).
 + *
 + * As a workaround for that issue, this function is called by the RT and DL
 + * sched classes to trigger extra cpufreq updates to prevent it from stalling,
 + * but that really is a band-aid.  Going forward it should be replaced with
 + * solutions targeted more specifically at RT and DL tasks.
 + */
 +static inline void cpufreq_trigger_update(u64 time)
 +{
 +      cpufreq_update_util(time, ULONG_MAX, 0);
 +}
 +#else
 +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 +static inline void cpufreq_trigger_update(u64 time) {}
 +#endif /* CONFIG_CPU_FREQ */
++
+ static inline void account_reset_rq(struct rq *rq)
+ {
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       rq->prev_irq_time = 0;
+ #endif
+ #ifdef CONFIG_PARAVIRT
+       rq->prev_steal_time = 0;
+ #endif
+ #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       rq->prev_steal_time_rq = 0;
+ #endif
+ }
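
The new cpufreq_update_util() helper in sched.h dereferences a per-CPU struct update_util_data pointer and invokes its func(data, time, util, max) callback from an RCU-sched read-side section. A governor-style consumer would install one such callback per CPU; the sketch below matches the callback signature used above, but the registration helper cpufreq_set_update_util_data() and the header layout are assumptions taken from the companion cpufreq-side change, not from this diff.

/* Sketch of a hook consumer; helper name and headers are assumptions. */
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>

struct my_gov_cpu {
	struct update_util_data update_util;
	u64 last_update;
};

static DEFINE_PER_CPU(struct my_gov_cpu, my_gov_cpu);

/* Invoked via cpufreq_update_util() from update_load_avg(), and via
 * cpufreq_trigger_update() with util == ULONG_MAX for RT/DL nudges. */
static void my_gov_update(struct update_util_data *data, u64 time,
			  unsigned long util, unsigned long max)
{
	struct my_gov_cpu *gc = container_of(data, struct my_gov_cpu, update_util);

	gc->last_update = time;
	/* ...decide here whether to request a frequency change... */
}

static void my_gov_start_cpu(int cpu)
{
	struct my_gov_cpu *gc = &per_cpu(my_gov_cpu, cpu);

	gc->update_util.func = my_gov_update;
	cpufreq_set_update_util_data(cpu, &gc->update_util);	/* assumed helper */
}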