Merge branches 'pm-core', 'pm-qos', 'pm-domains' and 'pm-opp'
[linux.git] / kernel / sched / fair.c
index c242944f5cbd560223ce91990ca92666460110bf..274c747a01ce4862307f4a97286db68e6a753824 100644 (file)
@@ -37,7 +37,6 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
  *
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency                      = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency           = 6000000ULL;
 
 /*
  * The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  *
  * Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
-       = SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
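
The three scaling policies above control how the normalized_sysctl_* values
are turned into the effective sysctl values as the CPU count changes (the
kernel does this around get_update_sysctl_factor()). A minimal userspace
sketch of that arithmetic, assuming a plain integer ilog2; ilog2_u(),
scale_tunable() and the enum names are illustrative only:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

enum scaling_mode { SCALE_NONE, SCALE_LOG, SCALE_LINEAR };

static unsigned long long scale_tunable(unsigned long long normalized,
					enum scaling_mode mode,
					unsigned int ncpus)
{
	switch (mode) {
	case SCALE_NONE:
		return normalized;				/* always *1 */
	case SCALE_LOG:
		return normalized * (1 + ilog2_u(ncpus));	/* *(1+ilog(ncpus)) */
	case SCALE_LINEAR:
		return normalized * ncpus;			/* *ncpus */
	}
	return normalized;
}

int main(void)
{
	/* 6 msec normalized latency, 8 CPUs, LOG scaling -> 24 msec */
	printf("%llu\n", scale_tunable(6000000ULL, SCALE_LOG, 8));
	return 0;
}

With LOG scaling on an 8-CPU machine the 6 msec normalized latency becomes
24 msec, which is what the "6ms * (1 + ilog(ncpus))" default above works
out to.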
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
+ *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity              = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity   = 750000ULL;
 
 /*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;
 
@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity           = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity        = 1000000UL;
 
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
+#ifdef CONFIG_SMP
 /*
- * The exponential sliding  window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
  */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+       return -cpu;
+}
+#endif
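
With this weak default, the priority comparison used by the asym-packing
code later in this file (sched_asym_prefer(), added to sched.h by the same
series) boils down to preferring the lower-numbered CPU. A standalone
sketch, assuming that helper's shape:

#include <stdbool.h>
#include <stdio.h>

static int arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/* "does CPU a have higher asym-packing priority than CPU b?" */
static bool sched_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}

int main(void)
{
	printf("%d\n", sched_asym_prefer(0, 3));	/* 1: CPU0 preferred over CPU3 */
	printf("%d\n", sched_asym_prefer(3, 0));	/* 0 */
	return 0;
}

An architecture can override the __weak arch_asym_cpu_priority() to express
a different ordering; the generic code only ever consults it through
sched_asym_prefer().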
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  * to consumption or the quota being specified to be smaller than the slice)
  * we will always only issue the remaining available time.
  *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
 #endif
 
 /*
  * The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
  */
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin                           = 1280;
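
A minimal sketch of the check this margin feeds into (fits() is an
illustrative name, not a kernel function): with capacity_margin = 1280,
utilization has to stay roughly 20% below the capacity for the comparison
to hold.

#include <stdbool.h>
#include <stdio.h>

static bool fits(unsigned long util, unsigned long capacity,
		 unsigned long margin)
{
	return util * margin < capacity * 1024;
}

int main(void)
{
	/* ~78% utilization still fits a full-capacity (1024) CPU ... */
	printf("%d\n", fits(800, 1024, 1280));	/* 1: 1024000 < 1048576 */
	/* ... ~83% no longer does */
	printf("%d\n", fits(850, 1024, 1280));	/* 0: 1088000 > 1048576 */
	return 0;
}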
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        if (!cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+               int cpu = cpu_of(rq);
                /*
                 * Ensure we either appear before our parent (if already
                 * enqueued) or force our parent to appear after us when it is
-                * enqueued.  The fact that we always enqueue bottom-up
-                * reduces this to two cases.
+                * enqueued. The fact that we always enqueue bottom-up
+                * reduces this to two cases and a special case for the root
+                * cfs_rq. Furthermore, it also means that we will always reset
+                * tmp_alone_branch either when the branch is connected
+                * to a tree or when we reach the beginning of the tree.
                 */
                if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               } else {
+                   cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+                       /*
+                        * If parent is already on the list, we add the child
+                        * just before it. Thanks to the circular linked
+                        * property of the list, this puts the child at the
+                        * tail of the list that starts with the parent.
+                        */
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+                       /*
+                        * The branch is now connected to its tree so we can
+                        * reset tmp_alone_branch to the beginning of the
+                        * list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else if (!cfs_rq->tg->parent) {
+                       /*
+                        * A cfs_rq without a parent should be put
+                        * at the tail of the list.
+                        */
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+                               &rq->leaf_cfs_rq_list);
+                       /*
+                        * We have reached the beginning of a tree so we can
+                        * reset tmp_alone_branch to the beginning of the list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else {
+                       /*
+                        * The parent has not been added yet, so we want to
+                        * make sure that it will be put after us.
+                        * tmp_alone_branch points to the beginning of the
+                        * branch where we will add the parent.
+                        */
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               rq->tmp_alone_branch);
+                       /*
+                        * Update tmp_alone_branch to point to the new
+                        * beginning of the branch.
+                        */
+                       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                }
 
                cfs_rq->on_list = 1;
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 }
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
 
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
        long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
-       u64 now = cfs_rq_clock_task(cfs_rq);
 
        if (cap > 0) {
                if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
                         * such that the next switched_to_fair() has the
                         * expected state.
                         */
-                       se->avg.last_update_time = now;
+                       se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
                        return;
                }
        }
 
-       update_cfs_rq_load_avg(now, cfs_rq, false);
-       attach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       attach_entity_cfs_rq(se);
 }
 
 #else /* !CONFIG_SMP */
@@ -2613,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
        if (tg_weight)
                shares /= tg_weight;
 
+       /*
+        * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+        * of a group with small tg->shares value. It is a floor value which is
+        * assigned as a minimum load.weight to the sched_entity representing
+        * the group on a CPU.
+        *
+        * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
+        * on an 8-core system with 8 tasks each runnable on one CPU shares has
+        * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+        * case no task is runnable on a CPU MIN_SHARES=2 should be returned
+        * instead of 0.
+        */
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg->shares)
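
As a rough illustration of the arithmetic in the comment above: the per-CPU
share is a proportional split of tg->shares, floored at the unscaled
MIN_SHARES. per_cpu_shares() below is a simplified, made-up stand-in for
calc_cfs_shares(), not the kernel code:

#include <stdio.h>

#define MIN_SHARES	2UL

static unsigned long per_cpu_shares(unsigned long tg_shares,
				    unsigned long cfs_rq_load,
				    unsigned long tg_load)
{
	unsigned long shares = tg_load ? tg_shares * cfs_rq_load / tg_load
				       : tg_shares;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* tg->shares = scale_load(15) = 15*1024, 8 equally loaded CPUs */
	printf("%lu\n", per_cpu_shares(15 * 1024, 1024, 8 * 1024));	/* 1920 */
	/* a CPU where the group has nothing runnable keeps the floor */
	printf("%lu\n", per_cpu_shares(15 * 1024, 0, 8 * 1024));	/* 2 */
	return 0;
}

The second call shows why the floor matters: a CPU where the group has
nothing runnable still keeps MIN_SHARES = 2 rather than dropping to 0.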
@@ -2645,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 {
+       struct cfs_rq *cfs_rq = group_cfs_rq(se);
        struct task_group *tg;
-       struct sched_entity *se;
        long shares;
 
-       tg = cfs_rq->tg;
-       se = tg->se[cpu_of(rq_of(cfs_rq))];
-       if (!se || throttled_hierarchy(cfs_rq))
+       if (!cfs_rq)
+               return;
+
+       if (throttled_hierarchy(cfs_rq))
                return;
+
+       tg = cfs_rq->tg;
+
 #ifndef CONFIG_SMP
        if (likely(se->load.weight == tg->shares))
                return;
@@ -2663,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
        reweight_entity(cfs_rq_of(se), se, shares);
 }
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2890,6 +2951,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        return decayed;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(_val) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+                                                               \
+       res = var + val;                                        \
+                                                               \
+       if (val < 0 && res > var)                               \
+               res = 0;                                        \
+                                                               \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
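
A userspace approximation of the macro's clamp-on-underflow behaviour
(READ_ONCE()/WRITE_ONCE() are reduced to plain accesses here, so only the
arithmetic is illustrated):

#include <stdio.h>

#define add_positive(_ptr, _val) do {				\
	__typeof__(_ptr) ptr = (_ptr);				\
	__typeof__(_val) val = (_val);				\
	__typeof__(*ptr) res, var = *ptr;			\
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	*ptr = res;						\
} while (0)

int main(void)
{
	unsigned long avg = 100;

	add_positive(&avg, -40L);	/* 100 - 40 = 60 */
	add_positive(&avg, -200L);	/* would underflow, clamped to 0 */
	printf("%lu\n", avg);		/* 0 */
	return 0;
}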
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -2969,8 +3050,138 @@ void set_task_rq_fair(struct sched_entity *se,
                se->avg.last_update_time = n_last_update_time;
        }
 }
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+       long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+       /* Nothing to update */
+       if (!delta)
+               return;
+
+       /* Set new sched_entity's utilization */
+       se->avg.util_avg = gcfs_rq->avg.util_avg;
+       se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+       /* Update parent cfs_rq utilization */
+       add_positive(&cfs_rq->avg.util_avg, delta);
+       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+       long delta, load = gcfs_rq->avg.load_avg;
+
+       /*
+        * If the load of group cfs_rq is null, the load of the
+        * sched_entity will also be null so we can skip the formula
+        */
+       if (load) {
+               long tg_load;
+
+               /* Get tg's load and ensure tg_load > 0 */
+               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+               /* Ensure tg_load >= load and is updated with the current load */
+               tg_load -= gcfs_rq->tg_load_avg_contrib;
+               tg_load += load;
+
+               /*
+                * We need to compute a correction term in the case that the
+                * task group is consuming more CPU than a task of equal
+                * weight. A task with a weight equals to tg->shares will have
+                * weight. A task with a weight equal to tg->shares will have
+                * a load less than or equal to scale_load_down(tg->shares).
+                * Similarly, the sched_entities that represent the task group
+                * at parent level can't have a load higher than
+                * scale_load_down(tg->shares). And the sum of sched_entities'
+                */
+               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+                       /* scale gcfs_rq's load into tg's shares */
+                       load *= scale_load_down(gcfs_rq->tg->shares);
+                       load /= tg_load;
+               }
+       }
+
+       delta = load - se->avg.load_avg;
+
+       /* Nothing to update */
+       if (!delta)
+               return;
+
+       /* Set new sched_entity's load */
+       se->avg.load_avg = load;
+       se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+       /* Update parent cfs_rq load */
+       add_positive(&cfs_rq->avg.load_avg, delta);
+       cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+       /*
+        * If the sched_entity is already enqueued, we also have to update the
+        * runnable load avg.
+        */
+       if (se->on_rq) {
+               /* Update parent cfs_rq runnable_load_avg */
+               add_positive(&cfs_rq->runnable_load_avg, delta);
+               cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+       }
+}
+
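
The correction term above can be illustrated with a simplified, made-up
helper that ignores the tg_load_avg_contrib bookkeeping: once the group's
total load exceeds what its shares allow, the child cfs_rq's load is scaled
down before being propagated to the parent.

#include <stdio.h>

static long propagated_load(long gcfs_rq_load, long tg_load, long tg_shares)
{
	long load = gcfs_rq_load;

	if (tg_load > tg_shares) {
		/* scale gcfs_rq's load into tg's shares */
		load = load * tg_shares / tg_load;
	}
	return load;
}

int main(void)
{
	/*
	 * Two CPUs each contribute 1024 of load to a group whose shares are
	 * only 1024: each group entity propagates its proportional part.
	 */
	printf("%ld\n", propagated_load(1024, 2048, 1024));	/* 512 */
	return 0;
}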
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+       cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+       if (!cfs_rq->propagate_avg)
+               return 0;
+
+       cfs_rq->propagate_avg = 0;
+       return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq;
+
+       if (entity_is_task(se))
+               return 0;
+
+       if (!test_and_clear_tg_cfs_propagate(se))
+               return 0;
+
+       cfs_rq = cfs_rq_of(se);
+
+       set_tg_cfs_propagate(cfs_rq);
+
+       update_tg_cfs_util(cfs_rq, se);
+       update_tg_cfs_load(cfs_rq, se);
+
+       return 1;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+       return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3041,6 +3252,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
                sub_positive(&sa->load_avg, r);
                sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                removed_load = 1;
+               set_tg_cfs_propagate(cfs_rq);
        }
 
        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3048,6 +3260,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
                removed_util = 1;
+               set_tg_cfs_propagate(cfs_rq);
        }
 
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3064,23 +3277,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
        return decayed || removed_load;
 }
 
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG      0x1
+#define SKIP_AGE_LOAD  0x2
+
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
        struct rq *rq = rq_of(cfs_rq);
        int cpu = cpu_of(rq);
+       int decayed;
 
        /*
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
-       __update_load_avg(now, cpu, &se->avg,
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+               __update_load_avg(now, cpu, &se->avg,
                          se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
+       }
 
-       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+       decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+       decayed |= propagate_entity_load_avg(se);
+
+       if (decayed && (flags & UPDATE_TG))
                update_tg_load_avg(cfs_rq, 0);
 }
 
@@ -3094,31 +3319,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       if (!sched_feat(ATTACH_AGE_LOAD))
-               goto skip_aging;
-
-       /*
-        * If we got migrated (either between CPUs or between cgroups) we'll
-        * have aged the average right before clearing @last_update_time.
-        *
-        * Or we're fresh through post_init_entity_util_avg().
-        */
-       if (se->avg.last_update_time) {
-               __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                                 &se->avg, 0, 0, NULL);
-
-               /*
-                * XXX: we could have just aged the entire load away if we've been
-                * absent from the fair class for too long.
-                */
-       }
-
-skip_aging:
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
        cfs_rq->avg.load_avg += se->avg.load_avg;
        cfs_rq->avg.load_sum += se->avg.load_sum;
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
+       set_tg_cfs_propagate(cfs_rq);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3133,14 +3339,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
 
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+       set_tg_cfs_propagate(cfs_rq);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3150,34 +3354,20 @@ static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct sched_avg *sa = &se->avg;
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated, decayed;
-
-       migrated = !sa->last_update_time;
-       if (!migrated) {
-               __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-                       se->on_rq * scale_load_down(se->load.weight),
-                       cfs_rq->curr == se, NULL);
-       }
-
-       decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
 
        cfs_rq->runnable_load_avg += sa->load_avg;
        cfs_rq->runnable_load_sum += sa->load_sum;
 
-       if (migrated)
+       if (!sa->last_update_time) {
                attach_entity_load_avg(cfs_rq, se);
-
-       if (decayed || migrated)
                update_tg_load_avg(cfs_rq, 0);
+       }
 }
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       update_load_avg(se, 1);
-
        cfs_rq->runnable_load_avg =
                max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
        cfs_rq->runnable_load_sum =
@@ -3205,6 +3395,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 }
 #endif
 
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 last_update_time;
+
+       last_update_time = cfs_rq_last_update_time(cfs_rq);
+       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
 /*
  * Task first catches up with cfs_rq, and then subtract
  * itself from the cfs_rq (task must be off the queue now).
@@ -3212,7 +3415,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 void remove_entity_load_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 last_update_time;
 
        /*
         * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3224,9 +3426,7 @@ void remove_entity_load_avg(struct sched_entity *se)
         * calls this.
         */
 
-       last_update_time = cfs_rq_last_update_time(cfs_rq);
-
-       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+       sync_entity_load_avg(se);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
@@ -3241,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
        return cfs_rq->avg.load_avg;
 }
 
-static int idle_balance(struct rq *this_rq);
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
 #else /* CONFIG_SMP */
 
@@ -3251,7 +3451,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
        return 0;
 }
 
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG      0x0
+#define SKIP_AGE_LOAD  0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
 {
        cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
 }
@@ -3267,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
-static inline int idle_balance(struct rq *rq)
+static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 {
        return 0;
 }
@@ -3396,9 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        if (renorm && !curr)
                se->vruntime += cfs_rq->min_vruntime;
 
+       /*
+        * When enqueuing a sched_entity, we must:
+        *   - Update loads to have both entity and cfs_rq synced with now.
+        *   - Add its load to cfs_rq->runnable_avg
+        *   - For group_entity, update its weight to reflect the new share of
+        *     its group cfs_rq
+        *   - Add its new weight to cfs_rq->load.weight
+        */
+       update_load_avg(se, UPDATE_TG);
        enqueue_entity_load_avg(cfs_rq, se);
+       update_cfs_shares(se);
        account_entity_enqueue(cfs_rq, se);
-       update_cfs_shares(cfs_rq);
 
        if (flags & ENQUEUE_WAKEUP)
                place_entity(cfs_rq, se, 0);
@@ -3470,6 +3682,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+
+       /*
+        * When dequeuing a sched_entity, we must:
+        *   - Update loads to have both entity and cfs_rq synced with now.
+        *   - Subtract its load from the cfs_rq->runnable_avg.
+        *   - Subtract its previous weight from cfs_rq->load.weight.
+        *   - For group entity, update its weight to reflect the new share
+        *     of its group cfs_rq.
+        */
+       update_load_avg(se, UPDATE_TG);
        dequeue_entity_load_avg(cfs_rq, se);
 
        update_stats_dequeue(cfs_rq, se, flags);
@@ -3493,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        /* return excess runtime on last dequeue */
        return_cfs_rq_runtime(cfs_rq);
 
-       update_cfs_shares(cfs_rq);
+       update_cfs_shares(se);
 
        /*
         * Now advance min_vruntime if @se was the entity holding it back,
@@ -3557,7 +3779,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
        }
 
        update_stats_curr_start(cfs_rq, se);
@@ -3675,8 +3897,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
        /*
         * Ensure that runnable average is periodically updated.
         */
-       update_load_avg(curr, 1);
-       update_cfs_shares(cfs_rq);
+       update_load_avg(curr, UPDATE_TG);
+       update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -4572,8 +4794,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, 1);
-               update_cfs_shares(cfs_rq);
+               update_load_avg(se, UPDATE_TG);
+               update_cfs_shares(se);
        }
 
        if (!se)
@@ -4631,8 +4853,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, 1);
-               update_cfs_shares(cfs_rq);
+               update_load_avg(se, UPDATE_TG);
+               update_cfs_shares(se);
        }
 
        if (!se)
@@ -5199,6 +5421,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        return 1;
 }
 
+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+       return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -5208,15 +5438,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
-       unsigned long min_load = ULONG_MAX, this_load = 0;
+       struct sched_group *most_spare_sg = NULL;
+       unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+       unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+       unsigned long most_spare = 0, this_spare = 0;
        int load_idx = sd->forkexec_idx;
-       int imbalance = 100 + (sd->imbalance_pct-100)/2;
+       int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+       unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+                               (sd->imbalance_pct-100) / 100;
 
        if (sd_flag & SD_BALANCE_WAKE)
                load_idx = sd->wake_idx;
 
        do {
-               unsigned long load, avg_load;
+               unsigned long load, avg_load, runnable_load;
+               unsigned long spare_cap, max_spare_cap;
                int local_group;
                int i;
 
@@ -5228,8 +5464,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
 
-               /* Tally up the load of all CPUs in the group */
+               /*
+                * Tally up the load of all CPUs in the group and find
+                * the group containing the CPU with most spare capacity.
+                */
                avg_load = 0;
+               runnable_load = 0;
+               max_spare_cap = 0;
 
                for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
@@ -5238,22 +5479,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                        else
                                load = target_load(i, load_idx);
 
-                       avg_load += load;
+                       runnable_load += load;
+
+                       avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+                       spare_cap = capacity_spare_wake(i, p);
+
+                       if (spare_cap > max_spare_cap)
+                               max_spare_cap = spare_cap;
                }
 
                /* Adjust by relative CPU capacity of the group */
-               avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+               avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+                                       group->sgc->capacity;
+               runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+                                       group->sgc->capacity;
 
                if (local_group) {
-                       this_load = avg_load;
-               } else if (avg_load < min_load) {
-                       min_load = avg_load;
-                       idlest = group;
+                       this_runnable_load = runnable_load;
+                       this_avg_load = avg_load;
+                       this_spare = max_spare_cap;
+               } else {
+                       if (min_runnable_load > (runnable_load + imbalance)) {
+                               /*
+                                * The runnable load is significantly smaller
+                                * so we can pick this new cpu
+                                */
+                               min_runnable_load = runnable_load;
+                               min_avg_load = avg_load;
+                               idlest = group;
+                       } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+                                  (100*min_avg_load > imbalance_scale*avg_load)) {
+                               /*
+                                * The runnable loads are close so take the
+                                * blocked load into account through avg_load.
+                                */
+                               min_avg_load = avg_load;
+                               idlest = group;
+                       }
+
+                       if (most_spare < max_spare_cap) {
+                               most_spare = max_spare_cap;
+                               most_spare_sg = group;
+                       }
                }
        } while (group = group->next, group != sd->groups);
 
-       if (!idlest || 100*this_load < imbalance*min_load)
+       /*
+        * The cross-over point between using spare capacity or least load
+        * is too conservative for high utilization tasks on partially
+        * utilized systems if we require spare_capacity > task_util(p),
+        * so we allow for some task stuffing by using
+        * spare_capacity > task_util(p)/2.
+        *
+        * Spare capacity can't be used for fork because the utilization has
+        * not been set yet; we must first select a rq to compute the initial
+        * utilization.
+        */
+       if (sd_flag & SD_BALANCE_FORK)
+               goto skip_spare;
+
+       if (this_spare > task_util(p) / 2 &&
+           imbalance_scale*this_spare > 100*most_spare)
+               return NULL;
+
+       if (most_spare > task_util(p) / 2)
+               return most_spare_sg;
+
+skip_spare:
+       if (!idlest)
+               return NULL;
+
+       if (min_runnable_load > (this_runnable_load + imbalance))
+               return NULL;
+
+       if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+            (100*this_avg_load < imbalance_scale*min_avg_load))
                return NULL;
+
        return idlest;
 }
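
The selection rules at the tail of the function condense into a small
decision sketch; pick_group() and struct group_stats are made-up names and
the avg_load tie-break is omitted:

#include <stdio.h>

struct group_stats {
	unsigned long this_spare, most_spare;
	unsigned long this_runnable_load, min_runnable_load;
};

/* 0: stay local, 1: use the most-spare group, 2: use the idlest group */
static int pick_group(const struct group_stats *s, unsigned long task_util,
		      unsigned long imbalance, unsigned long imbalance_scale)
{
	if (s->this_spare > task_util / 2 &&
	    imbalance_scale * s->this_spare > 100 * s->most_spare)
		return 0;

	if (s->most_spare > task_util / 2)
		return 1;

	if (s->min_runnable_load > s->this_runnable_load + imbalance)
		return 0;

	return 2;
}

int main(void)
{
	struct group_stats s = {
		.this_spare = 100, .most_spare = 600,
		.this_runnable_load = 900, .min_runnable_load = 300,
	};

	/*
	 * Task util 400: the local spare (100) covers less than half of it,
	 * the best remote spare (600) covers more than half, so the
	 * most-spare group is chosen.
	 */
	printf("%d\n", pick_group(&s, 400, 256, 112));	/* 1 */
	return 0;
}

The constants 256 and 112 stand in for the imbalance and imbalance_scale
values the code above would compute from an imbalance_pct of 125.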
 
@@ -5589,6 +5892,24 @@ static inline int task_util(struct task_struct *p)
        return p->se.avg.util_avg;
 }
 
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+       unsigned long util, capacity;
+
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+               return cpu_util(cpu);
+
+       capacity = capacity_orig_of(cpu);
+       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+       return (util >= capacity) ? capacity : util;
+}
+
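
A standalone sketch of the subtraction performed above, using plain
integers instead of the kernel's sched_avg (illustrative names): the waking
task's own utilization is removed from its previous CPU so that wakeup
placement does not count it twice.

#include <stdio.h>

static unsigned long cpu_util_without_task(unsigned long cpu_util,
					   unsigned long task_util,
					   unsigned long capacity_orig)
{
	unsigned long util = cpu_util > task_util ? cpu_util - task_util : 0;

	return util >= capacity_orig ? capacity_orig : util;
}

int main(void)
{
	/* CPU utilization 700, of which 300 belongs to the waking task */
	printf("%lu\n", cpu_util_without_task(700, 300, 1024));	/* 400 */
	return 0;
}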
 /*
  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
@@ -5607,6 +5928,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
        if (max_cap - min_cap < max_cap >> 3)
                return 0;
 
+       /* Bring task utilization in sync with prev_cpu */
+       sync_entity_load_avg(&p->se);
+
        return min_cap * 1024 < task_util(p) * capacity_margin;
 }
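
Plugging illustrative numbers into the capacity-fit test above
(task_fits_min_cap() is a made-up helper): on an asymmetric system with a
little CPU of capacity 512 and capacity_margin = 1280, a task of util 400
still fits, while a task of util 420 does not, and WAKE_AFFINE is disabled.

#include <stdbool.h>
#include <stdio.h>

static bool task_fits_min_cap(unsigned long task_util, unsigned long min_cap,
			      unsigned long capacity_margin)
{
	return min_cap * 1024 >= task_util * capacity_margin;
}

int main(void)
{
	printf("%d\n", task_fits_min_cap(400, 512, 1280));	/* 1: 512000 <= 524288 */
	printf("%d\n", task_fits_min_cap(420, 512, 1280));	/* 0: 537600 >  524288 */
	return 0;
}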
 
@@ -5923,7 +6247,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 }
 
 static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
@@ -6030,15 +6354,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie c
        return p;
 
 idle:
-       /*
-        * This is OK, because current is on_cpu, which avoids it being picked
-        * for load-balance and preemption/IRQs are still disabled avoiding
-        * further scheduler activity on it and we're being very careful to
-        * re-start the picking loop.
-        */
-       lockdep_unpin_lock(&rq->lock, cookie);
-       new_tasks = idle_balance(rq);
-       lockdep_repin_lock(&rq->lock, cookie);
+       new_tasks = idle_balance(rq, rf);
+
        /*
         * Because idle_balance() releases (and re-acquires) rq->lock, it is
         * possible for any higher priority task to appear. In that case we
@@ -6641,6 +6958,10 @@ static void update_blocked_averages(int cpu)
 
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                        update_tg_load_avg(cfs_rq, 0);
+
+               /* Propagate pending load changes to the parent */
+               if (cfs_rq->tg->se[cpu])
+                       update_load_avg(cfs_rq->tg->se[cpu], 0);
        }
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -6845,13 +7166,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 
        cpu_rq(cpu)->cpu_capacity = capacity;
        sdg->sgc->capacity = capacity;
+       sdg->sgc->min_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity;
+       unsigned long capacity, min_capacity;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -6864,6 +7186,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
        }
 
        capacity = 0;
+       min_capacity = ULONG_MAX;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -6888,11 +7211,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                         */
                        if (unlikely(!rq->sd)) {
                                capacity += capacity_of(cpu);
-                               continue;
+                       } else {
+                               sgc = rq->sd->groups->sgc;
+                               capacity += sgc->capacity;
                        }
 
-                       sgc = rq->sd->groups->sgc;
-                       capacity += sgc->capacity;
+                       min_capacity = min(capacity, min_capacity);
                }
        } else  {
                /*
@@ -6902,12 +7226,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
-                       capacity += group->sgc->capacity;
+                       struct sched_group_capacity *sgc = group->sgc;
+
+                       capacity += sgc->capacity;
+                       min_capacity = min(sgc->min_capacity, min_capacity);
                        group = group->next;
                } while (group != child->groups);
        }
 
        sdg->sgc->capacity = capacity;
+       sdg->sgc->min_capacity = min_capacity;
 }
 
 /*
@@ -6930,8 +7258,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
  * Something like:
  *
- *     { 0 1 2 3 } { 4 5 6 7 }
- *             *     * * *
+ *     { 0 1 2 3 } { 4 5 6 7 }
+ *             *     * * *
  *
  * If we were to balance group-wise we'd place two tasks in the first group and
  * two tasks in the second group. Clearly this is undesired as it will overload
@@ -7002,6 +7330,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
        return false;
 }
 
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+       return sg->sgc->min_capacity * capacity_margin <
+                                               ref->sgc->min_capacity * 1024;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
                          struct sg_lb_stats *sgs)
@@ -7105,6 +7444,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        if (sgs->avg_load <= busiest->avg_load)
                return false;
 
+       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+               goto asym_packing;
+
+       /*
+        * Candidate sg has no more than one task per CPU and
+        * has higher per-CPU capacity. Migrating tasks to less
+        * capable CPUs may harm throughput. Maximize throughput,
+        * power/energy consequences are not considered.
+        */
+       if (sgs->sum_nr_running <= sgs->group_weight &&
+           group_smaller_cpu_capacity(sds->local, sg))
+               return false;
+
+asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
                return true;
@@ -7113,16 +7466,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        if (env->idle == CPU_NOT_IDLE)
                return true;
        /*
-        * ASYM_PACKING needs to move all the work to the lowest
-        * numbered CPUs in the group, therefore mark all groups
-        * higher than ourself as busy.
+        * ASYM_PACKING needs to move all the work to the highest
+        * priority CPUs in the group, therefore mark all groups
+        * of lower priority than ourself as busy.
         */
-       if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+       if (sgs->sum_nr_running &&
+           sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
                if (!sds->busiest)
                        return true;
 
-               /* Prefer to move from highest possible cpu's work */
-               if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+               /* Prefer to move from lowest priority cpu's work */
+               if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+                                     sg->asym_prefer_cpu))
                        return true;
        }
 
@@ -7274,8 +7629,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
        if (!sds->busiest)
                return 0;
 
-       busiest_cpu = group_first_cpu(sds->busiest);
-       if (env->dst_cpu > busiest_cpu)
+       busiest_cpu = sds->busiest->asym_prefer_cpu;
+       if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
                return 0;
 
        env->imbalance = DIV_ROUND_CLOSEST(
@@ -7613,10 +7968,11 @@ static int need_active_balance(struct lb_env *env)
 
                /*
                 * ASYM_PACKING needs to force migrate tasks from busy but
-                * higher numbered CPUs in order to pack all tasks in the
-                * lowest numbered CPUs.
+                * lower priority CPUs in order to pack all tasks in the
+                * highest priority CPUs.
                 */
-               if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+               if ((sd->flags & SD_ASYM_PACKING) &&
+                   sched_asym_prefer(env->dst_cpu, env->src_cpu))
                        return 1;
        }
 
@@ -7748,6 +8104,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 more_balance:
                raw_spin_lock_irqsave(&busiest->lock, flags);
+               update_rq_clock(busiest);
 
                /*
                 * cur_ld_moved - load moved in current iteration
@@ -7968,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static int idle_balance(struct rq *this_rq)
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
        unsigned long next_balance = jiffies + HZ;
        int this_cpu = this_rq->cpu;
@@ -7982,6 +8339,14 @@ static int idle_balance(struct rq *this_rq)
         */
        this_rq->idle_stamp = rq_clock(this_rq);
 
+       /*
+        * This is OK, because current is on_cpu, which avoids it being picked
+        * for load-balance and preemption/IRQs are still disabled avoiding
+        * further scheduler activity on it and we're being very careful to
+        * re-start the picking loop.
+        */
+       rq_unpin_lock(this_rq, rf);
+
        if (this_rq->avg_idle < sysctl_sched_migration_cost ||
            !this_rq->rd->overload) {
                rcu_read_lock();
@@ -8059,6 +8424,8 @@ static int idle_balance(struct rq *this_rq)
        if (pulled_task)
                this_rq->idle_stamp = 0;
 
+       rq_repin_lock(this_rq, rf);
+
        return pulled_task;
 }
 
@@ -8114,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data)
                };
 
                schedstat_inc(sd->alb_count);
+               update_rq_clock(busiest_rq);
 
                p = detach_one_task(&env);
                if (p) {
@@ -8465,7 +8833,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
        unsigned long now = jiffies;
        struct sched_domain_shared *sds;
        struct sched_domain *sd;
-       int nr_busy, cpu = rq->cpu;
+       int nr_busy, i, cpu = rq->cpu;
        bool kick = false;
 
        if (unlikely(rq->idle_balance))
@@ -8516,12 +8884,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
        }
 
        sd = rcu_dereference(per_cpu(sd_asym, cpu));
-       if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-                                 sched_domain_span(sd)) < cpu)) {
-               kick = true;
-               goto unlock;
-       }
+       if (sd) {
+               for_each_cpu(i, sched_domain_span(sd)) {
+                       if (i == cpu ||
+                           !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+                               continue;
 
+                       if (sched_asym_prefer(i, cpu)) {
+                               kick = true;
+                               goto unlock;
+                       }
+               }
+       }
 unlock:
        rcu_read_unlock();
        return kick;
@@ -8687,32 +9061,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
        return false;
 }
 
-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 now = cfs_rq_clock_task(cfs_rq);
+       struct cfs_rq *cfs_rq;
 
-       if (!vruntime_normalized(p)) {
-               /*
-                * Fix up our vruntime so that the current sleep doesn't
-                * cause 'unlimited' sleep bonus.
-                */
-               place_entity(cfs_rq, se, 0);
-               se->vruntime -= cfs_rq->min_vruntime;
+       /* Start to propagate at parent */
+       se = se->parent;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+
+               update_load_avg(se, UPDATE_TG);
        }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /* Catch up with the cfs_rq and remove our load when we leave */
-       update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_load_avg(se, 0);
        detach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
+       propagate_entity_cfs_rq(se);
 }
 
-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
 {
-       struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 now = cfs_rq_clock_task(cfs_rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /*
@@ -8722,10 +9109,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
        se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
-       /* Synchronize task with its cfs_rq */
-       update_cfs_rq_load_avg(now, cfs_rq, false);
+       /* Synchronize entity with its cfs_rq */
+       update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
        attach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
+       propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       if (!vruntime_normalized(p)) {
+               /*
+                * Fix up our vruntime so that the current sleep doesn't
+                * cause 'unlimited' sleep bonus.
+                */
+               place_entity(cfs_rq, se, 0);
+               se->vruntime -= cfs_rq->min_vruntime;
+       }
+
+       detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       attach_entity_cfs_rq(se);
 
        if (!vruntime_normalized(p))
                se->vruntime += cfs_rq->min_vruntime;
@@ -8779,6 +9192,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->propagate_avg = 0;
+#endif
        atomic_long_set(&cfs_rq->removed_load_avg, 0);
        atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
@@ -8887,7 +9303,8 @@ void online_fair_sched_group(struct task_group *tg)
                se = tg->se[i];
 
                raw_spin_lock_irq(&rq->lock);
-               post_init_entity_util_avg(se);
+               update_rq_clock(rq);
+               attach_entity_cfs_rq(se);
                sync_throttle(tg, i);
                raw_spin_unlock_irq(&rq->lock);
        }
@@ -8979,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 
                /* Possible calls to update_curr() need rq clock */
                update_rq_clock(rq);
-               for_each_sched_entity(se)
-                       update_cfs_shares(group_cfs_rq(se));
+               for_each_sched_entity(se) {
+                       update_load_avg(se, UPDATE_TG);
+                       update_cfs_shares(se);
+               }
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }