sched/fair: Optimize select_idle_cpu
kernel/sched/fair.c
index 682a754ea3e1a867e7b1c0cb2c8dee0dfc5c1461..280d54ccb4bed6503a22b267a996a812d4432304 100644 (file)
@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
                }
        }
 
-       /* hint to use a 32x32->64 mul */
-       fact = (u64)(u32)fact * lw->inv_weight;
+       fact = mul_u32_u32(fact, lw->inv_weight);
 
        while (fact >> 32) {
                fact >>= 1;
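
mul_u32_u32() makes the 32x32->64 widening multiply explicit instead of relying on the cast to hint it to the compiler. A user-space sketch of the generic helper's behaviour (the kernel's own version lives in include/linux/math64.h and may be overridden per architecture):

    #include <stdint.h>

    /* Widen both 32-bit operands so the product is computed in 64 bits
     * without truncation; this is what the old "(u64)(u32)fact * ..."
     * cast was asking the compiler to do.
     */
    static inline uint64_t mul_u32_u32_sketch(uint32_t a, uint32_t b)
    {
            return (uint64_t)a * b;
    }
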
@@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long cpu_runnable_load(struct rq *rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+       return cfs_rq_runnable_load_avg(&rq->cfs);
+}
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-       if (decayed)
-               cfs_rq_util_change(cfs_rq, 0);
-
        return decayed;
 }
 
@@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
                update_tg_load_avg(cfs_rq, 0);
 
-       } else if (decayed && (flags & UPDATE_TG))
-               update_tg_load_avg(cfs_rq, 0);
+       } else if (decayed) {
+               cfs_rq_util_change(cfs_rq, 0);
+
+               if (flags & UPDATE_TG)
+                       update_tg_load_avg(cfs_rq, 0);
+       }
 }
 
 #ifndef CONFIG_64BIT
@@ -3763,11 +3768,22 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
        if (ue.enqueued & UTIL_AVG_UNCHANGED)
                return;
 
+       /*
+        * Reset EWMA on utilization increases, the moving average is used only
+        * to smooth utilization decreases.
+        */
+       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+       if (sched_feat(UTIL_EST_FASTUP)) {
+               if (ue.ewma < ue.enqueued) {
+                       ue.ewma = ue.enqueued;
+                       goto done;
+               }
+       }
+
        /*
         * Skip update of task's estimated utilization when its EWMA is
         * already ~1% close to its last activation value.
         */
-       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
        last_ewma_diff = ue.enqueued - ue.ewma;
        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
                return;
@@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
        ue.ewma  += last_ewma_diff;
        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
        WRITE_ONCE(p->se.avg.util_est, ue);
 }
 
@@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu)
                        rq->nr_running);
 }
 
-static unsigned long cpu_runnable_load(struct rq *rq)
+static unsigned long cpu_load(struct rq *rq)
 {
-       return cfs_rq_runnable_load_avg(&rq->cfs);
+       return cfs_rq_load_avg(&rq->cfs);
 }
 
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
 {
-       return cpu_rq(cpu)->cpu_capacity;
-}
+       struct cfs_rq *cfs_rq;
+       unsigned int load;
 
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-       unsigned long load_avg = cpu_runnable_load(rq);
+       /* Task has no contribution or is new */
+       if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+               return cpu_load(rq);
 
-       if (nr_running)
-               return load_avg / nr_running;
+       cfs_rq = &rq->cfs;
+       load = READ_ONCE(cfs_rq->avg.load_avg);
 
-       return 0;
+       /* Discount task's util from CPU's util */
+       lsub_positive(&load, task_h_load(p));
+
+       return load;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity;
 }
 
 static void record_wakee(struct task_struct *p)
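
cpu_load_without() follows the same pattern as the existing cpu_util_without(): if p still contributes to this runqueue, its hierarchical load is removed with a subtraction that is clamped at zero. A minimal stand-in for the clamped subtraction that the lsub_positive() macro performs above (illustrative, not the kernel macro):

    /* Clamped subtraction: never let the running sum underflow. */
    static inline unsigned long sub_positive_sketch(unsigned long sum,
                                                    unsigned long val)
    {
            return sum > val ? sum - val : 0;
    }
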
@@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
        s64 this_eff_load, prev_eff_load;
        unsigned long task_load;
 
-       this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+       this_eff_load = cpu_load(cpu_rq(this_cpu));
 
        if (sync) {
                unsigned long current_load = task_h_load(current);
@@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
                this_eff_load *= 100;
        this_eff_load *= capacity_of(prev_cpu);
 
-       prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+       prev_eff_load = cpu_load(cpu_rq(prev_cpu));
        prev_eff_load -= task_load;
        if (sched_feat(WA_BIAS))
                prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        return target;
 }
 
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
-       return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
-{
-       struct sched_group *idlest = NULL, *group = sd->groups;
-       struct sched_group *most_spare_sg = NULL;
-       unsigned long min_runnable_load = ULONG_MAX;
-       unsigned long this_runnable_load = ULONG_MAX;
-       unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
-       unsigned long most_spare = 0, this_spare = 0;
-       int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-       unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-                               (sd->imbalance_pct-100) / 100;
-
-       do {
-               unsigned long load, avg_load, runnable_load;
-               unsigned long spare_cap, max_spare_cap;
-               int local_group;
-               int i;
-
-               /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_span(group),
-                                       p->cpus_ptr))
-                       continue;
-
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_span(group));
-
-               /*
-                * Tally up the load of all CPUs in the group and find
-                * the group containing the CPU with most spare capacity.
-                */
-               avg_load = 0;
-               runnable_load = 0;
-               max_spare_cap = 0;
-
-               for_each_cpu(i, sched_group_span(group)) {
-                       load = cpu_runnable_load(cpu_rq(i));
-                       runnable_load += load;
-
-                       avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
-                       spare_cap = capacity_spare_without(i, p);
-
-                       if (spare_cap > max_spare_cap)
-                               max_spare_cap = spare_cap;
-               }
-
-               /* Adjust by relative CPU capacity of the group */
-               avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
-                                       group->sgc->capacity;
-               runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
-                                       group->sgc->capacity;
-
-               if (local_group) {
-                       this_runnable_load = runnable_load;
-                       this_avg_load = avg_load;
-                       this_spare = max_spare_cap;
-               } else {
-                       if (min_runnable_load > (runnable_load + imbalance)) {
-                               /*
-                                * The runnable load is significantly smaller
-                                * so we can pick this new CPU:
-                                */
-                               min_runnable_load = runnable_load;
-                               min_avg_load = avg_load;
-                               idlest = group;
-                       } else if ((runnable_load < (min_runnable_load + imbalance)) &&
-                                  (100*min_avg_load > imbalance_scale*avg_load)) {
-                               /*
-                                * The runnable loads are close so take the
-                                * blocked load into account through avg_load:
-                                */
-                               min_avg_load = avg_load;
-                               idlest = group;
-                       }
-
-                       if (most_spare < max_spare_cap) {
-                               most_spare = max_spare_cap;
-                               most_spare_sg = group;
-                       }
-               }
-       } while (group = group->next, group != sd->groups);
-
-       /*
-        * The cross-over point between using spare capacity or least load
-        * is too conservative for high utilization tasks on partially
-        * utilized systems if we require spare_capacity > task_util(p),
-        * so we allow for some task stuffing by using
-        * spare_capacity > task_util(p)/2.
-        *
-        * Spare capacity can't be used for fork because the utilization has
-        * not been set yet, we must first select a rq to compute the initial
-        * utilization.
-        */
-       if (sd_flag & SD_BALANCE_FORK)
-               goto skip_spare;
-
-       if (this_spare > task_util(p) / 2 &&
-           imbalance_scale*this_spare > 100*most_spare)
-               return NULL;
-
-       if (most_spare > task_util(p) / 2)
-               return most_spare_sg;
-
-skip_spare:
-       if (!idlest)
-               return NULL;
-
-       /*
-        * When comparing groups across NUMA domains, it's possible for the
-        * local domain to be very lightly loaded relative to the remote
-        * domains but "imbalance" skews the comparison making remote CPUs
-        * look much more favourable. When considering cross-domain, add
-        * imbalance to the runnable load on the remote node and consider
-        * staying local.
-        */
-       if ((sd->flags & SD_NUMA) &&
-           min_runnable_load + imbalance >= this_runnable_load)
-               return NULL;
-
-       if (min_runnable_load > (this_runnable_load + imbalance))
-               return NULL;
-
-       if ((this_runnable_load < (min_runnable_load + imbalance)) &&
-            (100*this_avg_load < imbalance_scale*min_avg_load))
-               return NULL;
-
-       return idlest;
-}
+                 int this_cpu, int sd_flag);
 
 /*
  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                                continue;
                        }
 
-                       load = cpu_runnable_load(cpu_rq(i));
+                       load = cpu_load(cpu_rq(i));
                        if (load < min_load) {
                                min_load = load;
                                least_loaded_cpu = i;
@@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                return prev_cpu;
 
        /*
-        * We need task's util for capacity_spare_without, sync it up to
+        * We need task's util for cpu_util_without, sync it up to
         * prev_cpu's last_update_time.
         */
        if (!(sd_flag & SD_BALANCE_FORK))
@@ -5932,6 +5828,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        struct sched_domain *this_sd;
        u64 avg_cost, avg_idle;
        u64 time, cost;
@@ -5963,11 +5860,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        time = cpu_clock(this);
 
-       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+       for_each_cpu_wrap(cpu, cpus, target) {
                if (!--nr)
                        return si_cpu;
-               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
-                       continue;
                if (available_idle_cpu(cpu))
                        break;
                if (si_cpu == -1 && sched_idle_cpu(cpu))
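
This is the change the commit subject refers to: the per-iteration cpumask_test_cpu() against p->cpus_ptr is replaced by a single cpumask_and() into the per-CPU select_idle_mask scratch mask, so the scan only ever visits allowed CPUs. A toy user-space model of the same idea (hypothetical names, a 64-bit mask standing in for a cpumask, no wrap-around or scan budget):

    #include <stdint.h>

    static int scan_for_idle(uint64_t domain_mask, uint64_t allowed_mask,
                             const int *cpu_is_idle)
    {
            /* One AND up front instead of one test per candidate CPU. */
            uint64_t cpus = domain_mask & allowed_mask;
            int cpu;

            for (cpu = 0; cpu < 64; cpu++) {
                    if (!(cpus & (1ULL << cpu)))
                            continue;
                    if (cpu_is_idle[cpu])
                            return cpu;
            }
            return -1;
    }
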
@@ -6570,6 +6467,15 @@ static void task_dead_fair(struct task_struct *p)
 {
        remove_entity_load_avg(&p->se);
 }
+
+static int
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+       if (rq->nr_running)
+               return 1;
+
+       return newidle_balance(rq, rf) != 0;
+}
 #endif /* CONFIG_SMP */
 
 static unsigned long wakeup_gran(struct sched_entity *se)
@@ -6737,7 +6643,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                set_last_buddy(se);
 }
 
-static struct task_struct *
+struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
        struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6746,7 +6652,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
        int new_tasks;
 
 again:
-       if (!cfs_rq->nr_running)
+       if (!sched_fair_runnable(rq))
                goto idle;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6881,10 +6787,15 @@ done: __maybe_unused;
        return NULL;
 }
 
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+       return pick_next_task_fair(rq, NULL, NULL);
+}
+
 /*
  * Account for a descheduled task:
  */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;
@@ -7070,11 +6981,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 enum fbq_type { regular, remote, all };
 
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
 enum group_type {
-       group_other = 0,
+       /* The group has spare capacity that can be used to run more tasks.  */
+       group_has_spare = 0,
+       /*
+        * The group is fully used and the tasks don't compete for more CPU
+        * cycles. Nevertheless, some tasks might wait before running.
+        */
+       group_fully_busy,
+       /*
+        * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+        * and must be migrated to a more powerful CPU.
+        */
        group_misfit_task,
+       /*
+        * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+        * and the task should be migrated to it instead of running on the
+        * current CPU.
+        */
+       group_asym_packing,
+       /*
+        * The tasks' affinity constraints previously prevented the scheduler
+        * from balancing the load across the system.
+        */
        group_imbalanced,
-       group_overloaded,
+       /*
+        * The CPU is overloaded and can't provide expected CPU cycles to all
+        * tasks.
+        */
+       group_overloaded
+};
+
+enum migration_type {
+       migrate_load = 0,
+       migrate_util,
+       migrate_task,
+       migrate_misfit
 };
 
 #define LBF_ALL_PINNED 0x01
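
Because the enum above is ordered by pulling priority, classification comparisons reduce to integer comparisons; only ties within the same type need per-type logic (see the switch added to update_sd_pick_busiest() further down). A minimal standalone sketch of that idiom:

    /* Mirror of the ordering introduced above (illustrative names). */
    enum group_type_sketch {
            has_spare = 0, fully_busy, misfit_task,
            asym_packing, imbalanced, overloaded
    };

    /* A candidate replaces the current busiest group when it classifies
     * strictly higher; equal types fall through to per-type tie-breaks.
     */
    static int candidate_beats_busiest(enum group_type_sketch candidate,
                                       enum group_type_sketch busiest)
    {
            return candidate > busiest;
    }
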
@@ -7107,7 +7056,7 @@ struct lb_env {
        unsigned int            loop_max;
 
        enum fbq_type           fbq_type;
-       enum group_type         src_grp_type;
+       enum migration_type     migration_type;
        struct list_head        tasks;
 };
 
@@ -7330,7 +7279,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
  * busiest_rq, as part of a balancing operation within domain "sd".
  *
  * Returns number of detached tasks if successful and 0 otherwise.
@@ -7338,8 +7287,8 @@ static const unsigned int sched_nr_migrate_break = 32;
 static int detach_tasks(struct lb_env *env)
 {
        struct list_head *tasks = &env->src_rq->cfs_tasks;
+       unsigned long util, load;
        struct task_struct *p;
-       unsigned long load;
        int detached = 0;
 
        lockdep_assert_held(&env->src_rq->lock);
@@ -7372,19 +7321,46 @@ static int detach_tasks(struct lb_env *env)
                if (!can_migrate_task(p, env))
                        goto next;
 
-               load = task_h_load(p);
+               switch (env->migration_type) {
+               case migrate_load:
+                       load = task_h_load(p);
 
-               if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
-                       goto next;
+                       if (sched_feat(LB_MIN) &&
+                           load < 16 && !env->sd->nr_balance_failed)
+                               goto next;
 
-               if ((load / 2) > env->imbalance)
-                       goto next;
+                       if (load/2 > env->imbalance)
+                               goto next;
+
+                       env->imbalance -= load;
+                       break;
+
+               case migrate_util:
+                       util = task_util_est(p);
+
+                       if (util > env->imbalance)
+                               goto next;
+
+                       env->imbalance -= util;
+                       break;
+
+               case migrate_task:
+                       env->imbalance--;
+                       break;
+
+               case migrate_misfit:
+                       /* This is not a misfit task */
+                       if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+                               goto next;
+
+                       env->imbalance = 0;
+                       break;
+               }
 
                detach_task(p, env);
                list_add(&p->se.group_node, &env->tasks);
 
                detached++;
-               env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPTION
                /*
@@ -7398,7 +7374,7 @@ static int detach_tasks(struct lb_env *env)
 
                /*
                 * We only want to steal up to the prescribed amount of
-                * runnable load.
+                * load/util/tasks.
                 */
                if (env->imbalance <= 0)
                        break;
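
The detach loop now budgets env->imbalance in the unit chosen by calculate_imbalance(): hierarchical load for migrate_load, estimated utilization for migrate_util, a plain task count for migrate_task, and a single genuinely misfit task for migrate_misfit. A compact standalone sketch of that accounting (the LB_MIN small-load shortcut is omitted; the parameters stand in for task_h_load(), task_util_est() and task_fits_capacity()):

    #include <stdbool.h>

    enum migration_type_sketch { mig_load, mig_util, mig_task, mig_misfit };

    /* Returns true if the task should be detached, updating *imbalance. */
    static bool account_detach(enum migration_type_sketch type, long *imbalance,
                               long task_load, long task_util, bool task_fits)
    {
            switch (type) {
            case mig_load:
                    if (task_load / 2 > *imbalance)
                            return false;           /* too big a bite */
                    *imbalance -= task_load;
                    return true;
            case mig_util:
                    if (task_util > *imbalance)
                            return false;
                    *imbalance -= task_util;
                    return true;
            case mig_task:
                    (*imbalance)--;                 /* just count tasks */
                    return true;
            case mig_misfit:
                    if (task_fits)
                            return false;           /* not the misfit task */
                    *imbalance = 0;                 /* one misfit is enough */
                    return true;
            }
            return false;
    }
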
@@ -7508,6 +7484,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; }
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
 #endif
 
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+       const struct sched_class *curr_class;
+       u64 now = rq_clock_pelt(rq);
+       bool decayed;
+
+       /*
+        * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+        * DL and IRQ signals have been updated before updating CFS.
+        */
+       curr_class = rq->curr->sched_class;
+
+       decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+                 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+                 update_irq_load_avg(rq, 0);
+
+       if (others_have_blocked(rq))
+               *done = false;
+
+       return decayed;
+}
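
Note the bitwise OR above (and in the new update_blocked_averages() further down): unlike a logical ||, it does not short-circuit, so the DL and IRQ signals are still updated even when the RT update already reported decay. A tiny standalone illustration with placeholder updaters:

    #include <stdbool.h>

    static bool upd_rt(void)  { return true;  }   /* placeholder updaters */
    static bool upd_dl(void)  { return false; }
    static bool upd_irq(void) { return false; }

    static bool update_all_signals(void)
    {
            /* '|' evaluates every operand; '||' would skip upd_dl() and
             * upd_irq() once upd_rt() returned true, leaving them stale.
             */
            return upd_rt() | upd_dl() | upd_irq();
    }
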
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7527,16 +7525,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
        return true;
 }
 
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
 {
-       struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq, *pos;
-       const struct sched_class *curr_class;
-       struct rq_flags rf;
-       bool done = true;
-
-       rq_lock_irqsave(rq, &rf);
-       update_rq_clock(rq);
+       bool decayed = false;
+       int cpu = cpu_of(rq);
 
        /*
         * Iterates the task_group tree in a bottom up fashion, see
@@ -7545,9 +7538,13 @@ static void update_blocked_averages(int cpu)
        for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
 
-               if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
                        update_tg_load_avg(cfs_rq, 0);
 
+                       if (cfs_rq == &rq->cfs)
+                               decayed = true;
+               }
+
                /* Propagate pending load changes to the parent, if any: */
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
@@ -7562,19 +7559,10 @@ static void update_blocked_averages(int cpu)
 
                /* Don't need periodic decay once load/util_avg are null */
                if (cfs_rq_has_blocked(cfs_rq))
-                       done = false;
+                       *done = false;
        }
 
-       curr_class = rq->curr->sched_class;
-       update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
-       update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
-       update_irq_load_avg(rq, 0);
-       /* Don't need periodic decay once load/util_avg are null */
-       if (others_have_blocked(rq))
-               done = false;
-
-       update_blocked_load_status(rq, !done);
-       rq_unlock_irqrestore(rq, &rf);
+       return decayed;
 }
 
 /*
@@ -7624,23 +7612,16 @@ static unsigned long task_h_load(struct task_struct *p)
                        cfs_rq_load_avg(cfs_rq) + 1);
 }
 #else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
 {
-       struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq = &rq->cfs;
-       const struct sched_class *curr_class;
-       struct rq_flags rf;
+       bool decayed;
 
-       rq_lock_irqsave(rq, &rf);
-       update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+       decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+       if (cfs_rq_has_blocked(cfs_rq))
+               *done = false;
 
-       curr_class = rq->curr->sched_class;
-       update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
-       update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
-       update_irq_load_avg(rq, 0);
-       update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
-       rq_unlock_irqrestore(rq, &rf);
+       return decayed;
 }
 
 static unsigned long task_h_load(struct task_struct *p)
@@ -7649,6 +7630,24 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif
 
+static void update_blocked_averages(int cpu)
+{
+       bool decayed = false, done = true;
+       struct rq *rq = cpu_rq(cpu);
+       struct rq_flags rf;
+
+       rq_lock_irqsave(rq, &rf);
+       update_rq_clock(rq);
+
+       decayed |= __update_blocked_others(rq, &done);
+       decayed |= __update_blocked_fair(rq, &done);
+
+       update_blocked_load_status(rq, !done);
+       if (decayed)
+               cpufreq_update_util(rq, 0);
+       rq_unlock_irqrestore(rq, &rf);
+}
+
 /********** Helpers for find_busiest_group ************************/
 
 /*
@@ -7657,14 +7656,14 @@ static unsigned long task_h_load(struct task_struct *p)
 struct sg_lb_stats {
        unsigned long avg_load; /*Avg load across the CPUs of the group */
        unsigned long group_load; /* Total load over the CPUs of the group */
-       unsigned long load_per_task;
        unsigned long group_capacity;
        unsigned long group_util; /* Total utilization of the group */
-       unsigned int sum_nr_running; /* Nr tasks running in the group */
+       unsigned int sum_nr_running; /* Nr of tasks running in the group */
+       unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
        unsigned int idle_cpus;
        unsigned int group_weight;
        enum group_type group_type;
-       int group_no_capacity;
+       unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
        unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
@@ -7679,10 +7678,10 @@ struct sg_lb_stats {
 struct sd_lb_stats {
        struct sched_group *busiest;    /* Busiest group in this sd */
        struct sched_group *local;      /* Local group in this sd */
-       unsigned long total_running;
        unsigned long total_load;       /* Total load of all groups in sd */
        unsigned long total_capacity;   /* Total capacity of all groups in sd */
        unsigned long avg_load; /* Average load across all groups in sd */
+       unsigned int prefer_sibling; /* tasks should go to sibling first */
 
        struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
        struct sg_lb_stats local_stat;  /* Statistics of the local group */
@@ -7693,19 +7692,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
        /*
         * Skimp on the clearing to avoid duplicate work. We can avoid clearing
         * local_stat because update_sg_lb_stats() does a full clear/assignment.
-        * We must however clear busiest_stat::avg_load because
-        * update_sd_pick_busiest() reads this before assignment.
+        * We must however set busiest_stat::group_type and
+        * busiest_stat::idle_cpus to the worst busiest group because
+        * update_sd_pick_busiest() reads these before assignment.
         */
        *sds = (struct sd_lb_stats){
                .busiest = NULL,
                .local = NULL,
-               .total_running = 0UL,
                .total_load = 0UL,
                .total_capacity = 0UL,
                .busiest_stat = {
-                       .avg_load = 0UL,
-                       .sum_nr_running = 0,
-                       .group_type = group_other,
+                       .idle_cpus = UINT_MAX,
+                       .group_type = group_has_spare,
                },
        };
 }
@@ -7893,13 +7891,13 @@ static inline int sg_imbalanced(struct sched_group *group)
  * any benefit for the load balance.
  */
 static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
        if (sgs->sum_nr_running < sgs->group_weight)
                return true;
 
        if ((sgs->group_capacity * 100) >
-                       (sgs->group_util * env->sd->imbalance_pct))
+                       (sgs->group_util * imbalance_pct))
                return true;
 
        return false;
@@ -7914,13 +7912,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
  *  false.
  */
 static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
        if (sgs->sum_nr_running <= sgs->group_weight)
                return false;
 
        if ((sgs->group_capacity * 100) <
-                       (sgs->group_util * env->sd->imbalance_pct))
+                       (sgs->group_util * imbalance_pct))
                return true;
 
        return false;
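
Passing imbalance_pct directly lets the wakeup path reuse these helpers without a struct lb_env. A worked example of the threshold, assuming the common default imbalance_pct of 125: a single-CPU group of capacity 1024 is overloaded once more than one task is runnable and 1024 * 100 < util * 125, i.e. once util exceeds roughly 819. A standalone restatement of the check:

    static int is_overloaded_sketch(unsigned long capacity, unsigned long util,
                                    unsigned int nr_running, unsigned int weight,
                                    unsigned int imbalance_pct)
    {
            if (nr_running <= weight)
                    return 0;
            return capacity * 100 < util * imbalance_pct;
    }
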
@@ -7947,19 +7945,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 }
 
 static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(unsigned int imbalance_pct,
+                         struct sched_group *group,
                          struct sg_lb_stats *sgs)
 {
-       if (sgs->group_no_capacity)
+       if (group_is_overloaded(imbalance_pct, sgs))
                return group_overloaded;
 
        if (sg_imbalanced(group))
                return group_imbalanced;
 
+       if (sgs->group_asym_packing)
+               return group_asym_packing;
+
        if (sgs->group_misfit_task_load)
                return group_misfit_task;
 
-       return group_other;
+       if (!group_has_capacity(imbalance_pct, sgs))
+               return group_fully_busy;
+
+       return group_has_spare;
 }
 
 static bool update_nohz_stats(struct rq *rq, bool force)
@@ -7996,21 +8001,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                                      struct sg_lb_stats *sgs,
                                      int *sg_status)
 {
-       int i, nr_running;
+       int i, nr_running, local_group;
 
        memset(sgs, 0, sizeof(*sgs));
 
+       local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
        for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
 
                if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
                        env->flags |= LBF_NOHZ_AGAIN;
 
-               sgs->group_load += cpu_runnable_load(rq);
+               sgs->group_load += cpu_load(rq);
                sgs->group_util += cpu_util(i);
-               sgs->sum_nr_running += rq->cfs.h_nr_running;
+               sgs->sum_h_nr_running += rq->cfs.h_nr_running;
 
                nr_running = rq->nr_running;
+               sgs->sum_nr_running += nr_running;
+
                if (nr_running > 1)
                        *sg_status |= SG_OVERLOAD;
 
@@ -8024,9 +8033,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                /*
                 * No need to call idle_cpu() if nr_running is not 0
                 */
-               if (!nr_running && idle_cpu(i))
+               if (!nr_running && idle_cpu(i)) {
                        sgs->idle_cpus++;
+                       /* Idle cpu can't have misfit task */
+                       continue;
+               }
+
+               if (local_group)
+                       continue;
 
+               /* Check for a misfit task on the cpu */
                if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
                    sgs->group_misfit_task_load < rq->misfit_task_load) {
                        sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8034,17 +8050,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                }
        }
 
-       /* Adjust by relative CPU capacity of the group */
-       sgs->group_capacity = group->sgc->capacity;
-       sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+       /* Check if dst CPU is idle and preferred to this group */
+       if (env->sd->flags & SD_ASYM_PACKING &&
+           env->idle != CPU_NOT_IDLE &&
+           sgs->sum_h_nr_running &&
+           sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+               sgs->group_asym_packing = 1;
+       }
 
-       if (sgs->sum_nr_running)
-               sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
+       sgs->group_capacity = group->sgc->capacity;
 
        sgs->group_weight = group->group_weight;
 
-       sgs->group_no_capacity = group_is_overloaded(env, sgs);
-       sgs->group_type = group_classify(group, sgs);
+       sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+       /* Computing avg_load makes sense only when group is overloaded */
+       if (sgs->group_type == group_overloaded)
+               sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+                               sgs->group_capacity;
 }
 
 /**
@@ -8067,6 +8090,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 {
        struct sg_lb_stats *busiest = &sds->busiest_stat;
 
+       /* Make sure that there is at least one task to pull */
+       if (!sgs->sum_h_nr_running)
+               return false;
+
        /*
         * Don't try to pull misfit tasks we can't help.
         * We can use max_capacity here as reduction in capacity on some
@@ -8075,7 +8102,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         */
        if (sgs->group_type == group_misfit_task &&
            (!group_smaller_max_cpu_capacity(sg, sds->local) ||
-            !group_has_capacity(env, &sds->local_stat)))
+            sds->local_stat.group_type != group_has_spare))
                return false;
 
        if (sgs->group_type > busiest->group_type)
@@ -8084,62 +8111,88 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        if (sgs->group_type < busiest->group_type)
                return false;
 
-       if (sgs->avg_load <= busiest->avg_load)
-               return false;
-
-       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
-               goto asym_packing;
-
        /*
-        * Candidate sg has no more than one task per CPU and
-        * has higher per-CPU capacity. Migrating tasks to less
-        * capable CPUs may harm throughput. Maximize throughput,
-        * power/energy consequences are not considered.
+        * The candidate and the current busiest group are the same type of
+        * group. Let's check which one is the busiest according to the type.
         */
-       if (sgs->sum_nr_running <= sgs->group_weight &&
-           group_smaller_min_cpu_capacity(sds->local, sg))
-               return false;
 
-       /*
-        * If we have more than one misfit sg go with the biggest misfit.
-        */
-       if (sgs->group_type == group_misfit_task &&
-           sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+       switch (sgs->group_type) {
+       case group_overloaded:
+               /* Select the overloaded group with highest avg_load. */
+               if (sgs->avg_load <= busiest->avg_load)
+                       return false;
+               break;
+
+       case group_imbalanced:
+               /*
+                * Select the 1st imbalanced group as we don't have any way to
+                * choose one more than another.
+                */
                return false;
 
-asym_packing:
-       /* This is the busiest node in its class. */
-       if (!(env->sd->flags & SD_ASYM_PACKING))
-               return true;
+       case group_asym_packing:
+               /* Prefer to move work from the lowest-priority CPU */
+               if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+                       return false;
+               break;
 
-       /* No ASYM_PACKING if target CPU is already busy */
-       if (env->idle == CPU_NOT_IDLE)
-               return true;
-       /*
-        * ASYM_PACKING needs to move all the work to the highest
-        * prority CPUs in the group, therefore mark all groups
-        * of lower priority than ourself as busy.
-        */
-       if (sgs->sum_nr_running &&
-           sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
-               if (!sds->busiest)
-                       return true;
+       case group_misfit_task:
+               /*
+                * If we have more than one misfit sg go with the biggest
+                * misfit.
+                */
+               if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+                       return false;
+               break;
 
-               /* Prefer to move from lowest priority CPU's work */
-               if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
-                                     sg->asym_prefer_cpu))
-                       return true;
+       case group_fully_busy:
+               /*
+                * Select the fully busy group with highest avg_load. In
+                * theory, there is no need to pull task from such kind of
+                * group because tasks have all compute capacity that they need
+                * but we can still improve the overall throughput by reducing
+                * contention when accessing shared HW resources.
+                *
+                * XXX for now avg_load is not computed and always 0 so we
+                * select the 1st one.
+                */
+               if (sgs->avg_load <= busiest->avg_load)
+                       return false;
+               break;
+
+       case group_has_spare:
+               /*
+                * Select the non-overloaded group with the lowest number
+                * of idle CPUs. We could also compare the spare capacity,
+                * which is more stable, but a group can have less spare
+                * capacity yet more idle CPUs, which means less
+                * opportunity to pull tasks.
+                */
+               if (sgs->idle_cpus >= busiest->idle_cpus)
+                       return false;
+               break;
        }
 
-       return false;
+       /*
+        * Candidate sg has no more than one task per CPU and has higher
+        * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+        * throughput. Maximize throughput, power/energy consequences are not
+        * considered.
+        */
+       if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+           (sgs->group_type <= group_fully_busy) &&
+           (group_smaller_min_cpu_capacity(sds->local, sg)))
+               return false;
+
+       return true;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 {
-       if (sgs->sum_nr_running > sgs->nr_numa_running)
+       if (sgs->sum_h_nr_running > sgs->nr_numa_running)
                return regular;
-       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+       if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
                return remote;
        return all;
 }
@@ -8164,18 +8217,310 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-/**
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+               return 0;
+
+       if (task_on_rq_queued(p))
+               return 1;
+
+       return 0;
+}
+
+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (rq->curr != rq->idle && rq->curr != p)
+               return 0;
+
+       /*
+        * rq->nr_running can't be used but an updated version without the
+        * impact of p on cpu must be used instead. The updated nr_running
+        * must be computed and tested before calling idle_cpu_without().
+        */
+
+#ifdef CONFIG_SMP
+       if (!llist_empty(&rq->wake_list))
+               return 0;
+#endif
+
+       return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+                                         struct sched_group *group,
+                                         struct sg_lb_stats *sgs,
+                                         struct task_struct *p)
+{
+       int i, nr_running;
+
+       memset(sgs, 0, sizeof(*sgs));
+
+       for_each_cpu(i, sched_group_span(group)) {
+               struct rq *rq = cpu_rq(i);
+               unsigned int local;
+
+               sgs->group_load += cpu_load_without(rq, p);
+               sgs->group_util += cpu_util_without(i, p);
+               local = task_running_on_cpu(i, p);
+               sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+
+               nr_running = rq->nr_running - local;
+               sgs->sum_nr_running += nr_running;
+
+               /*
+                * No need to call idle_cpu_without() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu_without(i, p))
+                       sgs->idle_cpus++;
+
+       }
+
+       /* Check if task fits in the group */
+       if (sd->flags & SD_ASYM_CPUCAPACITY &&
+           !task_fits_capacity(p, group->sgc->max_capacity)) {
+               sgs->group_misfit_task_load = 1;
+       }
+
+       sgs->group_capacity = group->sgc->capacity;
+
+       sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+       /*
+        * Computing avg_load makes sense only when group is fully busy or
+        * overloaded
+        */
+       if (sgs->group_type == group_fully_busy ||
+           sgs->group_type == group_overloaded)
+               sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+                               sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+                              struct sg_lb_stats *idlest_sgs,
+                              struct sched_group *group,
+                              struct sg_lb_stats *sgs)
+{
+       if (sgs->group_type < idlest_sgs->group_type)
+               return true;
+
+       if (sgs->group_type > idlest_sgs->group_type)
+               return false;
+
+       /*
+        * The candidate and the current idlest group are the same type of
+        * group. Let's check which one is the idlest according to the type.
+        */
+
+       switch (sgs->group_type) {
+       case group_overloaded:
+       case group_fully_busy:
+               /* Select the group with lowest avg_load. */
+               if (idlest_sgs->avg_load <= sgs->avg_load)
+                       return false;
+               break;
+
+       case group_imbalanced:
+       case group_asym_packing:
+               /* Those types are not used in the slow wakeup path */
+               return false;
+
+       case group_misfit_task:
+               /* Select group with the highest max capacity */
+               if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+                       return false;
+               break;
+
+       case group_has_spare:
+               /* Select group with most idle CPUs */
+               if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+                       return false;
+               break;
+       }
+
+       return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int sd_flag)
+{
+       struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+       struct sg_lb_stats local_sgs, tmp_sgs;
+       struct sg_lb_stats *sgs;
+       unsigned long imbalance;
+       struct sg_lb_stats idlest_sgs = {
+                       .avg_load = UINT_MAX,
+                       .group_type = group_overloaded,
+       };
+
+       imbalance = scale_load_down(NICE_0_LOAD) *
+                               (sd->imbalance_pct-100) / 100;
+
+       do {
+               int local_group;
+
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_span(group),
+                                       p->cpus_ptr))
+                       continue;
+
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_span(group));
+
+               if (local_group) {
+                       sgs = &local_sgs;
+                       local = group;
+               } else {
+                       sgs = &tmp_sgs;
+               }
+
+               update_sg_wakeup_stats(sd, group, sgs, p);
+
+               if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+                       idlest = group;
+                       idlest_sgs = *sgs;
+               }
+
+       } while (group = group->next, group != sd->groups);
+
+
+       /* There is no idlest group to push tasks to */
+       if (!idlest)
+               return NULL;
+
+       /*
+        * If the local group is idler than the selected idlest group
+        * don't try and push the task.
+        */
+       if (local_sgs.group_type < idlest_sgs.group_type)
+               return NULL;
+
+       /*
+        * If the local group is busier than the selected idlest group
+        * try and push the task.
+        */
+       if (local_sgs.group_type > idlest_sgs.group_type)
+               return idlest;
+
+       switch (local_sgs.group_type) {
+       case group_overloaded:
+       case group_fully_busy:
+               /*
+                * When comparing groups across NUMA domains, it's possible for
+                * the local domain to be very lightly loaded relative to the
+                * remote domains but "imbalance" skews the comparison making
+                * remote CPUs look much more favourable. When considering
+                * cross-domain, add imbalance to the load on the remote node
+                * and consider staying local.
+                */
+
+               if ((sd->flags & SD_NUMA) &&
+                   ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+                       return NULL;
+
+               /*
+                * If the local group is less loaded than the selected
+                * idlest group don't try and push any tasks.
+                */
+               if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+                       return NULL;
+
+               if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+                       return NULL;
+               break;
+
+       case group_imbalanced:
+       case group_asym_packing:
+               /* Those types are not used in the slow wakeup path */
+               return NULL;
+
+       case group_misfit_task:
+               /* Select group with the highest max capacity */
+               if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+                       return NULL;
+               break;
+
+       case group_has_spare:
+               if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+                       int idlest_cpu;
+                       /*
+                        * If there is spare capacity at NUMA, try to select
+                        * the preferred node
+                        */
+                       if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+                               return NULL;
+
+                       idlest_cpu = cpumask_first(sched_group_span(idlest));
+                       if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+                               return idlest;
+#endif
+                       /*
+                        * Otherwise, keep the task on this node to stay close
+                        * to its wakeup source and improve locality. If there is
+                        * a real need of migration, periodic load balance will
+                        * take care of it.
+                        */
+                       if (local_sgs.idle_cpus)
+                               return NULL;
+               }
+
+               /*
+                * Select the group with the highest number of idle CPUs. We
+                * could also compare the utilization, which is more stable,
+                * but a group can have less spare capacity yet more idle
+                * CPUs, which means more opportunity to run the task.
+                */
+               if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+                       return NULL;
+               break;
+       }
+
+       return idlest;
+}
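
For the overloaded/fully-busy case above, the NUMA bias added to the remote group's load works out as follows, assuming scale_load_down(NICE_0_LOAD) of 1024 and an imbalance_pct of 125: the remote group must be at least 1024 * 25 / 100 = 256 load units lighter than the local group before the task is pushed off its wakeup node. A one-line restatement of that computation:

    static unsigned long numa_imbalance_sketch(unsigned long nice0_load,
                                               unsigned int imbalance_pct)
    {
            return nice0_load * (imbalance_pct - 100) / 100;
    }
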
+
+/**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @sds: variable to hold the statistics for this sched_domain.
  */
+
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
-       bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
        int sg_status = 0;
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -8202,22 +8547,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                if (local_group)
                        goto next_group;
 
-               /*
-                * In case the child domain prefers tasks go to siblings
-                * first, lower the sg capacity so that we'll try
-                * and move all the excess tasks away. We lower the capacity
-                * of a group only if the local group has the capacity to fit
-                * these excess tasks. The extra check prevents the case where
-                * you always pull from the heaviest group when it is already
-                * under-utilized (possible with a large weight task outweighs
-                * the tasks on the system).
-                */
-               if (prefer_sibling && sds->local &&
-                   group_has_capacity(env, local) &&
-                   (sgs->sum_nr_running > local->sum_nr_running + 1)) {
-                       sgs->group_no_capacity = 1;
-                       sgs->group_type = group_classify(sg, sgs);
-               }
 
                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
@@ -8226,13 +8555,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 next_group:
                /* Now, start updating sd_lb_stats */
-               sds->total_running += sgs->sum_nr_running;
                sds->total_load += sgs->group_load;
                sds->total_capacity += sgs->group_capacity;
 
                sg = sg->next;
        } while (sg != env->sd->groups);
 
+       /* Tag domain that child domain prefers tasks go to siblings first */
+       sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
 #ifdef CONFIG_NO_HZ_COMMON
        if ((env->flags & LBF_NOHZ_AGAIN) &&
            cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8263,203 +8594,160 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 }
 
 /**
- * check_asym_packing - Check to see if the group is packed into the
- *                     sched domain.
- *
- * This is primarily intended to used at the sibling level.  Some
- * cores like POWER7 prefer to use lower numbered SMT threads.  In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle.  When in lower SMT modes, the threads will
- * perform better since they share less core resources.  Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads.  It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on.  Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU.  The amount of the imbalance is returned in env->imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
-{
-       int busiest_cpu;
-
-       if (!(env->sd->flags & SD_ASYM_PACKING))
-               return 0;
-
-       if (env->idle == CPU_NOT_IDLE)
-               return 0;
-
-       if (!sds->busiest)
-               return 0;
-
-       busiest_cpu = sds->busiest->asym_prefer_cpu;
-       if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
-               return 0;
-
-       env->imbalance = sds->busiest_stat.group_load;
-
-       return 1;
-}
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- *                     amongst the groups of a sched_domain, during
- *                     load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                      groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
  */
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
-       unsigned long tmp, capa_now = 0, capa_move = 0;
-       unsigned int imbn = 2;
-       unsigned long scaled_busy_load_per_task;
        struct sg_lb_stats *local, *busiest;
 
        local = &sds->local_stat;
        busiest = &sds->busiest_stat;
 
-       if (!local->sum_nr_running)
-               local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
-       else if (busiest->load_per_task > local->load_per_task)
-               imbn = 1;
+       if (busiest->group_type == group_misfit_task) {
+               /* Set imbalance to allow misfit tasks to be balanced. */
+               env->migration_type = migrate_misfit;
+               env->imbalance = 1;
+               return;
+       }
 
-       scaled_busy_load_per_task =
-               (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
-               busiest->group_capacity;
+       if (busiest->group_type == group_asym_packing) {
+               /*
+                * In case of asym capacity, we will try to migrate all load to
+                * the preferred CPU.
+                */
+               env->migration_type = migrate_task;
+               env->imbalance = busiest->sum_h_nr_running;
+               return;
+       }
 
-       if (busiest->avg_load + scaled_busy_load_per_task >=
-           local->avg_load + (scaled_busy_load_per_task * imbn)) {
-               env->imbalance = busiest->load_per_task;
+       if (busiest->group_type == group_imbalanced) {
+               /*
+                * In the group_imb case we cannot rely on group-wide averages
+                * to ensure CPU-load equilibrium, try to move any task to fix
+                * the imbalance. The next load balance will take care of
+                * balancing back the system.
+                */
+               env->migration_type = migrate_task;
+               env->imbalance = 1;
                return;
        }
 
        /*
-        * OK, we don't have enough imbalance to justify moving tasks,
-        * however we may be able to increase total CPU capacity used by
-        * moving them.
+        * Try to use spare capacity of local group without overloading it or
+        * emptying busiest.
+        * XXX Spreading tasks across NUMA nodes is not always the best policy
+        * and special care should be taken for SD_NUMA domain level before
+        * spreading the tasks. For now, load_balance() fully relies on
+        * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
         */
+       if (local->group_type == group_has_spare) {
+               if (busiest->group_type > group_fully_busy) {
+                       /*
+                        * If busiest is overloaded, try to fill spare
+                        * capacity. This might end up creating spare capacity
+                        * in busiest, or leave busiest still overloaded, but
+                        * there is no simple way to directly compute the
+                        * amount of load to migrate in order to balance the
+                        * system.
+                        */
+                       env->migration_type = migrate_util;
+                       env->imbalance = max(local->group_capacity, local->group_util) -
+                                        local->group_util;
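+                       /*
+                        * For illustration: with local group_capacity == 1024
+                        * and group_util == 700, this asks to pull up to 324
+                        * units of utilization. If group_util already meets or
+                        * exceeds group_capacity, the max() clamps the
+                        * imbalance to 0 and the check below handles that case.
+                        */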
 
-       capa_now += busiest->group_capacity *
-                       min(busiest->load_per_task, busiest->avg_load);
-       capa_now += local->group_capacity *
-                       min(local->load_per_task, local->avg_load);
-       capa_now /= SCHED_CAPACITY_SCALE;
-
-       /* Amount of load we'd subtract */
-       if (busiest->avg_load > scaled_busy_load_per_task) {
-               capa_move += busiest->group_capacity *
-                           min(busiest->load_per_task,
-                               busiest->avg_load - scaled_busy_load_per_task);
-       }
-
-       /* Amount of load we'd add */
-       if (busiest->avg_load * busiest->group_capacity <
-           busiest->load_per_task * SCHED_CAPACITY_SCALE) {
-               tmp = (busiest->avg_load * busiest->group_capacity) /
-                     local->group_capacity;
-       } else {
-               tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
-                     local->group_capacity;
-       }
-       capa_move += local->group_capacity *
-                   min(local->load_per_task, local->avg_load + tmp);
-       capa_move /= SCHED_CAPACITY_SCALE;
-
-       /* Move if we gain throughput */
-       if (capa_move > capa_now)
-               env->imbalance = busiest->load_per_task;
-}
+                       /*
+                        * In some cases, the group's utilization is at or even
+                        * above its capacity because of migrations, while the
+                        * local CPU is (newly) idle. There is at least one
+                        * waiting task in this overloaded busiest group. Let's
+                        * try to pull it.
+                        */
+                       if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+                               env->migration_type = migrate_task;
+                               env->imbalance = 1;
+                       }
 
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- *                      groups of a given sched_domain during load balance.
- * @env: load balance environment
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
-       unsigned long max_pull, load_above_capacity = ~0UL;
-       struct sg_lb_stats *local, *busiest;
+                       return;
+               }
 
-       local = &sds->local_stat;
-       busiest = &sds->busiest_stat;
+               if (busiest->group_weight == 1 || sds->prefer_sibling) {
+                       unsigned int nr_diff = busiest->sum_nr_running;
+                       /*
+                        * When the prefer-sibling flag is set, spread the
+                        * running tasks evenly across the groups.
+                        */
+                       env->migration_type = migrate_task;
+                       lsub_positive(&nr_diff, local->sum_nr_running);
+                       env->imbalance = nr_diff >> 1;
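+                       /*
+                        * For example, if busiest runs 6 tasks and local runs
+                        * 2, nr_diff is 4 and we ask to move 4 >> 1 == 2 tasks
+                        * so that both groups end up with 4. lsub_positive()
+                        * clamps nr_diff at 0 when local already runs at least
+                        * as many tasks as busiest.
+                        */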
+                       return;
+               }
 
-       if (busiest->group_type == group_imbalanced) {
                /*
-                * In the group_imb case we cannot rely on group-wide averages
-                * to ensure CPU-load equilibrium, look at wider averages. XXX
+                * If there is no overload, we just want to even out the
+                * number of idle CPUs.
                 */
-               busiest->load_per_task =
-                       min(busiest->load_per_task, sds->avg_load);
+               env->migration_type = migrate_task;
+               env->imbalance = max_t(long, 0, (local->idle_cpus -
+                                                busiest->idle_cpus) >> 1);
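+               /*
+                * E.g. with 4 idle CPUs locally and none in busiest, we ask to
+                * move (4 - 0) >> 1 == 2 tasks, which roughly evens out the
+                * idle CPU counts of the two groups.
+                */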
+               return;
        }
 
        /*
-        * Avg load of busiest sg can be less and avg load of local sg can
-        * be greater than avg load across all sgs of sd because avg load
-        * factors in sg capacity and sgs with smaller group_type are
-        * skipped when updating the busiest sg:
+        * The local group is fully busy but has to take more load to relieve
+        * the busiest group.
         */
-       if (busiest->group_type != group_misfit_task &&
-           (busiest->avg_load <= sds->avg_load ||
-            local->avg_load >= sds->avg_load)) {
-               env->imbalance = 0;
-               return fix_small_imbalance(env, sds);
-       }
+       if (local->group_type < group_overloaded) {
+               /*
+                * Local will become overloaded, so the avg_load metrics are
+                * needed from this point on.
+                */
 
-       /*
-        * If there aren't any idle CPUs, avoid creating some.
-        */
-       if (busiest->group_type == group_overloaded &&
-           local->group_type   == group_overloaded) {
-               load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
-               if (load_above_capacity > busiest->group_capacity) {
-                       load_above_capacity -= busiest->group_capacity;
-                       load_above_capacity *= scale_load_down(NICE_0_LOAD);
-                       load_above_capacity /= busiest->group_capacity;
-               } else
-                       load_above_capacity = ~0UL;
+               local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+                                 local->group_capacity;
+
+               sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+                               sds->total_capacity;
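+               /*
+                * E.g. a group_load of 1536 on a group_capacity of 2048 gives
+                * an avg_load of 1536 * 1024 / 2048 == 768, i.e. the group's
+                * load expressed relative to a single full-capacity CPU.
+                */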
        }
 
        /*
-        * We're trying to get all the CPUs to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded CPU below the average load. At the same time,
-        * we also don't want to reduce the group load below the group
-        * capacity. Thus we look for the minimum possible imbalance.
+        * Both groups are or will become overloaded and we're trying to get all
+        * the CPUs to the average_load, so we don't want to push ourselves
+        * above the average load, nor do we wish to reduce the max loaded CPU
+        * below the average load. At the same time, we also don't want to
+        * reduce the group load below the group capacity. Thus we look for
+        * the minimum possible imbalance.
         */
-       max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
-       /* How much load to actually move to equalise the imbalance */
+       env->migration_type = migrate_load;
        env->imbalance = min(
-               max_pull * busiest->group_capacity,
+               (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
                (sds->avg_load - local->avg_load) * local->group_capacity
        ) / SCHED_CAPACITY_SCALE;
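+       /*
+        * E.g. with busiest->avg_load == 1200, local->avg_load == 800,
+        * sds->avg_load == 1000 and both group capacities at 1024, the result
+        * is min(200 * 1024, 200 * 1024) / 1024 == 200: just enough load to
+        * bring both groups to the domain average without overshooting it.
+        */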
-
-       /* Boost imbalance to allow misfit task to be balanced. */
-       if (busiest->group_type == group_misfit_task) {
-               env->imbalance = max_t(long, env->imbalance,
-                                      busiest->group_misfit_task_load);
-       }
-
-       /*
-        * if *imbalance is less than the average load per runnable task
-        * there is no guarantee that any tasks will be moved so we'll have
-        * a think about bumping its value to force at least one task to be
-        * moved
-        */
-       if (env->imbalance < busiest->load_per_task)
-               return fix_small_imbalance(env, sds);
 }
 
 /******* find_busiest_group() helpers end here *********************/
 
+/*
+ * Decision matrix according to the local and busiest group type:
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
+ * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
+ * misfit_task      force     N/A        N/A    N/A  force      force
+ * asym_packing     force     force      N/A    N/A  force      force
+ * imbalanced       force     force      N/A    N/A  force      force
+ * overloaded       force     force      N/A    N/A  force      avg_load
+ *
+ * N/A :      Not Applicable because already filtered while updating
+ *            statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force :    Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle :  dst_cpu is not busy and the number of idle CPUs differs
+ *            significantly between the groups.
+ */
+
 /**
  * find_busiest_group - Returns the busiest group within the sched_domain
  * if there is an imbalance.
@@ -8479,7 +8767,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        init_sd_lb_stats(&sds);
 
        /*
-        * Compute the various statistics relavent for load balancing at
+        * Compute the various statistics relevant for load balancing at
         * this level.
         */
        update_sd_lb_stats(env, &sds);
@@ -8494,17 +8782,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        local = &sds.local_stat;
        busiest = &sds.busiest_stat;
 
-       /* ASYM feature bypasses nice load balance check */
-       if (check_asym_packing(env, &sds))
-               return sds.busiest;
-
        /* There is no busy sibling group to pull tasks from */
-       if (!sds.busiest || busiest->sum_nr_running == 0)
+       if (!sds.busiest)
                goto out_balanced;
 
-       /* XXX broken for overlapping NUMA groups */
-       sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
-                                               / sds.total_capacity;
+       /* Misfit tasks should be dealt with regardless of the avg load */
+       if (busiest->group_type == group_misfit_task)
+               goto force_balance;
+
+       /* ASYM feature bypasses nice load balance check */
+       if (busiest->group_type == group_asym_packing)
+               goto force_balance;
 
        /*
         * If the busiest group is imbalanced the below checks don't
@@ -8514,56 +8802,81 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        if (busiest->group_type == group_imbalanced)
                goto force_balance;
 
-       /*
-        * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
-        * capacities from resulting in underutilization due to avg_load.
-        */
-       if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
-           busiest->group_no_capacity)
-               goto force_balance;
-
-       /* Misfit tasks should be dealt with regardless of the avg load */
-       if (busiest->group_type == group_misfit_task)
-               goto force_balance;
-
        /*
         * If the local group is busier than the selected busiest group
         * don't try and pull any tasks.
         */
-       if (local->avg_load >= busiest->avg_load)
+       if (local->group_type > busiest->group_type)
                goto out_balanced;
 
        /*
-        * Don't pull any tasks if this group is already above the domain
-        * average load.
+        * When groups are overloaded, use the avg_load to ensure fairness
+        * between tasks.
         */
-       if (local->avg_load >= sds.avg_load)
-               goto out_balanced;
+       if (local->group_type == group_overloaded) {
+               /*
+                * If the local group is more loaded than the selected
+                * busiest group don't try to pull any tasks.
+                */
+               if (local->avg_load >= busiest->avg_load)
+                       goto out_balanced;
+
+               /* XXX broken for overlapping NUMA groups */
+               sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+                               sds.total_capacity;
 
-       if (env->idle == CPU_IDLE) {
                /*
-                * This CPU is idle. If the busiest group is not overloaded
-                * and there is no imbalance between this and busiest group
-                * wrt idle CPUs, it is balanced. The imbalance becomes
-                * significant if the diff is greater than 1 otherwise we
-                * might end up to just move the imbalance on another group
+                * Don't pull any tasks if this group is already above the
+                * domain average load.
                 */
-               if ((busiest->group_type != group_overloaded) &&
-                               (local->idle_cpus <= (busiest->idle_cpus + 1)))
+               if (local->avg_load >= sds.avg_load)
                        goto out_balanced;
-       } else {
+
                /*
-                * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
-                * imbalance_pct to be conservative.
+                * If the busiest group is more loaded, use imbalance_pct to be
+                * conservative.
                 */
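+               /*
+                * E.g. with an imbalance_pct of 125, the busiest group must
+                * carry more than 25% extra avg_load over the local group
+                * before the check below treats the domain as imbalanced.
+                */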
                if (100 * busiest->avg_load <=
                                env->sd->imbalance_pct * local->avg_load)
                        goto out_balanced;
        }
 
+       /* Try to move all excess tasks to child's sibling domain */
+       if (sds.prefer_sibling && local->group_type == group_has_spare &&
+           busiest->sum_nr_running > local->sum_nr_running + 1)
+               goto force_balance;
+
+       if (busiest->group_type != group_overloaded) {
+               if (env->idle == CPU_NOT_IDLE)
+                       /*
+                        * If the busiest group is not overloaded (and, as a
+                        * result, neither is the local one) but this CPU is
+                        * already busy, let another idle CPU try to pull a task.
+                        */
+                       goto out_balanced;
+
+               if (busiest->group_weight > 1 &&
+                   local->idle_cpus <= (busiest->idle_cpus + 1))
+                       /*
+                        * If the busiest group is not overloaded and there is
+                        * no imbalance between this group and the busiest one
+                        * wrt idle CPUs, it is balanced. The imbalance only
+                        * becomes significant if the difference is greater
+                        * than 1; otherwise we might just end up moving the
+                        * imbalance to another group. Of course this applies
+                        * only if there is more than one CPU per group.
+                        */
+                       goto out_balanced;
+
+               if (busiest->sum_h_nr_running == 1)
+                       /*
+                        * The busiest group doesn't have any tasks waiting to run.
+                        */
+                       goto out_balanced;
+       }
+
 force_balance:
        /* Looks like there is an imbalance. Compute it */
-       env->src_grp_type = busiest->group_type;
        calculate_imbalance(env, &sds);
        return env->imbalance ? sds.busiest : NULL;
 
@@ -8579,11 +8892,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                                     struct sched_group *group)
 {
        struct rq *busiest = NULL, *rq;
-       unsigned long busiest_load = 0, busiest_capacity = 1;
+       unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+       unsigned int busiest_nr = 0;
        int i;
 
        for_each_cpu_and(i, sched_group_span(group), env->cpus) {
-               unsigned long capacity, load;
+               unsigned long capacity, load, util;
+               unsigned int nr_running;
                enum fbq_type rt;
 
                rq = cpu_rq(i);
@@ -8611,20 +8926,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                if (rt > env->fbq_type)
                        continue;
 
-               /*
-                * For ASYM_CPUCAPACITY domains with misfit tasks we simply
-                * seek the "biggest" misfit task.
-                */
-               if (env->src_grp_type == group_misfit_task) {
-                       if (rq->misfit_task_load > busiest_load) {
-                               busiest_load = rq->misfit_task_load;
-                               busiest = rq;
-                       }
-
-                       continue;
-               }
-
                capacity = capacity_of(i);
+               nr_running = rq->cfs.h_nr_running;
 
                /*
                 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -8634,35 +8937,69 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                 */
                if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
                    capacity_of(env->dst_cpu) < capacity &&
-                   rq->nr_running == 1)
+                   nr_running == 1)
                        continue;
 
-               load = cpu_runnable_load(rq);
+               switch (env->migration_type) {
+               case migrate_load:
+                       /*
+                        * When comparing against the load imbalance, use
+                        * cpu_load(), which is not scaled with the CPU capacity.
+                        */
+                       load = cpu_load(rq);
 
-               /*
-                * When comparing with imbalance, use cpu_runnable_load()
-                * which is not scaled with the CPU capacity.
-                */
+                       if (nr_running == 1 && load > env->imbalance &&
+                           !check_cpu_capacity(rq, env->sd))
+                               break;
 
-               if (rq->nr_running == 1 && load > env->imbalance &&
-                   !check_cpu_capacity(rq, env->sd))
-                       continue;
+                       /*
+                        * For the load comparisons with the other CPUs,
+                        * consider the cpu_load() scaled with the CPU
+                        * capacity, so that the load can be moved away
+                        * from the CPU that is potentially running at a
+                        * lower capacity.
+                        *
+                        * Thus we're looking for max(load_i / capacity_i);
+                        * cross-multiplying to rid ourselves of the division
+                        * works out to:
+                        * load_i * capacity_j > load_j * capacity_i;
+                        * where j is our previous maximum.
+                        */
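+                       /*
+                        * For instance, load == 600 on a capacity-512 CPU vs
+                        * the previous maximum of load == 500 on a
+                        * capacity-1024 CPU: 600 * 1024 > 500 * 512, so the
+                        * smaller CPU is picked as busiest despite carrying
+                        * less absolute load.
+                        */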
+                       if (load * busiest_capacity > busiest_load * capacity) {
+                               busiest_load = load;
+                               busiest_capacity = capacity;
+                               busiest = rq;
+                       }
+                       break;
+
+               case migrate_util:
+                       util = cpu_util(cpu_of(rq));
+
+                       if (busiest_util < util) {
+                               busiest_util = util;
+                               busiest = rq;
+                       }
+                       break;
+
+               case migrate_task:
+                       if (busiest_nr < nr_running) {
+                               busiest_nr = nr_running;
+                               busiest = rq;
+                       }
+                       break;
+
+               case migrate_misfit:
+                       /*
+                        * For ASYM_CPUCAPACITY domains with misfit tasks we
+                        * simply seek the "biggest" misfit task.
+                        */
+                       if (rq->misfit_task_load > busiest_load) {
+                               busiest_load = rq->misfit_task_load;
+                               busiest = rq;
+                       }
+
+                       break;
 
-               /*
-                * For the load comparisons with the other CPU's, consider
-                * the cpu_runnable_load() scaled with the CPU capacity, so
-                * that the load can be moved away from the CPU that is
-                * potentially running at a lower capacity.
-                *
-                * Thus we're looking for max(load_i / capacity_i), crosswise
-                * multiplication to rid ourselves of the division works out
-                * to: load_i * capacity_j > load_j * capacity_i;  where j is
-                * our previous maximum.
-                */
-               if (load * busiest_capacity > busiest_load * capacity) {
-                       busiest_load = load;
-                       busiest_capacity = capacity;
-                       busiest = rq;
                }
        }
 
@@ -8708,7 +9045,7 @@ voluntary_active_balance(struct lb_env *env)
                        return 1;
        }
 
-       if (env->src_grp_type == group_misfit_task)
+       if (env->migration_type == migrate_misfit)
                return 1;
 
        return 0;
@@ -9737,6 +10074,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ *   < 0 - we released the lock and there are !fair tasks present
+ *     0 - failed, no new tasks
+ *   > 0 - success, new (fair) tasks present
  */
 int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
@@ -9981,6 +10323,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
        if (!task_on_rq_queued(p))
                return;
 
+       if (rq->cfs.nr_running == 1)
+               return;
+
        /*
         * Reschedule if we are currently running on this runqueue and
         * our priority decreased, or if we are not currently running on
@@ -10131,7 +10476,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
  * This routine is mostly called to set cfs_rq->curr field when a task
  * migrates between groups/classes.
  */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
        struct sched_entity *se = &p->se;
 
@@ -10413,12 +10758,12 @@ const struct sched_class fair_sched_class = {
 
        .check_preempt_curr     = check_preempt_wakeup,
 
-       .pick_next_task         = pick_next_task_fair,
-
+       .pick_next_task         = __pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,
        .set_next_task          = set_next_task_fair,
 
 #ifdef CONFIG_SMP
+       .balance                = balance_fair,
        .select_task_rq         = select_task_rq_fair,
        .migrate_task_rq        = migrate_task_rq_fair,