diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 500f5db0de0ba86a331586d4189e3b299cb6148e..d4bbf68c31611fcd6fa3da456ef435021cefae53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
 }
 
 /*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
  *
  * (default: ~20%)
  */
-static unsigned int capacity_margin                    = 1280;
+#define fits_capacity(cap, max)        ((cap) * 1280 < (max) * 1024)
+
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
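Aside (not part of the patch): the new fits_capacity() macro replaces the old capacity_margin variable, and utilization "fits" a capacity only while util * 1280 < capacity * 1024, i.e. while it stays below roughly 80% of that capacity (~20% headroom). A minimal standalone sketch, re-declaring the macro outside the kernel purely for illustration:

#include <stdio.h>

/* Mirror of the kernel macro, re-declared here only for illustration. */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

int main(void)
{
	/* 1024 is the capacity of the biggest CPU at its highest OPP. */
	printf("%d\n", fits_capacity(800UL, 1024UL));	/* 1: 800 is below the ~80% mark (819) */
	printf("%d\n", fits_capacity(820UL, 1024UL));	/* 0: 820 eats into the ~20% headroom */
	return 0;
}
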
@@ -1188,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
        return max(smin, smax);
 }
 
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
-       int mm_users = 0;
-       struct mm_struct *mm = p->mm;
-
-       if (mm) {
-               mm_users = atomic_read(&mm->mm_users);
-               if (mm_users == 1) {
-                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                       mm->numa_scan_seq = 0;
-               }
-       }
-       p->node_stamp                   = 0;
-       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
-       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
-       p->numa_work.next               = &p->numa_work;
-       p->numa_faults                  = NULL;
-       RCU_INIT_POINTER(p->numa_group, NULL);
-       p->last_task_numa_placement     = 0;
-       p->last_sum_exec_runtime        = 0;
-
-       /* New address space, reset the preferred nid */
-       if (!(clone_flags & CLONE_VM)) {
-               p->numa_preferred_nid = NUMA_NO_NODE;
-               return;
-       }
-
-       /*
-        * New thread, keep existing numa_preferred_nid which should be copied
-        * already by arch_dup_task_struct but stagger when scans start.
-        */
-       if (mm) {
-               unsigned int delay;
-
-               delay = min_t(unsigned int, task_scan_max(current),
-                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
-               delay += 2 * TICK_NSEC;
-               p->node_stamp = delay;
-       }
-}
-
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -2523,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
  */
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
 {
        unsigned long migrate, next_scan, now = jiffies;
        struct task_struct *p = current;
@@ -2536,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
 
        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
-       work->next = work; /* protect against double add */
+       work->next = work;
        /*
         * Who cares about NUMA placement when they're dying.
         *
@@ -2665,6 +2624,50 @@ void task_numa_work(struct callback_head *work)
        }
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+       int mm_users = 0;
+       struct mm_struct *mm = p->mm;
+
+       if (mm) {
+               mm_users = atomic_read(&mm->mm_users);
+               if (mm_users == 1) {
+                       mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                       mm->numa_scan_seq = 0;
+               }
+       }
+       p->node_stamp                   = 0;
+       p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
+       p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
+       /* Protect against double add, see task_tick_numa and task_numa_work */
+       p->numa_work.next               = &p->numa_work;
+       p->numa_faults                  = NULL;
+       RCU_INIT_POINTER(p->numa_group, NULL);
+       p->last_task_numa_placement     = 0;
+       p->last_sum_exec_runtime        = 0;
+
+       init_task_work(&p->numa_work, task_numa_work);
+
+       /* New address space, reset the preferred nid */
+       if (!(clone_flags & CLONE_VM)) {
+               p->numa_preferred_nid = NUMA_NO_NODE;
+               return;
+       }
+
+       /*
+        * New thread, keep existing numa_preferred_nid which should be copied
+        * already by arch_dup_task_struct but stagger when scans start.
+        */
+       if (mm) {
+               unsigned int delay;
+
+               delay = min_t(unsigned int, task_scan_max(current),
+                       current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+               delay += 2 * TICK_NSEC;
+               p->node_stamp = delay;
+       }
+}
+
 /*
  * Drive the periodic memory faults..
  */
@@ -2693,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
                        curr->numa_scan_period = task_scan_start(curr);
                curr->node_stamp += period;
 
-               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
-                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+               if (!time_before(jiffies, curr->mm->numa_next_scan))
                        task_work_add(curr, work, true);
-               }
        }
 }
 
@@ -3689,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
        return cfs_rq->avg.load_avg;
 }
 
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
 static inline unsigned long task_util(struct task_struct *p)
 {
        return READ_ONCE(p->se.avg.util_avg);
@@ -3807,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 
 static inline int task_fits_capacity(struct task_struct *p, long capacity)
 {
-       return capacity * 1024 > task_util_est(p) * capacity_margin;
+       return fits_capacity(task_util_est(p), capacity);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4370,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 
        now = sched_clock_cpu(smp_processor_id());
        cfs_b->runtime = cfs_b->quota;
-       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4393,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
        struct task_group *tg = cfs_rq->tg;
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount, expires;
-       int expires_seq;
+       u64 amount = 0, min_amount;
 
        /* note: this is a positive sum as runtime_remaining <= 0 */
        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4411,61 +4407,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                        cfs_b->idle = 0;
                }
        }
-       expires_seq = cfs_b->expires_seq;
-       expires = cfs_b->runtime_expires;
        raw_spin_unlock(&cfs_b->lock);
 
        cfs_rq->runtime_remaining += amount;
-       /*
-        * we may have advanced our local expiration to account for allowed
-        * spread between our sched_clock and the one on which runtime was
-        * issued.
-        */
-       if (cfs_rq->expires_seq != expires_seq) {
-               cfs_rq->expires_seq = expires_seq;
-               cfs_rq->runtime_expires = expires;
-       }
 
        return cfs_rq->runtime_remaining > 0;
 }
 
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-       /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
-               return;
-
-       if (cfs_rq->runtime_remaining < 0)
-               return;
-
-       /*
-        * If the local deadline has passed we have to consider the
-        * possibility that our sched_clock is 'fast' and the global deadline
-        * has not truly expired.
-        *
-        * Fortunately we can check determine whether this the case by checking
-        * whether the global deadline(cfs_b->expires_seq) has advanced.
-        */
-       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
-               /* extend local deadline, drift is bounded above by 2 ticks */
-               cfs_rq->runtime_expires += TICK_NSEC;
-       } else {
-               /* global deadline is ahead, expiration has passed */
-               cfs_rq->runtime_remaining = 0;
-       }
-}
-
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
        cfs_rq->runtime_remaining -= delta_exec;
-       expire_cfs_rq_runtime(cfs_rq);
 
        if (likely(cfs_rq->runtime_remaining > 0))
                return;
@@ -4556,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        struct rq *rq = rq_of(cfs_rq);
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
-       long task_delta, dequeue = 1;
+       long task_delta, idle_task_delta, dequeue = 1;
        bool empty;
 
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4567,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        rcu_read_unlock();
 
        task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                /* throttled entity or throttle-on-deactivate */
@@ -4576,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                if (dequeue)
                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                qcfs_rq->h_nr_running -= task_delta;
+               qcfs_rq->idle_h_nr_running -= idle_task_delta;
 
                if (qcfs_rq->load.weight)
                        dequeue = 0;
@@ -4615,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        int enqueue = 1;
-       long task_delta;
+       long task_delta, idle_task_delta;
 
        se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -4635,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                return;
 
        task_delta = cfs_rq->h_nr_running;
+       idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
                if (se->on_rq)
                        enqueue = 0;
@@ -4643,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                if (enqueue)
                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                cfs_rq->h_nr_running += task_delta;
+               cfs_rq->idle_h_nr_running += idle_task_delta;
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4658,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
                resched_curr(rq);
 }
 
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-               u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
 {
        struct cfs_rq *cfs_rq;
        u64 runtime;
@@ -4684,7 +4639,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                remaining -= runtime;
 
                cfs_rq->runtime_remaining += runtime;
-               cfs_rq->runtime_expires = expires;
 
                /* we check whether we're throttled above */
                if (cfs_rq->runtime_remaining > 0)
@@ -4709,7 +4663,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
  */
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
 {
-       u64 runtime, runtime_expires;
+       u64 runtime;
        int throttled;
 
        /* no need to continue the timer with no bandwidth constraint */
@@ -4737,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        /* account preceding periods in which throttling occurred */
        cfs_b->nr_throttled += overrun;
 
-       runtime_expires = cfs_b->runtime_expires;
-
        /*
         * This check is repeated as we are holding onto the new bandwidth while
         * we unthrottle. This can potentially race with an unthrottled group
@@ -4751,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
                cfs_b->distribute_running = 1;
                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
-               runtime = distribute_cfs_runtime(cfs_b, runtime,
-                                                runtime_expires);
+               runtime = distribute_cfs_runtime(cfs_b, runtime);
                raw_spin_lock_irqsave(&cfs_b->lock, flags);
 
                cfs_b->distribute_running = 0;
@@ -4834,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                return;
 
        raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF &&
-           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+       if (cfs_b->quota != RUNTIME_INF) {
                cfs_b->runtime += slack_runtime;
 
                /* we are under rq->lock, defer unthrottling using a timer */
@@ -4868,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
        unsigned long flags;
-       u64 expires;
 
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4886,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                runtime = cfs_b->runtime;
 
-       expires = cfs_b->runtime_expires;
        if (runtime)
                cfs_b->distribute_running = 1;
 
@@ -4895,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        if (!runtime)
                return;
 
-       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+       runtime = distribute_cfs_runtime(cfs_b, runtime);
 
        raw_spin_lock_irqsave(&cfs_b->lock, flags);
-       if (expires == cfs_b->runtime_expires)
-               lsub_positive(&cfs_b->runtime, runtime);
+       lsub_positive(&cfs_b->runtime, runtime);
        cfs_b->distribute_running = 0;
        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
@@ -5056,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
        cfs_b->period_active = 1;
        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-       cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
-       cfs_b->expires_seq++;
        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
@@ -5235,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu);
 
 static inline bool cpu_overutilized(int cpu)
 {
-       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+       return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
 }
 
 static inline void update_overutilized_status(struct rq *rq)
@@ -5259,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+       int idle_h_nr_running = task_has_idle_policy(p);
 
        /*
         * The code below (indirectly) updates schedutil which looks at
@@ -5291,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -5298,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5359,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
        int task_sleep = flags & DEQUEUE_SLEEP;
+       int idle_h_nr_running = task_has_idle_policy(p);
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
@@ -5373,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -5392,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5425,6 +5376,15 @@ static struct {
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
+/* CPU only has SCHED_IDLE tasks enqueued */
+static int sched_idle_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+                       rq->nr_running);
+}
+
 static unsigned long cpu_runnable_load(struct rq *rq)
 {
        return cfs_rq_runnable_load_avg(&rq->cfs);
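Aside (not part of the patch): sched_idle_cpu() reports a CPU that is busy but only with SCHED_IDLE tasks, i.e. rq->nr_running is non-zero and equal to the new hierarchical rq->cfs.idle_h_nr_running count maintained in the enqueue/dequeue and throttle paths above. A toy model with hypothetical toy_* names, just to spell out the predicate:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
	unsigned int nr_running;	/* all runnable tasks on the CPU */
	unsigned int idle_h_nr_running;	/* runnable SCHED_IDLE tasks (hierarchical) */
};

/* True only when the CPU is busy and every runnable task is SCHED_IDLE. */
static bool toy_sched_idle_cpu(const struct toy_rq *rq)
{
	return rq->nr_running && rq->nr_running == rq->idle_h_nr_running;
}

int main(void)
{
	struct toy_rq idle_only = { .nr_running = 3, .idle_h_nr_running = 3 };
	struct toy_rq truly_idle = { .nr_running = 0, .idle_h_nr_running = 0 };

	/* A fully idle CPU is not reported here; the callers below already
	 * prefer it via available_idle_cpu(). */
	printf("%d %d\n", toy_sched_idle_cpu(&idle_only),	/* 1 */
			  toy_sched_idle_cpu(&truly_idle));	/* 0 */
	return 0;
}
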
@@ -5747,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
        unsigned int min_exit_latency = UINT_MAX;
        u64 latest_idle_timestamp = 0;
        int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1;
+       int shallowest_idle_cpu = -1, si_cpu = -1;
        int i;
 
        /* Check if we have any choice: */
@@ -5778,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
                        }
-               } else if (shallowest_idle_cpu == -1) {
+               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+                       if (sched_idle_cpu(i)) {
+                               si_cpu = i;
+                               continue;
+                       }
+
                        load = cpu_runnable_load(cpu_rq(i));
                        if (load < min_load) {
                                min_load = load;
@@ -5787,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                }
        }
 
-       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+       if (shallowest_idle_cpu != -1)
+               return shallowest_idle_cpu;
+       if (si_cpu != -1)
+               return si_cpu;
+       return least_loaded_cpu;
 }
 
 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5940,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
  */
 static int select_idle_smt(struct task_struct *p, int target)
 {
-       int cpu;
+       int cpu, si_cpu = -1;
 
        if (!static_branch_likely(&sched_smt_present))
                return -1;
@@ -5950,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target)
                        continue;
                if (available_idle_cpu(cpu))
                        return cpu;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
        }
 
-       return -1;
+       return si_cpu;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -5980,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
        u64 avg_cost, avg_idle;
        u64 time, cost;
        s64 delta;
-       int cpu, nr = INT_MAX;
        int this = smp_processor_id();
+       int cpu, nr = INT_MAX, si_cpu = -1;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -6009,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
                if (!--nr)
-                       return -1;
+                       return si_cpu;
                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                        continue;
                if (available_idle_cpu(cpu))
                        break;
+               if (si_cpu == -1 && sched_idle_cpu(cpu))
+                       si_cpu = cpu;
        }
 
        time = cpu_clock(this) - time;
@@ -6032,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        struct sched_domain *sd;
        int i, recent_used_cpu;
 
-       if (available_idle_cpu(target))
+       if (available_idle_cpu(target) || sched_idle_cpu(target))
                return target;
 
        /*
         * If the previous CPU is cache affine and idle, don't be stupid:
         */
-       if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+       if (prev != target && cpus_share_cache(prev, target) &&
+           (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                return prev;
 
        /* Check a recently used CPU as a potential idle candidate: */
@@ -6046,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if (recent_used_cpu != prev &&
            recent_used_cpu != target &&
            cpus_share_cache(recent_used_cpu, target) &&
-           available_idle_cpu(recent_used_cpu) &&
+           (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                /*
                 * Replace recent_used_cpu with prev as it is a potential
@@ -6282,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 }
 
 /*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
  * to compute what would be the energy if we decided to actually migrate that
  * task.
  */
 static long
 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 {
-       unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-       unsigned long sum_util, energy = 0;
-       struct task_struct *tsk;
+       struct cpumask *pd_mask = perf_domain_span(pd);
+       unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+       unsigned long max_util = 0, sum_util = 0;
        int cpu;
 
-       for (; pd; pd = pd->next) {
-               struct cpumask *pd_mask = perf_domain_span(pd);
+       /*
+        * The capacity state of CPUs of the current rd can be driven by CPUs
+        * of another rd if they belong to the same pd. So, account for the
+        * utilization of these CPUs too by masking pd with cpu_online_mask
+        * instead of the rd span.
+        *
+        * If an entire pd is outside of the current rd, it will not appear in
+        * its pd list and will not be accounted by compute_energy().
+        */
+       for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+               unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+               struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
 
                /*
-                * The energy model mandates all the CPUs of a performance
-                * domain have the same capacity.
+                * Busy time computation: utilization clamping is not
+                * required since the ratio (sum_util / cpu_capacity)
+                * is already enough to scale the EM reported power
+                * consumption at the (eventually clamped) cpu_capacity.
                 */
-               cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-               max_util = sum_util = 0;
+               sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                              ENERGY_UTIL, NULL);
 
                /*
-                * The capacity state of CPUs of the current rd can be driven by
-                * CPUs of another rd if they belong to the same performance
-                * domain. So, account for the utilization of these CPUs too
-                * by masking pd with cpu_online_mask instead of the rd span.
-                *
-                * If an entire performance domain is outside of the current rd,
-                * it will not appear in its pd list and will not be accounted
-                * by compute_energy().
+                * Performance domain frequency: utilization clamping
+                * must be considered since it affects the selection
+                * of the performance domain frequency.
+                * NOTE: in case RT tasks are running, by default the
+                * FREQUENCY_UTIL's utilization can be max OPP.
                 */
-               for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-                       util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
-                       /*
-                        * Busy time computation: utilization clamping is not
-                        * required since the ratio (sum_util / cpu_capacity)
-                        * is already enough to scale the EM reported power
-                        * consumption at the (eventually clamped) cpu_capacity.
-                        */
-                       sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                      ENERGY_UTIL, NULL);
-
-                       /*
-                        * Performance domain frequency: utilization clamping
-                        * must be considered since it affects the selection
-                        * of the performance domain frequency.
-                        * NOTE: in case RT tasks are running, by default the
-                        * FREQUENCY_UTIL's utilization can be max OPP.
-                        */
-                       tsk = cpu == dst_cpu ? p : NULL;
-                       cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-                                                     FREQUENCY_UTIL, tsk);
-                       max_util = max(max_util, cpu_util);
-               }
-
-               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+               cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+                                             FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, cpu_util);
        }
 
-       return energy;
+       return em_pd_energy(pd->em_pd, max_util, sum_util);
 }
 
 /*
@@ -6386,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
  * other use-cases too. So, until someone finds a better way to solve this,
  * let's keep things simple by re-using the existing slow path.
  */
-
 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
-       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       unsigned long cpu_cap, util, base_energy = 0;
        int cpu, best_energy_cpu = prev_cpu;
-       struct perf_domain *head, *pd;
-       unsigned long cpu_cap, util;
        struct sched_domain *sd;
+       struct perf_domain *pd;
 
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
                goto fail;
-       head = pd;
 
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6417,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                goto unlock;
 
        for (; pd; pd = pd->next) {
-               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
 
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
+
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
@@ -6427,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        /* Skip CPUs that will be overutilized. */
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
-                       if (cpu_cap * 1024 < util * capacity_margin)
+                       if (!fits_capacity(util, cpu_cap))
                                continue;
 
                        /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
-                               prev_energy = compute_energy(p, prev_cpu, head);
-                               best_energy = min(best_energy, prev_energy);
-                               continue;
+                               prev_delta = compute_energy(p, prev_cpu, pd);
+                               prev_delta -= base_energy_pd;
+                               best_delta = min(best_delta, prev_delta);
                        }
 
                        /*
@@ -6450,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
                /* Evaluate the energy impact of using this CPU. */
                if (max_spare_cap_cpu >= 0) {
-                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-                       if (cur_energy < best_energy) {
-                               best_energy = cur_energy;
+                       cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       cur_delta -= base_energy_pd;
+                       if (cur_delta < best_delta) {
+                               best_delta = cur_delta;
                                best_energy_cpu = max_spare_cap_cpu;
                        }
                }
@@ -6464,10 +6428,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
-       if (prev_energy == ULONG_MAX)
+       if (prev_delta == ULONG_MAX)
                return best_energy_cpu;
 
-       if ((prev_energy - best_energy) > (prev_energy >> 4))
+       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
                return best_energy_cpu;
 
        return prev_cpu;
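Aside (not part of the patch): the wake-up decision now works on per-pd energy deltas instead of absolute energies, but the acceptance test stays arithmetically equivalent to the old one: since the absolute prev_cpu energy is base_energy + prev_delta, comparing (prev_delta - best_delta) against (prev_delta + base_energy) >> 4 still requires the candidate to save more than ~6% (1/16) of the total estimated energy. A quick numeric sketch with made-up values:

#include <stdio.h>

int main(void)
{
	/* Hypothetical estimates, in abstract energy units. */
	unsigned long base_energy = 2000;	/* sum over pds, without the waking task */
	unsigned long prev_delta  = 120;	/* extra energy if the task stays on prev_cpu */
	unsigned long best_delta  = 100;	/* extra energy on the best candidate CPU */

	unsigned long saving    = prev_delta - best_delta;		/* 20  */
	unsigned long threshold = (prev_delta + base_energy) >> 4;	/* 132 */

	/* 20 <= 132: the saving is too small, so the task keeps prev_cpu. */
	printf("%s\n", saving > threshold ? "best_energy_cpu" : "prev_cpu");
	return 0;
}
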
@@ -6801,7 +6765,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
                goto idle;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (prev->sched_class != &fair_sched_class)
+       if (!prev || prev->sched_class != &fair_sched_class)
                goto simple;
 
        /*
@@ -6878,8 +6842,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
        goto done;
 simple:
 #endif
-
-       put_prev_task(rq, prev);
+       if (prev)
+               put_prev_task(rq, prev);
 
        do {
                se = pick_next_entity(cfs_rq, NULL);
@@ -6907,11 +6871,13 @@ done: __maybe_unused;
        return p;
 
 idle:
-       update_misfit_status(NULL, rq);
-       new_tasks = idle_balance(rq, rf);
+       if (!rf)
+               return NULL;
+
+       new_tasks = newidle_balance(rq, rf);
 
        /*
-        * Because idle_balance() releases (and re-acquires) rq->lock, it is
+        * Because newidle_balance() releases (and re-acquires) rq->lock, it is
         * possible for any higher priority task to appear. In that case we
         * must re-start the pick_next_entity() loop.
         */
@@ -6933,7 +6899,7 @@ done: __maybe_unused;
 /*
  * Account for a descheduled task:
  */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;
@@ -7435,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
                detached++;
                env->imbalance -= load;
 
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
                /*
                 * NEWIDLE balancing is a source of latency, so preemptible
                 * kernels will stop after the first task is detached to minimize
@@ -7982,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 static inline bool
 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 {
-       return sg->sgc->min_capacity * capacity_margin <
-                                               ref->sgc->min_capacity * 1024;
+       return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
 }
 
 /*
@@ -7993,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 static inline bool
 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 {
-       return sg->sgc->max_capacity * capacity_margin <
-                                               ref->sgc->max_capacity * 1024;
+       return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
 }
 
 static inline enum
@@ -9052,9 +9016,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 out_balanced:
        /*
         * We reach balance although we may have faced some affinity
-        * constraints. Clear the imbalance flag if it was set.
+        * constraints. Clear the imbalance flag only if other tasks got
+        * a chance to move and fix the imbalance.
         */
-       if (sd_parent) {
+       if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
                int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 
                if (*group_imbalance)
@@ -9075,10 +9040,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        ld_moved = 0;
 
        /*
-        * idle_balance() disregards balance intervals, so we could repeatedly
-        * reach this code, which would lead to balance_interval skyrocketting
-        * in a short amount of time. Skip the balance_interval increase logic
-        * to avoid that.
+        * newidle_balance() disregards balance intervals, so we could
+        * repeatedly reach this code, which would lead to balance_interval
+        * skyrocketting in a short amount of time. Skip the balance_interval
+        * increase logic to avoid that.
         */
        if (env.idle == CPU_NEWLY_IDLE)
                goto out;
@@ -9788,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
        unsigned long next_balance = jiffies + HZ;
        int this_cpu = this_rq->cpu;
@@ -9796,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
        int pulled_task = 0;
        u64 curr_cost = 0;
 
+       update_misfit_status(NULL, this_rq);
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
@@ -10180,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
  * This routine is mostly called to set cfs_rq->curr field when a task
  * migrates between groups/classes.
  */
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
 {
-       struct sched_entity *se = &rq->curr->se;
+       struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+       if (task_on_rq_queued(p)) {
+               /*
+                * Move the next running task to the front of the list, so our
+                * cfs_tasks list becomes MRU one.
+                */
+               list_move(&se->group_node, &rq->cfs_tasks);
+       }
+#endif
 
        for_each_sched_entity(se) {
                struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10300,18 +10276,18 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 void online_fair_sched_group(struct task_group *tg)
 {
        struct sched_entity *se;
+       struct rq_flags rf;
        struct rq *rq;
        int i;
 
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
                se = tg->se[i];
-
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                update_rq_clock(rq);
                attach_entity_cfs_rq(se);
                sync_throttle(tg, i);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
 }
 
@@ -10453,7 +10429,9 @@ const struct sched_class fair_sched_class = {
        .check_preempt_curr     = check_preempt_wakeup,
 
        .pick_next_task         = pick_next_task_fair,
+
        .put_prev_task          = put_prev_task_fair,
+       .set_next_task          = set_next_task_fair,
 
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
@@ -10466,7 +10444,6 @@ const struct sched_class fair_sched_class = {
        .set_cpus_allowed       = set_cpus_allowed_common,
 #endif
 
-       .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
        .task_fork              = task_fork_fair,