sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01974850ddeac7b7ccd6b782ea897b..3c8a379c357e5f9686b376ab87a6a156da8db655 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
                 * For !fair tasks do:
                 *
                update_cfs_rq_load_avg(now, cfs_rq);
-               attach_entity_load_avg(cfs_rq, se, 0);
+               attach_entity_load_avg(cfs_rq, se);
                switched_from_fair(rq, p);
                 *
                 * such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 {
        struct rq *rq = rq_of(cfs_rq);
 
-       if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+       if (&rq->cfs == cfs_rq) {
                /*
                 * There are a few boundary cases this might miss but it should
                 * get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
        runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
        runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
-       delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
-       delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
-       se->avg.runnable_load_sum = runnable_sum;
-       se->avg.runnable_load_avg = runnable_load_avg;
 
        if (se->on_rq) {
+               delta_sum = runnable_load_sum -
+                               se_weight(se) * se->avg.runnable_load_sum;
+               delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
                add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
                add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
        }
+
+       se->avg.runnable_load_sum = runnable_sum;
+       se->avg.runnable_load_avg = runnable_load_avg;
 }
 
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3515,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
  * @se: sched_entity to attach
- * @flags: migration hints
  *
  * Must call update_cfs_rq_load_avg() before this, since we rely on
  * cfs_rq->avg.last_update_time being current.
  */
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
@@ -3556,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
-       cfs_rq_util_change(cfs_rq, flags);
+       cfs_rq_util_change(cfs_rq, 0);
 
        trace_pelt_cfs_tp(cfs_rq);
 }
@@ -3614,7 +3614,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                 *
                 * IOW we're enqueueing a task on a new CPU.
                 */
-               attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+               attach_entity_load_avg(cfs_rq, se);
                update_tg_load_avg(cfs_rq, 0);
 
        } else if (decayed) {
@@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
        return max(task_util(p), _task_util_est(p));
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+       return clamp(task_util_est(p),
+                    uclamp_eff_value(p, UCLAMP_MIN),
+                    uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+       return task_util_est(p);
+}
+#endif
+
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                                    struct task_struct *p)
 {
@@ -3822,7 +3836,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 
 static inline int task_fits_capacity(struct task_struct *p, long capacity)
 {
-       return fits_capacity(task_util_est(p), capacity);
+       return fits_capacity(uclamp_task_util(p), capacity);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
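
The two hunks above make task placement uclamp-aware: uclamp_task_util()
clamps the util_est value by the task's effective uclamp min/max, and
task_fits_capacity() then compares that clamped value against the CPU's
capacity. A minimal standalone sketch of the combined check follows; it is
not part of the patch, and the 1280/1024 (~80%) margin mirrors how
fits_capacity() is defined elsewhere in fair.c, so treat the exact constant
as an assumption.

/* Standalone sketch; simplified types, illustrative names. */
static int sketch_task_fits_capacity(unsigned long util_est,
				     unsigned long uclamp_min,
				     unsigned long uclamp_max,
				     unsigned long cpu_capacity)
{
	unsigned long util = util_est;

	/* Mirror uclamp_task_util(): clamp the estimated utilization. */
	if (util < uclamp_min)
		util = uclamp_min;
	if (util > uclamp_max)
		util = uclamp_max;

	/* Fits only if roughly 20% headroom remains on the CPU. */
	return util * 1280 < cpu_capacity * 1024;
}
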
@@ -3857,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
@@ -5196,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq)
 static inline void update_overutilized_status(struct rq *rq) { }
 #endif
 
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+                       rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+       return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -5310,6 +5338,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct sched_entity *se = &p->se;
        int task_sleep = flags & DEQUEUE_SLEEP;
        int idle_h_nr_running = task_has_idle_policy(p);
+       bool was_sched_idle = sched_idle_rq(rq);
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        if (!se)
                sub_nr_running(rq, 1);
 
+       /* balance early to pull high priority tasks */
+       if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+               rq->next_balance = jiffies;
+
        util_est_dequeue(&rq->cfs, p, task_sleep);
        hrtick_update(rq);
 }
@@ -5378,15 +5411,6 @@ static struct {
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
-                       rq->nr_running);
-}
-
 static unsigned long cpu_load(struct rq *rq)
 {
        return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5612,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
        unsigned int min_exit_latency = UINT_MAX;
        u64 latest_idle_timestamp = 0;
        int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1, si_cpu = -1;
+       int shallowest_idle_cpu = -1;
        int i;
 
        /* Check if we have any choice: */
@@ -5597,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+               if (sched_idle_cpu(i))
+                       return i;
+
                if (available_idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5646,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
                        }
-               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
-                       if (sched_idle_cpu(i)) {
-                               si_cpu = i;
-                               continue;
-                       }
-
+               } else if (shallowest_idle_cpu == -1) {
                        load = cpu_load(cpu_rq(i));
                        if (load < min_load) {
                                min_load = load;
@@ -5633,11 +5655,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                }
        }
 
-       if (shallowest_idle_cpu != -1)
-               return shallowest_idle_cpu;
-       if (si_cpu != -1)
-               return si_cpu;
-       return least_loaded_cpu;
+       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5808,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
  */
 static int select_idle_smt(struct task_struct *p, int target)
 {
-       int cpu, si_cpu = -1;
+       int cpu;
 
        if (!static_branch_likely(&sched_smt_present))
                return -1;
@@ -5798,13 +5816,11 @@ static int select_idle_smt(struct task_struct *p, int target)
        for_each_cpu(cpu, cpu_smt_mask(target)) {
                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                        continue;
-               if (available_idle_cpu(cpu))
+               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        return cpu;
-               if (si_cpu == -1 && sched_idle_cpu(cpu))
-                       si_cpu = cpu;
        }
 
-       return si_cpu;
+       return -1;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        struct sched_domain *this_sd;
        u64 avg_cost, avg_idle;
        u64 time, cost;
        s64 delta;
        int this = smp_processor_id();
-       int cpu, nr = INT_MAX, si_cpu = -1;
+       int cpu, nr = INT_MAX;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -5859,15 +5876,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        time = cpu_clock(this);
 
-       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+       for_each_cpu_wrap(cpu, cpus, target) {
                if (!--nr)
-                       return si_cpu;
-               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
-                       continue;
-               if (available_idle_cpu(cpu))
+                       return -1;
+               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        break;
-               if (si_cpu == -1 && sched_idle_cpu(cpu))
-                       si_cpu = cpu;
        }
 
        time = cpu_clock(this) - time;
@@ -5896,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                return prev;
 
+       /*
+        * Allow a per-cpu kthread to stack with the wakee if the
+        * kworker's CPU and the task's previous CPU are the same.
+        * The assumption is that the wakee queued work for the
+        * per-cpu kthread that is now complete and the wakeup is
+        * essentially a sync wakeup. An obvious example of this
+        * pattern is IO completions.
+        */
+       if (is_per_cpu_kthread(current) &&
+           prev == smp_processor_id() &&
+           this_rq()->nr_running <= 1) {
+               return prev;
+       }
+
        /* Check a recently used CPU as a potential idle candidate: */
        recent_used_cpu = p->recent_used_cpu;
        if (recent_used_cpu != prev &&
@@ -6268,9 +6297,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
 
-                       /* Skip CPUs that will be overutilized. */
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
+                       spare_cap = cpu_cap - util;
+
+                       /*
+                        * Skip CPUs that cannot satisfy the capacity request.
+                        * IOW, placing the task there would make the CPU
+                        * overutilized. Take uclamp into account to see how
+                        * much capacity we can get out of the CPU; this is
+                        * aligned with schedutil_cpu_util().
+                        */
+                       util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
                        if (!fits_capacity(util, cpu_cap))
                                continue;
 
@@ -6285,7 +6323,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                         * Find the CPU with the maximum spare capacity in
                         * the performance domain
                         */
-                       spare_cap = cpu_cap - util;
                        if (spare_cap > max_spare_cap) {
                                max_spare_cap = spare_cap;
                                max_spare_cap_cpu = cpu;
@@ -7328,7 +7365,14 @@ static int detach_tasks(struct lb_env *env)
                            load < 16 && !env->sd->nr_balance_failed)
                                goto next;
 
-                       if (load/2 > env->imbalance)
+                       /*
+                        * Make sure that we don't migrate too much load.
+                        * Nevertheless, relax the constraint if the
+                        * scheduler fails to find a good waiting task
+                        * to migrate.
+                        */
+                       if (load/2 > env->imbalance &&
+                           env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
                                goto next;
 
                        env->imbalance -= load;
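
Put plainly, the relaxed rule above skips a task whose load is more than
twice the remaining imbalance, but only while the domain has not yet burned
through its cache_nice_tries worth of failed balance attempts; after that
the heavy task is migrated anyway. A standalone sketch of the predicate
(illustrative names, not the kernel's lb_env):

/* Return 1 if the task should be skipped as "too heavy" for this pass. */
static int sketch_too_heavy(unsigned long load, unsigned long imbalance,
			    unsigned int nr_balance_failed,
			    unsigned int cache_nice_tries)
{
	return load / 2 > imbalance &&
	       nr_balance_failed <= cache_nice_tries;
}
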
@@ -7773,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 */
 
                for_each_cpu(cpu, sched_group_span(sdg)) {
-                       struct sched_group_capacity *sgc;
-                       struct rq *rq = cpu_rq(cpu);
+                       unsigned long cpu_cap = capacity_of(cpu);
 
-                       /*
-                        * build_sched_domains() -> init_sched_groups_capacity()
-                        * gets here before we've attached the domains to the
-                        * runqueues.
-                        *
-                        * Use capacity_of(), which is set irrespective of domains
-                        * in update_cpu_capacity().
-                        *
-                        * This avoids capacity from being 0 and
-                        * causing divide-by-zero issues on boot.
-                        */
-                       if (unlikely(!rq->sd)) {
-                               capacity += capacity_of(cpu);
-                       } else {
-                               sgc = rq->sd->groups->sgc;
-                               capacity += sgc->capacity;
-                       }
-
-                       min_capacity = min(capacity, min_capacity);
-                       max_capacity = max(capacity, max_capacity);
+                       capacity += cpu_cap;
+                       min_capacity = min(cpu_cap, min_capacity);
+                       max_capacity = max(cpu_cap, max_capacity);
                }
        } else  {
                /*
@@ -8161,14 +8187,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 
        case group_has_spare:
                /*
-                * Select not overloaded group with lowest number of
-                * idle cpus. We could also compare the spare capacity
-                * which is more stable but it can end up that the
-                * group has less spare capacity but finally more idle
+                * Select the non-overloaded group with the lowest number of
+                * idle CPUs and the highest number of running tasks. We could
+                * also compare the spare capacity, which is more stable, but a
+                * group can end up with less spare capacity yet more idle
                 * CPUs which means less opportunity to pull tasks.
                 */
-               if (sgs->idle_cpus >= busiest->idle_cpus)
+               if (sgs->idle_cpus > busiest->idle_cpus)
+                       return false;
+               else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+                        (sgs->sum_nr_running <= busiest->sum_nr_running))
                        return false;
+
                break;
        }
 
@@ -8417,6 +8447,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
        if (!idlest)
                return NULL;
 
+       /* The local group has been skipped because of CPU affinity */
+       if (!local)
+               return idlest;
+
        /*
         * If the local group is idler than the selected idlest group
         * don't try and push the task.
@@ -8637,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
        /*
         * Try to use spare capacity of local group without overloading it or
         * emptying busiest.
-        * XXX Spreading tasks across NUMA nodes is not always the best policy
-        * and special care should be taken for SD_NUMA domain level before
-        * spreading the tasks. For now, load_balance() fully relies on
-        * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
         */
        if (local->group_type == group_has_spare) {
                if (busiest->group_type > group_fully_busy) {
@@ -8680,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                        env->migration_type = migrate_task;
                        lsub_positive(&nr_diff, local->sum_nr_running);
                        env->imbalance = nr_diff >> 1;
-                       return;
-               }
+               } else {
 
-               /*
-                * If there is no overload, we just want to even the number of
-                * idle cpus.
-                */
-               env->migration_type = migrate_task;
-               env->imbalance = max_t(long, 0, (local->idle_cpus -
+                       /*
+                        * If there is no overload, we just want to even the number of
+                        * idle cpus.
+                        */
+                       env->migration_type = migrate_task;
+                       env->imbalance = max_t(long, 0, (local->idle_cpus -
                                                 busiest->idle_cpus) >> 1);
+               }
+
+               /* Consider allowing a small imbalance between NUMA groups */
+               if (env->sd->flags & SD_NUMA) {
+                       unsigned int imbalance_min;
+
+                       /*
+                        * Compute an allowed imbalance based on a simple
+                        * pair of communicating tasks that should remain
+                        * local and ignore them.
+                        *
+                        * NOTE: Generally this would have been based on
+                        * the domain size and this was evaluated. However,
+                        * the benefit is similar across a range of workloads
+                        * and machines but scaling by the domain size adds
+                        * the risk that lower domains have to be rebalanced.
+                        */
+                       imbalance_min = 2;
+                       if (busiest->sum_nr_running <= imbalance_min)
+                               env->imbalance = 0;
+               }
+
                return;
        }
 
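
The SD_NUMA branch added above deliberately tolerates a small imbalance:
when the busiest NUMA group runs no more than a pair of tasks
(imbalance_min = 2), the computed imbalance is zeroed so that a pair of
communicating tasks is not split across nodes. A condensed standalone
illustration (the helper name and flag parameter are hypothetical):

static long sketch_allow_numa_imbalance(long imbalance,
					unsigned int busiest_nr_running,
					int is_numa_domain)
{
	const unsigned int imbalance_min = 2;	/* one communicating pair */

	if (is_numa_domain && busiest_nr_running <= imbalance_min)
		return 0;	/* keep the pair on its node; skip balancing */

	return imbalance;
}
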
@@ -9518,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
        int continue_balancing = 1;
        int cpu = rq->cpu;
+       int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
        unsigned long interval;
        struct sched_domain *sd;
        /* Earliest time when we have to do rebalance again */
@@ -9554,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                        break;
                }
 
-               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+               interval = get_sd_balance_interval(sd, busy);
 
                need_serialize = sd->flags & SD_SERIALIZE;
                if (need_serialize) {
@@ -9570,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                                 * state even if we migrated tasks. Update it.
                                 */
                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+                               busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
                        }
                        sd->last_balance = jiffies;
-                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+                       interval = get_sd_balance_interval(sd, busy);
                }
                if (need_serialize)
                        spin_unlock(&balancing);
@@ -10322,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
        if (!task_on_rq_queued(p))
                return;
 
+       if (rq->cfs.nr_running == 1)
+               return;
+
        /*
         * Reschedule if we are currently running on this runqueue and
         * our priority decreased, or if we are not currently running on
@@ -10412,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 
        /* Synchronize entity with its cfs_rq */
        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
-       attach_entity_load_avg(cfs_rq, se, 0);
+       attach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
        propagate_entity_cfs_rq(se);
 }