sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01974850ddeac7b7ccd6b782ea897b..3c8a379c357e5f9686b376ab87a6a156da8db655 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
                 * For !fair tasks do:
                 *
                update_cfs_rq_load_avg(now, cfs_rq);
-               attach_entity_load_avg(cfs_rq, se, 0);
+               attach_entity_load_avg(cfs_rq, se);
                switched_from_fair(rq, p);
                 *
                 * such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 {
        struct rq *rq = rq_of(cfs_rq);
 
-       if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+       if (&rq->cfs == cfs_rq) {
                /*
                 * There are a few boundary cases this might miss but it should
                 * get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
        runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
        runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
-       delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
-       delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
-       se->avg.runnable_load_sum = runnable_sum;
-       se->avg.runnable_load_avg = runnable_load_avg;
 
        if (se->on_rq) {
+               delta_sum = runnable_load_sum -
+                               se_weight(se) * se->avg.runnable_load_sum;
+               delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
                add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
                add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
        }
+
+       se->avg.runnable_load_sum = runnable_sum;
+       se->avg.runnable_load_avg = runnable_load_avg;
 }
 
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3515,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
  * @se: sched_entity to attach
- * @flags: migration hints
  *
  * Must call update_cfs_rq_load_avg() before this, since we rely on
  * cfs_rq->avg.last_update_time being current.
  */
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
@@ -3556,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
-       cfs_rq_util_change(cfs_rq, flags);
+       cfs_rq_util_change(cfs_rq, 0);
 
        trace_pelt_cfs_tp(cfs_rq);
 }
@@ -3614,7 +3614,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                 *
                 * IOW we're enqueueing a task on a new CPU.
                 */
-               attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+               attach_entity_load_avg(cfs_rq, se);
                update_tg_load_avg(cfs_rq, 0);
 
        } else if (decayed) {
@@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
        return max(task_util(p), _task_util_est(p));
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+       return clamp(task_util_est(p),
+                    uclamp_eff_value(p, UCLAMP_MIN),
+                    uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+       return task_util_est(p);
+}
+#endif
+
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                                    struct task_struct *p)
 {
@@ -3822,7 +3836,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 
 static inline int task_fits_capacity(struct task_struct *p, long capacity)
 {
-       return fits_capacity(task_util_est(p), capacity);
+       return fits_capacity(uclamp_task_util(p), capacity);
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
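
The two hunks above make task placement uclamp-aware: uclamp_task_util()
clamps the util_est value by the task's effective uclamp min/max, and
task_fits_capacity() then compares that clamped value against the CPU's
capacity. A minimal standalone sketch of the combined check follows; it is
not part of the patch, and the 1280/1024 (~80%) margin mirrors how
fits_capacity() is defined elsewhere in fair.c, so treat the exact constant
as an assumption.

/* Standalone sketch; simplified types, illustrative names. */
static int sketch_task_fits_capacity(unsigned long util_est,
				     unsigned long uclamp_min,
				     unsigned long uclamp_max,
				     unsigned long cpu_capacity)
{
	unsigned long util = util_est;

	/* Mirror uclamp_task_util(): clamp the estimated utilization. */
	if (util < uclamp_min)
		util = uclamp_min;
	if (util > uclamp_max)
		util = uclamp_max;

	/* Fits only if roughly 20% headroom remains on the CPU. */
	return util * 1280 < cpu_capacity * 1024;
}
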
@@ -3857,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
@@ -5196,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq)
 static inline void update_overutilized_status(struct rq *rq) { }
 #endif
 
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+                       rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+       return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -5310,6 +5338,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct sched_entity *se = &p->se;
        int task_sleep = flags & DEQUEUE_SLEEP;
        int idle_h_nr_running = task_has_idle_policy(p);
+       bool was_sched_idle = sched_idle_rq(rq);
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        if (!se)
                sub_nr_running(rq, 1);
 
+       /* balance early to pull high priority tasks */
+       if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+               rq->next_balance = jiffies;
+
        util_est_dequeue(&rq->cfs, p, task_sleep);
        hrtick_update(rq);
 }
@@ -5378,15 +5411,6 @@ static struct {
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
-                       rq->nr_running);
-}
-
 static unsigned long cpu_load(struct rq *rq)
 {
        return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5612,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
        unsigned int min_exit_latency = UINT_MAX;
        u64 latest_idle_timestamp = 0;
        int least_loaded_cpu = this_cpu;
-       int shallowest_idle_cpu = -1, si_cpu = -1;
+       int shallowest_idle_cpu = -1;
        int i;
 
        /* Check if we have any choice: */
@@ -5597,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+               if (sched_idle_cpu(i))
+                       return i;
+
                if (available_idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5646,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
                        }
-               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
-                       if (sched_idle_cpu(i)) {
-                               si_cpu = i;
-                               continue;
-                       }
-
+               } else if (shallowest_idle_cpu == -1) {
                        load = cpu_load(cpu_rq(i));
                        if (load < min_load) {
                                min_load = load;
@@ -5633,11 +5655,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
                }
        }
 
-       if (shallowest_idle_cpu != -1)
-               return shallowest_idle_cpu;
-       if (si_cpu != -1)
-               return si_cpu;
-       return least_loaded_cpu;
+       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5808,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
  */
 static int select_idle_smt(struct task_struct *p, int target)
 {
-       int cpu, si_cpu = -1;
+       int cpu;
 
        if (!static_branch_likely(&sched_smt_present))
                return -1;
@@ -5798,13 +5816,11 @@ static int select_idle_smt(struct task_struct *p, int target)
        for_each_cpu(cpu, cpu_smt_mask(target)) {
                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                        continue;
-               if (available_idle_cpu(cpu))
+               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        return cpu;
-               if (si_cpu == -1 && sched_idle_cpu(cpu))
-                       si_cpu = cpu;
        }
 
-       return si_cpu;
+       return -1;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        struct sched_domain *this_sd;
        u64 avg_cost, avg_idle;
        u64 time, cost;
        s64 delta;
        int this = smp_processor_id();
-       int cpu, nr = INT_MAX, si_cpu = -1;
+       int cpu, nr = INT_MAX;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -5859,15 +5876,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
        time = cpu_clock(this);
 
-       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+       for_each_cpu_wrap(cpu, cpus, target) {
                if (!--nr)
-                       return si_cpu;
-               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
-                       continue;
-               if (available_idle_cpu(cpu))
+                       return -1;
+               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        break;
-               if (si_cpu == -1 && sched_idle_cpu(cpu))
-                       si_cpu = cpu;
        }
 
        time = cpu_clock(this) - time;
@@ -5896,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                return prev;
 
+       /*
+        * Allow a per-cpu kthread to stack with the wakee if the
+        * kworker's CPU and the task's previous CPU are the same.
+        * The assumption is that the wakee queued work for the
+        * per-cpu kthread that is now complete and the wakeup is
+        * essentially a sync wakeup. An obvious example of this
+        * pattern is IO completions.
+        */
+       if (is_per_cpu_kthread(current) &&
+           prev == smp_processor_id() &&
+           this_rq()->nr_running <= 1) {
+               return prev;
+       }
+
        /* Check a recently used CPU as a potential idle candidate: */
        recent_used_cpu = p->recent_used_cpu;
        if (recent_used_cpu != prev &&
@@ -6268,9 +6297,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
 
-                       /* Skip CPUs that will be overutilized. */
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
+                       spare_cap = cpu_cap - util;
+
+                       /*
+                        * Skip CPUs that cannot satisfy the capacity request.
+                        * IOW, placing the task there would make the CPU
+                        * overutilized. Take uclamp into account to see how
+                        * much capacity we can get out of the CPU; this is
+                        * aligned with schedutil_cpu_util().
+                        */
+                       util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
                        if (!fits_capacity(util, cpu_cap))
                                continue;
 
@@ -6285,7 +6323,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                         * Find the CPU with the maximum spare capacity in
                         * the performance domain
                         */
-                       spare_cap = cpu_cap - util;
                        if (spare_cap > max_spare_cap) {
                                max_spare_cap = spare_cap;
                                max_spare_cap_cpu = cpu;
@@ -7328,7 +7365,14 @@ static int detach_tasks(struct lb_env *env)
                            load < 16 && !env->sd->nr_balance_failed)
                                goto next;
 
-                       if (load/2 > env->imbalance)
+                       /*
+                        * Make sure that we don't migrate too much load.
+                        * Nevertheless, relax the constraint if the
+                        * scheduler fails to find a good waiting task
+                        * to migrate.
+                        */
+                       if (load/2 > env->imbalance &&
+                           env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
                                goto next;
 
                        env->imbalance -= load;
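
Put plainly, the relaxed rule above skips a task whose load is more than
twice the remaining imbalance, but only while the domain has not yet burned
through its cache_nice_tries worth of failed balance attempts; after that
the heavy task is migrated anyway. A standalone sketch of the predicate
(illustrative names, not the kernel's lb_env):

/* Return 1 if the task should be skipped as "too heavy" for this pass. */
static int sketch_too_heavy(unsigned long load, unsigned long imbalance,
			    unsigned int nr_balance_failed,
			    unsigned int cache_nice_tries)
{
	return load / 2 > imbalance &&
	       nr_balance_failed <= cache_nice_tries;
}
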
@@ -7773,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 */
 
                for_each_cpu(cpu, sched_group_span(sdg)) {
-                       struct sched_group_capacity *sgc;
-                       struct rq *rq = cpu_rq(cpu);
+                       unsigned long cpu_cap = capacity_of(cpu);
 
-                       /*
-                        * build_sched_domains() -> init_sched_groups_capacity()
-                        * gets here before we've attached the domains to the
-                        * runqueues.
-                        *
-                        * Use capacity_of(), which is set irrespective of domains
-                        * in update_cpu_capacity().
-                        *
-                        * This avoids capacity from being 0 and
-                        * causing divide-by-zero issues on boot.
-                        */
-                       if (unlikely(!rq->sd)) {
-                               capacity += capacity_of(cpu);
-                       } else {
-                               sgc = rq->sd->groups->sgc;
-                               capacity += sgc->capacity;
-                       }
-
-                       min_capacity = min(capacity, min_capacity);
-                       max_capacity = max(capacity, max_capacity);
+                       capacity += cpu_cap;
+                       min_capacity = min(cpu_cap, min_capacity);
+                       max_capacity = max(cpu_cap, max_capacity);
                }
        } else  {
                /*
@@ -8161,14 +8187,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 
        case group_has_spare:
                /*
-                * Select not overloaded group with lowest number of
-                * idle cpus. We could also compare the spare capacity
-                * which is more stable but it can end up that the
-                * group has less spare capacity but finally more idle
+                * Select the non-overloaded group with the lowest number of
+                * idle CPUs and the highest number of running tasks. We could
+                * also compare the spare capacity, which is more stable, but a
+                * group can end up with less spare capacity yet more idle
                 * CPUs which means less opportunity to pull tasks.
                 */
-               if (sgs->idle_cpus >= busiest->idle_cpus)
+               if (sgs->idle_cpus > busiest->idle_cpus)
+                       return false;
+               else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+                        (sgs->sum_nr_running <= busiest->sum_nr_running))
                        return false;
+
                break;
        }
 
@@ -8417,6 +8447,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
        if (!idlest)
                return NULL;
 
+       /* The local group has been skipped because of CPU affinity */
+       if (!local)
+               return idlest;
+
        /*
         * If the local group is idler than the selected idlest group
         * don't try and push the task.
@@ -8637,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
        /*
         * Try to use spare capacity of local group without overloading it or
         * emptying busiest.
-        * XXX Spreading tasks across NUMA nodes is not always the best policy
-        * and special care should be taken for SD_NUMA domain level before
-        * spreading the tasks. For now, load_balance() fully relies on
-        * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
         */
        if (local->group_type == group_has_spare) {
                if (busiest->group_type > group_fully_busy) {
@@ -8680,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                        env->migration_type = migrate_task;
                        lsub_positive(&nr_diff, local->sum_nr_running);
                        env->imbalance = nr_diff >> 1;
-                       return;
-               }
+               } else {
 
-               /*
-                * If there is no overload, we just want to even the number of
-                * idle cpus.
-                */
-               env->migration_type = migrate_task;
-               env->imbalance = max_t(long, 0, (local->idle_cpus -
+                       /*
+                        * If there is no overload, we just want to even the number of
+                        * idle cpus.
+                        */
+                       env->migration_type = migrate_task;
+                       env->imbalance = max_t(long, 0, (local->idle_cpus -
                                                 busiest->idle_cpus) >> 1);
+               }
+
+               /* Consider allowing a small imbalance between NUMA groups */
+               if (env->sd->flags & SD_NUMA) {
+                       unsigned int imbalance_min;
+
+                       /*
+                        * Compute an allowed imbalance based on a simple
+                        * pair of communicating tasks that should remain
+                        * local and ignore them.
+                        *
+                        * NOTE: Generally this would have been based on
+                        * the domain size and this was evaluated. However,
+                        * the benefit is similar across a range of workloads
+                        * and machines but scaling by the domain size adds
+                        * the risk that lower domains have to be rebalanced.
+                        */
+                       imbalance_min = 2;
+                       if (busiest->sum_nr_running <= imbalance_min)
+                               env->imbalance = 0;
+               }
+
                return;
        }
 
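
The SD_NUMA branch added above deliberately tolerates a small imbalance:
when the busiest NUMA group runs no more than a pair of tasks
(imbalance_min = 2), the computed imbalance is zeroed so that a pair of
communicating tasks is not split across nodes. A condensed standalone
illustration (the helper name and flag parameter are hypothetical):

static long sketch_allow_numa_imbalance(long imbalance,
					unsigned int busiest_nr_running,
					int is_numa_domain)
{
	const unsigned int imbalance_min = 2;	/* one communicating pair */

	if (is_numa_domain && busiest_nr_running <= imbalance_min)
		return 0;	/* keep the pair on its node; skip balancing */

	return imbalance;
}
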
@@ -9518,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
        int continue_balancing = 1;
        int cpu = rq->cpu;
+       int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
        unsigned long interval;
        struct sched_domain *sd;
        /* Earliest time when we have to do rebalance again */
@@ -9554,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                        break;
                }
 
-               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+               interval = get_sd_balance_interval(sd, busy);
 
                need_serialize = sd->flags & SD_SERIALIZE;
                if (need_serialize) {
@@ -9570,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                                 * state even if we migrated tasks. Update it.
                                 */
                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+                               busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
                        }
                        sd->last_balance = jiffies;
-                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+                       interval = get_sd_balance_interval(sd, busy);
                }
                if (need_serialize)
                        spin_unlock(&balancing);
@@ -10322,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
        if (!task_on_rq_queued(p))
                return;
 
+       if (rq->cfs.nr_running == 1)
+               return;
+
        /*
         * Reschedule if we are currently running on this runqueue and
         * our priority decreased, or if we are not currently running on
@@ -10412,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 
        /* Synchronize entity with its cfs_rq */
        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
-       attach_entity_load_avg(cfs_rq, se, 0);
+       attach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
        propagate_entity_cfs_rq(se);
 }