Merge branch 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index c32ac071c203cc6b0cca5aef81aace665a8e09df..06961b997ed6d8c13ced5558520f75b07c85aedc 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -810,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@ -826,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
  }
  
  static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
  {
         /*
@@ -842,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
         return uclamp_none(UCLAMP_MIN);
  }
  
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
  {
         /* Reset max-clamp retention only on idle exit */
@@ -853,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
  }
  
  static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-                                unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+                                  unsigned int clamp_value)
  {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@ -874,7 +874,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
  }
  
  static inline struct uclamp_se
-uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id)
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_req = p->uclamp_req[clamp_id];
  #ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -906,7 +906,7 @@ uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id)
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
@@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
         return uc_req;
  }
  
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
         struct uclamp_se uc_eff;
  
@@ -942,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -980,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-                                   unsigned int clamp_id)
+                                   enum uclamp_id clamp_id)
  {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -1019,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -1034,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@ -1043,6 +1043,54 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                 uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+/* Refresh the rq clamp bucket accounting of @p for @clamp_id, if it is active. */
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * Lock the task and the rq where the task is (or was) queued.
+        * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+        * price to pay to safely serialize util_{min,max} updates with
+        * enqueues, dequeues and migration operations.
+        * This is the same locking schema used by __set_cpus_allowed_ptr().
+        */
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Setting the clamp bucket is serialized by task_rq_lock().
+        * Only a task whose clamp is active (affecting an rq clamp bucket)
+        * must be re-accounted here; a task that is not yet RUNNABLE will
+        * see the updated clamp bucket value the next time it's enqueued.
+        */
+       if (p->uclamp[clamp_id].active) {
+               uclamp_rq_dec_id(rq, p, clamp_id);
+               uclamp_rq_inc_id(rq, p, clamp_id);
+       }
+
+       task_rq_unlock(rq, p, &rf);
+}
+
+/* Update, for all tasks of @css, the clamps whose bit is set in @clamps. */
+static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+                                              unsigned int clamps)
+{
+       struct css_task_iter iter;
+       struct task_struct *task;
+       enum uclamp_id id;
+
+       css_task_iter_start(css, 0, &iter);
+       while ((task = css_task_iter_next(&iter))) {
+               for_each_clamp_id(id) {
+                       if (clamps & (0x1 << id))
+                               uclamp_update_active(task, id);
+               }
+       }
+       css_task_iter_end(&iter);
+}
+
  #ifdef CONFIG_UCLAMP_TASK_GROUP
  static void cpu_util_update_eff(struct cgroup_subsys_state *css);
  static void uclamp_update_root_tg(void)
@@ -1139,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
  static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         /*
          * On scheduling class change, reset to default clamps for tasks
@@ -1176,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
  
  static void uclamp_fork(struct task_struct *p)
  {
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
  
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@ -1198,7 +1246,7 @@ static void uclamp_fork(struct task_struct *p)
  static void __init init_uclamp(void)
  {
         struct uclamp_se uc_max = {};
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         int cpu;
  
         mutex_init(&uclamp_mutex);
@@ -3704,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@ -3910,7 +3958,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
- *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3919,7 +3967,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
- *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@ -4032,7 +4080,7 @@ void __noreturn do_task_dead(void)
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
-       if (!tsk->state || tsk_is_pi_blocked(tsk))
+       if (!tsk->state)
                 return;
  
         /*
@@ -4048,6 +4096,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
                 preempt_enable_no_resched();
         }
  
+       if (tsk_is_pi_blocked(tsk))
+               return;
+
         /*
          * If we are going to sleep and we have plugged IO queued,
          * make sure to submit it to avoid deadlocks.
@@ -4161,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void)
         } while (need_resched());
  }
  
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4233,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@ -5248,37 +5299,40 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
         return retval;
  }
  
-static int sched_read_attr(struct sched_attr __user *uattr,
-                          struct sched_attr *attr,
-                          unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+                       struct sched_attr *kattr,
+                       unsigned int usize)
  {
-       int ret;
+       unsigned int ksize = sizeof(*kattr);
  
         if (!access_ok(uattr, usize))
                 return -EFAULT;
  
         /*
-        * If we're handed a smaller struct than we know of,
-        * ensure all the unknown bits are 0 - i.e. old
-        * user-space does not get uncomplete information.
+        * sched_getattr() ABI forwards and backwards compatibility:
+        *
+        * If usize == ksize then we just copy everything to user-space and all is good.
+        *
+        * If usize < ksize then we only copy as much as user-space has space for,
+        * this keeps ABI compatibility as well. We skip the rest.
+        *
+        * If usize > ksize then user-space is using a newer version of the ABI,
+        * parts of which the kernel doesn't know about. Just ignore those - tooling can
+        * detect the kernel's knowledge of attributes from the attr->size value
+        * which is set to ksize in this case.
          */
-       if (usize < sizeof(*attr)) {
-               unsigned char *addr;
-               unsigned char *end;
-
-               addr = (void *)attr + usize;
-               end  = (void *)attr + sizeof(*attr);
+       kattr->size = min(usize, ksize);
  
-               for (; addr < end; addr++) {
-                       if (*addr)
-                               return -EFBIG;
-               }
-
-               attr->size = usize;
-       }
-
-       ret = copy_to_user(uattr, attr, attr->size);
-       if (ret)
+       if (copy_to_user(uattr, kattr, kattr->size))
                 return -EFAULT;
  
         return 0;
@@ -5288,20 +5342,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
-               unsigned int, size, unsigned int, flags)
+               unsigned int, usize, unsigned int, flags)
  {
-       struct sched_attr attr = {
-               .size = sizeof(struct sched_attr),
-       };
+       struct sched_attr kattr = { };
         struct task_struct *p;
         int retval;
  
-       if (!uattr || pid < 0 || size > PAGE_SIZE ||
-           size < SCHED_ATTR_SIZE_VER0 || flags)
+       if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+           usize < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
  
         rcu_read_lock();
@@ -5314,25 +5366,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
         if (retval)
                 goto out_unlock;
  
-       attr.sched_policy = p->policy;
+       kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
-               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+               kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
         if (task_has_dl_policy(p))
-               __getparam_dl(p, &attr);
+               __getparam_dl(p, &kattr);
         else if (task_has_rt_policy(p))
-               attr.sched_priority = p->rt_priority;
+               kattr.sched_priority = p->rt_priority;
         else
-               attr.sched_nice = task_nice(p);
+               kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
-       attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
-       attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+       kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+       kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
         rcu_read_unlock();
  
-       retval = sched_read_attr(uattr, &attr, size);
-       return retval;
+       return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
         rcu_read_unlock();
@@ -5562,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
         if (should_resched(0)) {
@@ -5579,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@ -6873,7 +6924,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
                                             struct task_group *parent)
  {
  #ifdef CONFIG_UCLAMP_TASK_GROUP
-       int clamp_id;
+       enum uclamp_id clamp_id;
  
         for_each_clamp_id(clamp_id) {
                 uclamp_se_set(&tg->uclamp_req[clamp_id],
@@ -7131,7 +7182,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
         struct uclamp_se *uc_parent = NULL;
         struct uclamp_se *uc_se = NULL;
         unsigned int eff[UCLAMP_CNT];
-       unsigned int clamp_id;
+       enum uclamp_id clamp_id;
         unsigned int clamps;
  
         css_for_each_descendant_pre(css, top_css) {
@@ -7160,8 +7211,13 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
                         uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
                         clamps |= (0x1 << clamp_id);
                 }
-               if (!clamps)
+               if (!clamps) {
                         css = css_rightmost_descendant(css);
+                       continue;
+               }
+
+               /* Immediately update descendants RUNNABLE tasks */
+               uclamp_update_active_tasks(css, clamps);
         }
  }