diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e21c4f99fd832cc6bc55f9606a1c1e..cff3e656566d6e3aede6414e40fa8763bb16bb64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                        struct rq *rq, int ticks)
+                                        int ticks)
 {
        u64 other, cputime = TICK_NSEC * ticks;
 
@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
                account_user_time(p, cputime);
-       } else if (p == rq->idle) {
+       } else if (p == this_rq()->idle) {
                account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
                account_guest_time(p, cputime);
@@ -392,40 +392,36 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 
 static void irqtime_account_idle_ticks(int ticks)
 {
-       struct rq *rq = this_rq();
-
-       irqtime_account_process_tick(current, 0, rq, ticks);
+       irqtime_account_process_tick(current, 0, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) { }
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq, int nr_ticks) { }
+                                               int nr_ticks) { }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
  * Use precise platform statistics if available:
  */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
 {
        if (is_idle_task(prev))
                vtime_account_idle(prev);
        else
-               vtime_account_system(prev);
+               vtime_account_kernel(prev);
 
        vtime_flush(prev);
        arch_vtime_task_switch(prev);
 }
 # endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
  * have other meaning of the idle time (s390 only includes the
  * time spent by the CPU when it's in low power mode) must override
  * vtime_account().
@@ -436,7 +432,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
        if (!in_interrupt() && is_idle_task(tsk))
                vtime_account_idle(tsk);
        else
-               vtime_account_system(tsk);
+               vtime_account_kernel(tsk);
 }
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
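The comment above states the contract for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE architectures: if the whole time spent in the idle task outside of IRQ counts as idle time, the architecture only has to provide vtime_account_kernel() and vtime_account_idle() and let the generic code choose between them. A minimal sketch of such an arch backend, assuming a hypothetical arch_elapsed_ns() helper that returns the nanoseconds accumulated since the last accounting point (not a real kernel interface), might look like:

/* Hypothetical arch backend sketch; arch_elapsed_ns() is an assumed helper. */
void vtime_account_kernel(struct task_struct *tsk)
{
	/* Charge the elapsed slice to @tsk as system time. */
	account_system_index_time(tsk, arch_elapsed_ns(), CPUTIME_SYSTEM);
}

void vtime_account_idle(struct task_struct *tsk)
{
	/* Charge the elapsed slice to the CPU-wide idle counter. */
	account_idle_time(arch_elapsed_ns());
}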
@@ -475,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 void account_process_tick(struct task_struct *p, int user_tick)
 {
        u64 cputime, steal;
-       struct rq *rq = this_rq();
 
-       if (vtime_accounting_cpu_enabled())
+       if (vtime_accounting_enabled_this_cpu())
                return;
 
        if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq, 1);
+               irqtime_account_process_tick(p, user_tick, 1);
                return;
        }
 
@@ -495,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
        if (user_tick)
                account_user_time(p, cputime);
-       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+       else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
                account_system_time(p, HARDIRQ_OFFSET, cputime);
        else
                account_idle_time(cputime);
@@ -711,8 +706,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
        return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk,
-                                  struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+                                struct vtime *vtime)
 {
        vtime->stime += get_vtime_delta(vtime);
        if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +726,17 @@ static void vtime_account_guest(struct task_struct *tsk,
        }
 }
 
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+                                  struct vtime *vtime)
+{
+       /* We might have scheduled out from guest path */
+       if (vtime->state == VTIME_GUEST)
+               vtime_account_guest(tsk, vtime);
+       else
+               vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
 {
        struct vtime *vtime = &tsk->vtime;
 
@@ -739,11 +744,7 @@ void vtime_account_system(struct task_struct *tsk)
                return;
 
        write_seqcount_begin(&vtime->seqcount);
-       /* We might have scheduled out from guest path */
-       if (tsk->flags & PF_VCPU)
-               vtime_account_guest(tsk, vtime);
-       else
-               __vtime_account_system(tsk, vtime);
+       __vtime_account_kernel(tsk, vtime);
        write_seqcount_end(&vtime->seqcount);
 }
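With the new VTIME_GUEST state, the system-vs-guest decision moves from checking PF_VCPU at each call site into __vtime_account_kernel(), so the task-switch path further down can reuse it. Under CONFIG_VIRT_CPU_ACCOUNTING_GEN, vtime_account_kernel() is reached from the IRQ entry/exit wrappers, roughly as follows (paraphrased; see include/linux/vtime.h for the authoritative definition):

/* Paraphrased sketch of the nohz_full IRQ-entry wrapper. */
static inline void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (vtime_accounting_enabled_this_cpu())
		vtime_account_kernel(tsk);
}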
 
@@ -752,7 +753,7 @@ void vtime_user_enter(struct task_struct *tsk)
        struct vtime *vtime = &tsk->vtime;
 
        write_seqcount_begin(&vtime->seqcount);
-       __vtime_account_system(tsk, vtime);
+       vtime_account_system(tsk, vtime);
        vtime->state = VTIME_USER;
        write_seqcount_end(&vtime->seqcount);
 }
@@ -782,8 +783,9 @@ void vtime_guest_enter(struct task_struct *tsk)
         * that can thus safely catch up with a tickless delta.
         */
        write_seqcount_begin(&vtime->seqcount);
-       __vtime_account_system(tsk, vtime);
+       vtime_account_system(tsk, vtime);
        tsk->flags |= PF_VCPU;
+       vtime->state = VTIME_GUEST;
        write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +797,7 @@ void vtime_guest_exit(struct task_struct *tsk)
        write_seqcount_begin(&vtime->seqcount);
        vtime_account_guest(tsk, vtime);
        tsk->flags &= ~PF_VCPU;
+       vtime->state = VTIME_SYS;
        write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
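The guest enter/exit paths now keep vtime->state in sync with PF_VCPU instead of leaving the task in VTIME_SYS. The state values this relies on are paraphrased below from include/linux/sched.h as it stands in this series; the ordering is what lets the later `vtime->state < VTIME_SYS` checks read as "sleeping or idle":

enum vtime_state {
	/* Task is sleeping or running in a CPU with VTIME inactive: */
	VTIME_INACTIVE = 0,
	/* Task is idle */
	VTIME_IDLE,
	/* Task runs in kernelspace in a CPU with VTIME active: */
	VTIME_SYS,
	/* Task runs in userspace in a CPU with VTIME active: */
	VTIME_USER,
	/* Task runs as a guest in a CPU with VTIME active: */
	VTIME_GUEST,
};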
@@ -804,19 +807,30 @@ void vtime_account_idle(struct task_struct *tsk)
        account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
 {
        struct vtime *vtime = &prev->vtime;
 
        write_seqcount_begin(&vtime->seqcount);
+       if (vtime->state == VTIME_IDLE)
+               vtime_account_idle(prev);
+       else
+               __vtime_account_kernel(prev, vtime);
        vtime->state = VTIME_INACTIVE;
+       vtime->cpu = -1;
        write_seqcount_end(&vtime->seqcount);
 
        vtime = &current->vtime;
 
        write_seqcount_begin(&vtime->seqcount);
-       vtime->state = VTIME_SYS;
+       if (is_idle_task(current))
+               vtime->state = VTIME_IDLE;
+       else if (current->flags & PF_VCPU)
+               vtime->state = VTIME_GUEST;
+       else
+               vtime->state = VTIME_SYS;
        vtime->starttime = sched_clock();
+       vtime->cpu = smp_processor_id();
        write_seqcount_end(&vtime->seqcount);
 }
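vtime_task_switch_generic() replaces arch_vtime_task_switch() for the nohz_full case: it flushes the outgoing task's pending time according to its state, then stamps the incoming task's state, start time and CPU. It is invoked from the context-switch path through a small wrapper, roughly as follows (paraphrased; the real wrapper lives in include/linux/vtime.h):

/* Paraphrased sketch of the CONFIG_VIRT_CPU_ACCOUNTING_GEN hook
 * called on task switch. */
static inline void vtime_task_switch(struct task_struct *prev)
{
	if (vtime_accounting_enabled_this_cpu())
		vtime_task_switch_generic(prev);
}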
 
@@ -827,8 +841,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 
        local_irq_save(flags);
        write_seqcount_begin(&vtime->seqcount);
-       vtime->state = VTIME_SYS;
+       vtime->state = VTIME_IDLE;
        vtime->starttime = sched_clock();
+       vtime->cpu = cpu;
        write_seqcount_end(&vtime->seqcount);
        local_irq_restore(flags);
 }
@@ -846,7 +861,7 @@ u64 task_gtime(struct task_struct *t)
                seq = read_seqcount_begin(&vtime->seqcount);
 
                gtime = t->gtime;
-               if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+               if (vtime->state == VTIME_GUEST)
                        gtime += vtime->gtime + vtime_delta(vtime);
 
        } while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +892,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
                *utime = t->utime;
                *stime = t->stime;
 
-               /* Task is sleeping, nothing to add */
-               if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+               /* Task is sleeping or idle, nothing to add */
+               if (vtime->state < VTIME_SYS)
                        continue;
 
                delta = vtime_delta(vtime);
 
                /*
-                * Task runs either in user or kernel space, add pending nohz time to
-                * the right place.
+                * Task runs either in user (including guest) or kernel space,
+                * add pending nohz time to the right place.
                 */
-               if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-                       *utime += vtime->utime + delta;
-               else if (vtime->state == VTIME_SYS)
+               if (vtime->state == VTIME_SYS)
                        *stime += vtime->stime + delta;
+               else
+                       *utime += vtime->utime + delta;
        } while (read_seqcount_retry(&vtime->seqcount, seq));
 }
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+       /*
+        * We raced against a context switch, fetch the
+        * kcpustat task again.
+        */
+       if (vtime->cpu != cpu && vtime->cpu != -1)
+               return -EAGAIN;
+
+       /*
+        * Two possible things here:
+        * 1) We are seeing the scheduling out task (prev) or any past one.
+        * 2) We are seeing the scheduling in task (next) but it hasn't
+        *    passed through vtime_task_switch() yet, so the pending
+        *    cputime of the prev task may not be flushed yet.
+        *
+        * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+        */
+       if (vtime->state == VTIME_INACTIVE)
+               return -EAGAIN;
+
+       return 0;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+       if (vtime->state == VTIME_USER)
+               return vtime->utime + vtime_delta(vtime);
+       else if (vtime->state == VTIME_GUEST)
+               return vtime->gtime + vtime_delta(vtime);
+       return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+                               struct task_struct *tsk,
+                               enum cpu_usage_stat usage,
+                               int cpu, u64 *val)
+{
+       struct vtime *vtime = &tsk->vtime;
+       unsigned int seq;
+       int err;
+
+       do {
+               seq = read_seqcount_begin(&vtime->seqcount);
+
+               err = vtime_state_check(vtime, cpu);
+               if (err < 0)
+                       return err;
+
+               *val = cpustat[usage];
+
+               /*
+                * Nice vs. unnice cputime accounting may be inaccurate if
+                * the nice value has changed since the last vtime update.
+                * But a proper fix would involve interrupting the target on
+                * nice updates, which is a no-go on nohz_full (although the
+                * scheduler may still interrupt the target if rescheduling
+                * is needed...).
+                */
+               switch (usage) {
+               case CPUTIME_SYSTEM:
+                       if (vtime->state == VTIME_SYS)
+                               *val += vtime->stime + vtime_delta(vtime);
+                       break;
+               case CPUTIME_USER:
+                       if (task_nice(tsk) <= 0)
+                               *val += kcpustat_user_vtime(vtime);
+                       break;
+               case CPUTIME_NICE:
+                       if (task_nice(tsk) > 0)
+                               *val += kcpustat_user_vtime(vtime);
+                       break;
+               case CPUTIME_GUEST:
+                       if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+                               *val += vtime->gtime + vtime_delta(vtime);
+                       break;
+               case CPUTIME_GUEST_NICE:
+                       if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+                               *val += vtime->gtime + vtime_delta(vtime);
+                       break;
+               default:
+                       break;
+               }
+       } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+       return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+                  enum cpu_usage_stat usage, int cpu)
+{
+       u64 *cpustat = kcpustat->cpustat;
+       struct rq *rq;
+       u64 val;
+       int err;
+
+       if (!vtime_accounting_enabled_cpu(cpu))
+               return cpustat[usage];
+
+       rq = cpu_rq(cpu);
+
+       for (;;) {
+               struct task_struct *curr;
+
+               rcu_read_lock();
+               curr = rcu_dereference(rq->curr);
+               if (WARN_ON_ONCE(!curr)) {
+                       rcu_read_unlock();
+                       return cpustat[usage];
+               }
+
+               err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+               rcu_read_unlock();
+
+               if (!err)
+                       return val;
+
+               cpu_relax();
+       }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
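kcpustat_field() lets readers such as /proc/stat fetch a single cpustat field that also includes whatever delta is still pending in the vtime of the task currently running on a nohz_full CPU. A usage sketch (the real call sites are added by later patches in this series, not by this file):

/* Usage sketch: per-CPU user and system time, including the delta still
 * pending in the currently running task's vtime on a nohz_full CPU. */
u64 user = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_USER, cpu);
u64 sys  = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_SYSTEM, cpu);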
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+                                   const struct kernel_cpustat *src,
+                                   struct task_struct *tsk, int cpu)
+{
+       struct vtime *vtime = &tsk->vtime;
+       unsigned int seq;
+       int err;
+
+       do {
+               u64 *cpustat;
+               u64 delta;
+
+               seq = read_seqcount_begin(&vtime->seqcount);
+
+               err = vtime_state_check(vtime, cpu);
+               if (err < 0)
+                       return err;
+
+               *dst = *src;
+               cpustat = dst->cpustat;
+
+               /* Task is sleeping, dead or idle, nothing to add */
+               if (vtime->state < VTIME_SYS)
+                       continue;
+
+               delta = vtime_delta(vtime);
+
+               /*
+                * Task runs either in user (including guest) or kernel space,
+                * add pending nohz time to the right place.
+                */
+               if (vtime->state == VTIME_SYS) {
+                       cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+               } else if (vtime->state == VTIME_USER) {
+                       if (task_nice(tsk) > 0)
+                               cpustat[CPUTIME_NICE] += vtime->utime + delta;
+                       else
+                               cpustat[CPUTIME_USER] += vtime->utime + delta;
+               } else {
+                       WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+                       if (task_nice(tsk) > 0) {
+                               cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+                               cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+                       } else {
+                               cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+                               cpustat[CPUTIME_USER] += vtime->gtime + delta;
+                       }
+               }
+       } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+       return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+       const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+       struct rq *rq;
+       int err;
+
+       if (!vtime_accounting_enabled_cpu(cpu)) {
+               *dst = *src;
+               return;
+       }
+
+       rq = cpu_rq(cpu);
+
+       for (;;) {
+               struct task_struct *curr;
+
+               rcu_read_lock();
+               curr = rcu_dereference(rq->curr);
+               if (WARN_ON_ONCE(!curr)) {
+                       rcu_read_unlock();
+                       *dst = *src;
+                       return;
+               }
+
+               err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+               rcu_read_unlock();
+
+               if (!err)
+                       return;
+
+               cpu_relax();
+       }
+}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
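kcpustat_cpu_fetch() is the whole-struct counterpart: it snapshots every field in one retry loop so a reader does not pay the seqcount/RCU dance once per field. A usage sketch:

/* Usage sketch: snapshot all cpustat fields of @cpu in one shot. */
struct kernel_cpustat kcs;

kcpustat_cpu_fetch(&kcs, cpu);
pr_info("cpu%d user=%llu system=%llu\n", cpu,
	(unsigned long long)kcs.cpustat[CPUTIME_USER],
	(unsigned long long)kcs.cpustat[CPUTIME_SYSTEM]);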
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */