1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * CPUFreq governor based on scheduler-provided CPU utilization data.
4  *
5  * Copyright (C) 2016, Intel Corporation
6  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
7  */
8
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11 #include "sched.h"
12
13 #include <linux/sched/cpufreq.h>
14 #include <trace/events/power.h>
15
16 #define IOWAIT_BOOST_MIN        (SCHED_CAPACITY_SCALE / 8)
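/*
 * Example: with SCHED_CAPACITY_SCALE == 1024 (1 << SCHED_CAPACITY_SHIFT),
 * IOWAIT_BOOST_MIN evaluates to 128, i.e. one eighth of the capacity scale.
 */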
17
18 struct sugov_tunables {
19         struct gov_attr_set     attr_set;
20         unsigned int            rate_limit_us;
21 };
22
23 struct sugov_policy {
24         struct cpufreq_policy   *policy;
25
26         struct sugov_tunables   *tunables;
27         struct list_head        tunables_hook;
28
29         raw_spinlock_t          update_lock;    /* For shared policies */
30         u64                     last_freq_update_time;
31         s64                     freq_update_delay_ns;
32         unsigned int            next_freq;
33         unsigned int            cached_raw_freq;
34
35         /* The next fields are only needed if fast switch cannot be used: */
36         struct                  irq_work irq_work;
37         struct                  kthread_work work;
38         struct                  mutex work_lock;
39         struct                  kthread_worker worker;
40         struct task_struct      *thread;
41         bool                    work_in_progress;
42
43         bool                    limits_changed;
44         bool                    need_freq_update;
45 };
46
47 struct sugov_cpu {
48         struct update_util_data update_util;
49         struct sugov_policy     *sg_policy;
50         unsigned int            cpu;
51
52         bool                    iowait_boost_pending;
53         unsigned int            iowait_boost;
54         u64                     last_update;
55
56         unsigned long           bw_dl;
57         unsigned long           max;
58
59         /* The field below is for single-CPU policies only: */
60 #ifdef CONFIG_NO_HZ_COMMON
61         unsigned long           saved_idle_calls;
62 #endif
63 };
64
65 static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
66
67 /************************ Governor internals ***********************/
68
69 static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
70 {
71         s64 delta_ns;
72
73         /*
74          * Since cpufreq_update_util() is called with rq->lock held for
75          * the @target_cpu, our per-CPU data is fully serialized.
76          *
77          * However, drivers cannot in general deal with cross-CPU
78          * requests, so while get_next_freq() will work, our
79          * sugov_fast_switch() call may not for the fast switching platforms.
80          *
81          * Hence stop here for remote requests if they aren't supported
82          * by the hardware, as calculating the frequency is pointless if
83          * we cannot in fact act on it.
84          *
85          * For the slow switching platforms, the kthread is always scheduled on
86          * the right set of CPUs and any CPU can find the next frequency and
87          * schedule the kthread.
88          */
89         if (sg_policy->policy->fast_switch_enabled &&
90             !cpufreq_this_cpu_can_update(sg_policy->policy))
91                 return false;
92
93         if (unlikely(sg_policy->limits_changed)) {
94                 sg_policy->limits_changed = false;
95                 sg_policy->need_freq_update = true;
96                 return true;
97         }
98
99         delta_ns = time - sg_policy->last_freq_update_time;
100
101         return delta_ns >= sg_policy->freq_update_delay_ns;
102 }
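/*
 * Worked example for the rate limit check above: freq_update_delay_ns is
 * rate_limit_us * NSEC_PER_USEC (see sugov_start() and rate_limit_us_store()
 * below).  With an illustrative rate_limit_us of 500, updates arriving less
 * than 500000 ns (0.5 ms) after the previous frequency change are skipped,
 * unless limits_changed forces one through.  The actual default comes from
 * cpufreq_policy_transition_delay_us().
 */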
103
104 static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
105                                    unsigned int next_freq)
106 {
107         if (sg_policy->next_freq == next_freq)
108                 return false;
109
110         sg_policy->next_freq = next_freq;
111         sg_policy->last_freq_update_time = time;
112
113         return true;
114 }
115
116 static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
117                               unsigned int next_freq)
118 {
119         struct cpufreq_policy *policy = sg_policy->policy;
120         int cpu;
121
122         if (!sugov_update_next_freq(sg_policy, time, next_freq))
123                 return;
124
125         next_freq = cpufreq_driver_fast_switch(policy, next_freq);
126         if (!next_freq)
127                 return;
128
129         policy->cur = next_freq;
130
131         if (trace_cpu_frequency_enabled()) {
132                 for_each_cpu(cpu, policy->cpus)
133                         trace_cpu_frequency(next_freq, cpu);
134         }
135 }
136
137 static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
138                                   unsigned int next_freq)
139 {
140         if (!sugov_update_next_freq(sg_policy, time, next_freq))
141                 return;
142
143         if (!sg_policy->work_in_progress) {
144                 sg_policy->work_in_progress = true;
145                 irq_work_queue(&sg_policy->irq_work);
146         }
147 }
148
149 /**
150  * get_next_freq - Compute a new frequency for a given cpufreq policy.
151  * @sg_policy: schedutil policy object to compute the new frequency for.
152  * @util: Current CPU utilization.
153  * @max: CPU capacity.
154  *
155  * If the utilization is frequency-invariant, choose the new frequency to be
156  * proportional to it, that is
157  *
158  * next_freq = C * max_freq * util / max
159  *
160  * Otherwise, approximate the would-be frequency-invariant utilization by
161  * util_raw * (curr_freq / max_freq) which leads to
162  *
163  * next_freq = C * curr_freq * util_raw / max
164  *
165  * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
166  *
167  * The lowest driver-supported frequency which is equal to or greater than the
168  * raw next_freq (as calculated above) is returned, subject to policy min/max and
169  * cpufreq driver limitations.
170  */
171 static unsigned int get_next_freq(struct sugov_policy *sg_policy,
172                                   unsigned long util, unsigned long max)
173 {
174         struct cpufreq_policy *policy = sg_policy->policy;
175         unsigned int freq = arch_scale_freq_invariant() ?
176                                 policy->cpuinfo.max_freq : policy->cur;
177
178         freq = map_util_freq(util, freq, max);
179
180         if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
181                 return sg_policy->next_freq;
182
183         sg_policy->need_freq_update = false;
184         sg_policy->cached_raw_freq = freq;
185         return cpufreq_driver_resolve_freq(policy, freq);
186 }
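/*
 * A minimal sketch of the mapping described above, using a hypothetical
 * helper name (the code itself relies on map_util_freq() from the scheduler
 * headers).  With C = 1.25 expressed as freq + freq/4, util == 614,
 * max == 1024 and a reference frequency of 2000000 kHz, the raw result is
 * about 1499023 kHz, which cpufreq_driver_resolve_freq() then rounds to a
 * driver-supported frequency.
 */
static inline unsigned long example_next_freq(unsigned long util,
					      unsigned long freq,
					      unsigned long max)
{
	/* 1.25 * freq * util / max, with the 1.25 factor as freq + freq/4 */
	return (freq + (freq >> 2)) * util / max;
}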
187
188 /*
189  * This function computes an effective utilization for the given CPU, to be
190  * used for frequency selection given the linear relation: f = u * f_max.
191  *
192  * The scheduler tracks the following metrics:
193  *
194  *   cpu_util_{cfs,rt,dl,irq}()
195  *   cpu_bw_dl()
196  *
197  * Where the cfs, rt and dl util numbers are tracked with the same metric and
198  * synchronized windows and are thus directly comparable.
199  *
200  * The cfs, rt and dl utilizations are the running times measured with
201  * rq->clock_task, which excludes things like IRQ and steal time. The latter
202  * are then accrued in the irq utilization.
203  *
204  * The DL bandwidth number, on the other hand, is not a measured metric but a
205  * value computed from the task model parameters; it gives the minimal
206  * utilization required to meet deadlines.
207  */
208 unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
209                                  unsigned long max, enum schedutil_type type,
210                                  struct task_struct *p)
211 {
212         unsigned long dl_util, util, irq;
213         struct rq *rq = cpu_rq(cpu);
214
215         if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
216             type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
217                 return max;
218         }
219
220         /*
221          * Early check to see if IRQ/steal time saturates the CPU; this can happen
222          * because of inaccuracies in how we track these -- see
223          * update_irq_load_avg().
224          */
225         irq = cpu_util_irq(rq);
226         if (unlikely(irq >= max))
227                 return max;
228
229         /*
230          * Because the time spent on RT/DL tasks is visible as 'lost' time to
231          * CFS tasks and we use the same metric to track the effective
232          * utilization (PELT windows are synchronized) we can directly add them
233          * to obtain the CPU's actual utilization.
234          *
235          * CFS and RT utilization can be boosted or capped, depending on
236          * utilization clamp constraints requested by currently RUNNABLE
237          * tasks.
238          * When there are no CFS RUNNABLE tasks, clamps are released and
239          * frequency will be gracefully reduced with the utilization decay.
240          */
241         util = util_cfs + cpu_util_rt(rq);
242         if (type == FREQUENCY_UTIL)
243                 util = uclamp_util_with(rq, util, p);
244
245         dl_util = cpu_util_dl(rq);
246
247         /*
248          * For frequency selection we do not make cpu_util_dl() a permanent part
249          * of this sum because we want to use cpu_bw_dl() later on, but we need
250          * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
251          * that we select f_max when there is no idle time.
252          *
253          * NOTE: numerical errors or stop class might cause us to not quite hit
254          * saturation when we should -- something for later.
255          */
256         if (util + dl_util >= max)
257                 return max;
258
259         /*
260          * OTOH, for energy computation we need the estimated running time, so
261          * include util_dl and ignore dl_bw.
262          */
263         if (type == ENERGY_UTIL)
264                 util += dl_util;
265
266         /*
267          * There is still idle time; further improve the number by using the
268          * irq metric. Because IRQ/steal time is hidden from the task clock we
269          * need to scale the task numbers:
270          *
271          *              max - irq
272          *   U' = irq + --------- * U
273          *                 max
274          */
275         util = scale_irq_capacity(util, irq, max);
276         util += irq;
277
278         /*
279          * Bandwidth required by DEADLINE must always be granted while, for
280          * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
281          * to gracefully reduce the frequency when no tasks show up for longer
282          * periods of time.
283          *
284          * Ideally we would like to set bw_dl as min/guaranteed freq and util +
285          * bw_dl as requested freq. However, cpufreq is not yet ready for such
286          * an interface. So, we only do the latter for now.
287          */
288         if (type == FREQUENCY_UTIL)
289                 util += cpu_bw_dl(rq);
290
291         return min(max, util);
292 }
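/*
 * Worked example for the irq scaling above: with max == 1024, irq == 128 and
 * a combined CFS+RT (plus DL for ENERGY_UTIL) utilization U == 512, the
 * scaled value is U' = 128 + (1024 - 128) * 512 / 1024 = 576.  For
 * FREQUENCY_UTIL, cpu_bw_dl() is then added on top and the sum is clamped
 * to max.
 */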
293
294 static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
295 {
296         struct rq *rq = cpu_rq(sg_cpu->cpu);
297         unsigned long util = cpu_util_cfs(rq);
298         unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
299
300         sg_cpu->max = max;
301         sg_cpu->bw_dl = cpu_bw_dl(rq);
302
303         return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
304 }
305
306 /**
307  * sugov_iowait_reset() - Reset the IO boost status of a CPU.
308  * @sg_cpu: the sugov data for the CPU to boost
309  * @time: the update time from the caller
310  * @set_iowait_boost: true if an IO boost has been requested
311  *
312  * The IO wait boost of a task is disabled after a tick since the last update
313  * of a CPU. If a new IO wait boost is requested after more than a tick, then
314  * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
315  * efficiency by ignoring sporadic wakeups from IO.
316  */
317 static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
318                                bool set_iowait_boost)
319 {
320         s64 delta_ns = time - sg_cpu->last_update;
321
322         /* Reset boost only if a tick has elapsed since last request */
323         if (delta_ns <= TICK_NSEC)
324                 return false;
325
326         sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
327         sg_cpu->iowait_boost_pending = set_iowait_boost;
328
329         return true;
330 }
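/*
 * Example: assuming HZ == 250, TICK_NSEC is 4000000 ns, so a request arriving
 * more than 4 ms after the last update resets the boost: to IOWAIT_BOOST_MIN
 * if a new boost was requested, or to 0 otherwise.
 */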
331
332 /**
333  * sugov_iowait_boost() - Updates the IO boost status of a CPU.
334  * @sg_cpu: the sugov data for the CPU to boost
335  * @time: the update time from the caller
336  * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
337  *
338  * Each time a task wakes up after an IO operation, the CPU utilization can be
339  * boosted to a certain utilization which doubles at each "frequent and
340  * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
341  * of the maximum OPP.
342  *
343  * To keep doubling, an IO boost has to be requested at least once per tick,
344  * otherwise we restart from the utilization of the minimum OPP.
345  */
346 static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
347                                unsigned int flags)
348 {
349         bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
350
351         /* Reset boost if the CPU appears to have been idle enough */
352         if (sg_cpu->iowait_boost &&
353             sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
354                 return;
355
356         /* Boost only tasks waking up after IO */
357         if (!set_iowait_boost)
358                 return;
359
360         /* Ensure the boost doubles only once per request */
361         if (sg_cpu->iowait_boost_pending)
362                 return;
363         sg_cpu->iowait_boost_pending = true;
364
365         /* Double the boost at each request */
366         if (sg_cpu->iowait_boost) {
367                 sg_cpu->iowait_boost =
368                         min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
369                 return;
370         }
371
372         /* First wakeup after IO: start with minimum boost */
373         sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
374 }
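/*
 * Example of the doubling above: IO wakeups arriving at least once per tick
 * grow the boost as 128 -> 256 -> 512 -> 1024, i.e. from IOWAIT_BOOST_MIN up
 * to SCHED_CAPACITY_SCALE (assuming SCHED_CAPACITY_SCALE == 1024).
 */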
375
376 /**
377  * sugov_iowait_apply() - Apply the IO boost to a CPU.
378  * @sg_cpu: the sugov data for the cpu to boost
379  * @time: the update time from the caller
380  * @util: the utilization to (eventually) boost
381  * @max: the maximum value the utilization can be boosted to
382  *
383  * A CPU running a task which has woken up after an IO operation can have its
384  * utilization boosted to speed up the completion of those IO operations.
385  * The IO boost value is increased each time a task wakes up from IO, in
386  * sugov_iowait_boost(), and is instead decreased by this function each time
387  * an increase has not been requested (!iowait_boost_pending).
388  *
389  * A CPU which appears to have been idle for at least one tick also has its
390  * IO boost utilization reset.
391  *
392  * This mechanism is designed to boost tasks that do frequent IO waits, while
393  * being more conservative on tasks that do only sporadic IO operations.
394  */
395 static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
396                                         unsigned long util, unsigned long max)
397 {
398         unsigned long boost;
399
400         /* No boost currently required */
401         if (!sg_cpu->iowait_boost)
402                 return util;
403
404         /* Reset boost if the CPU appears to have been idle enough */
405         if (sugov_iowait_reset(sg_cpu, time, false))
406                 return util;
407
408         if (!sg_cpu->iowait_boost_pending) {
409                 /*
410                  * No boost pending; reduce the boost value.
411                  */
412                 sg_cpu->iowait_boost >>= 1;
413                 if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
414                         sg_cpu->iowait_boost = 0;
415                         return util;
416                 }
417         }
418
419         sg_cpu->iowait_boost_pending = false;
420
421         /*
422          * @util is already in capacity scale; convert iowait_boost
423          * into the same scale so we can compare.
424          */
425         boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
426         return max(boost, util);
427 }
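/*
 * Example of the scale conversion above: with iowait_boost == 512, max == 1024
 * and SCHED_CAPACITY_SHIFT == 10, boost = (512 * 1024) >> 10 = 512; if the
 * current util is 300, max(512, 300) = 512 is used for frequency selection.
 */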
428
429 #ifdef CONFIG_NO_HZ_COMMON
430 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
431 {
432         unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
433         bool ret = idle_calls == sg_cpu->saved_idle_calls;
434
435         sg_cpu->saved_idle_calls = idle_calls;
436         return ret;
437 }
438 #else
439 static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
440 #endif /* CONFIG_NO_HZ_COMMON */
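/*
 * Note: "busy" here means the CPU has not entered the idle loop since this
 * function last sampled the per-CPU idle-calls counter; sugov_update_single()
 * uses that to avoid lowering the frequency prematurely.
 */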
441
442 /*
443  * Make sugov_should_update_freq() ignore the rate limit when DL
444  * has increased the utilization.
445  */
446 static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
447 {
448         if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
449                 sg_policy->limits_changed = true;
450 }
451
452 static void sugov_update_single(struct update_util_data *hook, u64 time,
453                                 unsigned int flags)
454 {
455         struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
456         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
457         unsigned long util, max;
458         unsigned int next_f;
459         bool busy;
460
461         sugov_iowait_boost(sg_cpu, time, flags);
462         sg_cpu->last_update = time;
463
464         ignore_dl_rate_limit(sg_cpu, sg_policy);
465
466         if (!sugov_should_update_freq(sg_policy, time))
467                 return;
468
469         /* Limits may have changed, don't skip frequency update */
470         busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
471
472         util = sugov_get_util(sg_cpu);
473         max = sg_cpu->max;
474         util = sugov_iowait_apply(sg_cpu, time, util, max);
475         next_f = get_next_freq(sg_policy, util, max);
476         /*
477          * Do not reduce the frequency if the CPU has not been idle
478          * recently, as the reduction is likely to be premature then.
479          */
480         if (busy && next_f < sg_policy->next_freq) {
481                 next_f = sg_policy->next_freq;
482
483                 /* Reset cached freq as next_freq has changed */
484                 sg_policy->cached_raw_freq = 0;
485         }
486
487         /*
488          * This code runs under rq->lock for the target CPU, so it won't run
489          * concurrently on two different CPUs for the same target and it is not
490          * necessary to acquire the lock in the fast switch case.
491          */
492         if (sg_policy->policy->fast_switch_enabled) {
493                 sugov_fast_switch(sg_policy, time, next_f);
494         } else {
495                 raw_spin_lock(&sg_policy->update_lock);
496                 sugov_deferred_update(sg_policy, time, next_f);
497                 raw_spin_unlock(&sg_policy->update_lock);
498         }
499 }
500
501 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
502 {
503         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
504         struct cpufreq_policy *policy = sg_policy->policy;
505         unsigned long util = 0, max = 1;
506         unsigned int j;
507
508         for_each_cpu(j, policy->cpus) {
509                 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
510                 unsigned long j_util, j_max;
511
512                 j_util = sugov_get_util(j_sg_cpu);
513                 j_max = j_sg_cpu->max;
514                 j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
515
516                 if (j_util * max > j_max * util) {
517                         util = j_util;
518                         max = j_max;
519                 }
520         }
521
522         return get_next_freq(sg_policy, util, max);
523 }
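/*
 * Note on the comparison above: j_util * max > j_max * util is a division-free
 * check for j_util / j_max > util / max, so the CPU with the highest
 * utilization relative to its capacity sets the shared policy's frequency.
 * For example, util == 300 on a max == 512 CPU (~59%) wins over util == 500
 * on a max == 1024 CPU (~49%).
 */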
524
525 static void
526 sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
527 {
528         struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
529         struct sugov_policy *sg_policy = sg_cpu->sg_policy;
530         unsigned int next_f;
531
532         raw_spin_lock(&sg_policy->update_lock);
533
534         sugov_iowait_boost(sg_cpu, time, flags);
535         sg_cpu->last_update = time;
536
537         ignore_dl_rate_limit(sg_cpu, sg_policy);
538
539         if (sugov_should_update_freq(sg_policy, time)) {
540                 next_f = sugov_next_freq_shared(sg_cpu, time);
541
542                 if (sg_policy->policy->fast_switch_enabled)
543                         sugov_fast_switch(sg_policy, time, next_f);
544                 else
545                         sugov_deferred_update(sg_policy, time, next_f);
546         }
547
548         raw_spin_unlock(&sg_policy->update_lock);
549 }
550
551 static void sugov_work(struct kthread_work *work)
552 {
553         struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
554         unsigned int freq;
555         unsigned long flags;
556
557         /*
558          * Hold sg_policy->update_lock briefly to handle the case where
559          * sg_policy->next_freq is read here and then updated by
560          * sugov_deferred_update() just before work_in_progress is set to false
561          * here; otherwise we could miss queueing the new update.
562          *
563          * Note: If a work item was queued after the update_lock was released,
564          * sugov_work() will just be called again by the kthread_work code; the
565          * request will be processed before the sugov thread sleeps.
566          */
567         raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
568         freq = sg_policy->next_freq;
569         sg_policy->work_in_progress = false;
570         raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
571
572         mutex_lock(&sg_policy->work_lock);
573         __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
574         mutex_unlock(&sg_policy->work_lock);
575 }
576
577 static void sugov_irq_work(struct irq_work *irq_work)
578 {
579         struct sugov_policy *sg_policy;
580
581         sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
582
583         kthread_queue_work(&sg_policy->worker, &sg_policy->work);
584 }
585
586 /************************** sysfs interface ************************/
587
588 static struct sugov_tunables *global_tunables;
589 static DEFINE_MUTEX(global_tunables_lock);
590
591 static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
592 {
593         return container_of(attr_set, struct sugov_tunables, attr_set);
594 }
595
596 static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
597 {
598         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
599
600         return sprintf(buf, "%u\n", tunables->rate_limit_us);
601 }
602
603 static ssize_t
604 rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
605 {
606         struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
607         struct sugov_policy *sg_policy;
608         unsigned int rate_limit_us;
609
610         if (kstrtouint(buf, 10, &rate_limit_us))
611                 return -EINVAL;
612
613         tunables->rate_limit_us = rate_limit_us;
614
615         list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
616                 sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
617
618         return count;
619 }
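/*
 * Usage sketch (the exact sysfs path is an assumption and depends on whether
 * the governor tunables are per policy):
 *
 *   # cat /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *   # echo 2000 > /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * A write updates freq_update_delay_ns for every policy attached to this
 * tunables set, as done in the loop above.
 */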
620
621 static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
622
623 static struct attribute *sugov_attrs[] = {
624         &rate_limit_us.attr,
625         NULL
626 };
627 ATTRIBUTE_GROUPS(sugov);
628
629 static struct kobj_type sugov_tunables_ktype = {
630         .default_groups = sugov_groups,
631         .sysfs_ops = &governor_sysfs_ops,
632 };
633
634 /********************** cpufreq governor interface *********************/
635
636 struct cpufreq_governor schedutil_gov;
637
638 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
639 {
640         struct sugov_policy *sg_policy;
641
642         sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
643         if (!sg_policy)
644                 return NULL;
645
646         sg_policy->policy = policy;
647         raw_spin_lock_init(&sg_policy->update_lock);
648         return sg_policy;
649 }
650
651 static void sugov_policy_free(struct sugov_policy *sg_policy)
652 {
653         kfree(sg_policy);
654 }
655
656 static int sugov_kthread_create(struct sugov_policy *sg_policy)
657 {
658         struct task_struct *thread;
659         struct sched_attr attr = {
660                 .size           = sizeof(struct sched_attr),
661                 .sched_policy   = SCHED_DEADLINE,
662                 .sched_flags    = SCHED_FLAG_SUGOV,
663                 .sched_nice     = 0,
664                 .sched_priority = 0,
665                 /*
666                  * Fake (unused) bandwidth; workaround to "fix"
667                  * priority inheritance.
668                  */
669                 .sched_runtime  =  1000000,
670                 .sched_deadline = 10000000,
671                 .sched_period   = 10000000,
672         };
673         struct cpufreq_policy *policy = sg_policy->policy;
674         int ret;
675
676         /* kthread only required for slow path */
677         if (policy->fast_switch_enabled)
678                 return 0;
679
680         kthread_init_work(&sg_policy->work, sugov_work);
681         kthread_init_worker(&sg_policy->worker);
682         thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
683                                 "sugov:%d",
684                                 cpumask_first(policy->related_cpus));
685         if (IS_ERR(thread)) {
686                 pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
687                 return PTR_ERR(thread);
688         }
689
690         ret = sched_setattr_nocheck(thread, &attr);
691         if (ret) {
692                 kthread_stop(thread);
693                 pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
694                 return ret;
695         }
696
697         sg_policy->thread = thread;
698         kthread_bind_mask(thread, policy->related_cpus);
699         init_irq_work(&sg_policy->irq_work, sugov_irq_work);
700         mutex_init(&sg_policy->work_lock);
701
702         wake_up_process(thread);
703
704         return 0;
705 }
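/*
 * Note on the sched_attr used above: sched_runtime/deadline/period are in
 * nanoseconds, i.e. a nominal 1 ms of runtime every 10 ms period.  As the
 * comment in the initializer says, this bandwidth is fake/unused for the
 * SCHED_FLAG_SUGOV kthread; it mainly keeps the SCHED_DEADLINE setup path
 * happy while the thread runs at deadline priority.
 */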
706
707 static void sugov_kthread_stop(struct sugov_policy *sg_policy)
708 {
709         /* kthread only required for slow path */
710         if (sg_policy->policy->fast_switch_enabled)
711                 return;
712
713         kthread_flush_worker(&sg_policy->worker);
714         kthread_stop(sg_policy->thread);
715         mutex_destroy(&sg_policy->work_lock);
716 }
717
718 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
719 {
720         struct sugov_tunables *tunables;
721
722         tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
723         if (tunables) {
724                 gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
725                 if (!have_governor_per_policy())
726                         global_tunables = tunables;
727         }
728         return tunables;
729 }
730
731 static void sugov_tunables_free(struct sugov_tunables *tunables)
732 {
733         if (!have_governor_per_policy())
734                 global_tunables = NULL;
735
736         kfree(tunables);
737 }
738
739 static int sugov_init(struct cpufreq_policy *policy)
740 {
741         struct sugov_policy *sg_policy;
742         struct sugov_tunables *tunables;
743         int ret = 0;
744
745         /* State should be equivalent to EXIT */
746         if (policy->governor_data)
747                 return -EBUSY;
748
749         cpufreq_enable_fast_switch(policy);
750
751         sg_policy = sugov_policy_alloc(policy);
752         if (!sg_policy) {
753                 ret = -ENOMEM;
754                 goto disable_fast_switch;
755         }
756
757         ret = sugov_kthread_create(sg_policy);
758         if (ret)
759                 goto free_sg_policy;
760
761         mutex_lock(&global_tunables_lock);
762
763         if (global_tunables) {
764                 if (WARN_ON(have_governor_per_policy())) {
765                         ret = -EINVAL;
766                         goto stop_kthread;
767                 }
768                 policy->governor_data = sg_policy;
769                 sg_policy->tunables = global_tunables;
770
771                 gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
772                 goto out;
773         }
774
775         tunables = sugov_tunables_alloc(sg_policy);
776         if (!tunables) {
777                 ret = -ENOMEM;
778                 goto stop_kthread;
779         }
780
781         tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
782
783         policy->governor_data = sg_policy;
784         sg_policy->tunables = tunables;
785
786         ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
787                                    get_governor_parent_kobj(policy), "%s",
788                                    schedutil_gov.name);
789         if (ret)
790                 goto fail;
791
792 out:
793         mutex_unlock(&global_tunables_lock);
794         return 0;
795
796 fail:
797         kobject_put(&tunables->attr_set.kobj);
798         policy->governor_data = NULL;
799         sugov_tunables_free(tunables);
800
801 stop_kthread:
802         sugov_kthread_stop(sg_policy);
803         mutex_unlock(&global_tunables_lock);
804
805 free_sg_policy:
806         sugov_policy_free(sg_policy);
807
808 disable_fast_switch:
809         cpufreq_disable_fast_switch(policy);
810
811         pr_err("initialization failed (error %d)\n", ret);
812         return ret;
813 }
814
815 static void sugov_exit(struct cpufreq_policy *policy)
816 {
817         struct sugov_policy *sg_policy = policy->governor_data;
818         struct sugov_tunables *tunables = sg_policy->tunables;
819         unsigned int count;
820
821         mutex_lock(&global_tunables_lock);
822
823         count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
824         policy->governor_data = NULL;
825         if (!count)
826                 sugov_tunables_free(tunables);
827
828         mutex_unlock(&global_tunables_lock);
829
830         sugov_kthread_stop(sg_policy);
831         sugov_policy_free(sg_policy);
832         cpufreq_disable_fast_switch(policy);
833 }
834
835 static int sugov_start(struct cpufreq_policy *policy)
836 {
837         struct sugov_policy *sg_policy = policy->governor_data;
838         unsigned int cpu;
839
840         sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
841         sg_policy->last_freq_update_time        = 0;
842         sg_policy->next_freq                    = 0;
843         sg_policy->work_in_progress             = false;
844         sg_policy->limits_changed               = false;
845         sg_policy->need_freq_update             = false;
846         sg_policy->cached_raw_freq              = 0;
847
848         for_each_cpu(cpu, policy->cpus) {
849                 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
850
851                 memset(sg_cpu, 0, sizeof(*sg_cpu));
852                 sg_cpu->cpu                     = cpu;
853                 sg_cpu->sg_policy               = sg_policy;
854         }
855
856         for_each_cpu(cpu, policy->cpus) {
857                 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
858
859                 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
860                                              policy_is_shared(policy) ?
861                                                         sugov_update_shared :
862                                                         sugov_update_single);
863         }
864         return 0;
865 }
866
867 static void sugov_stop(struct cpufreq_policy *policy)
868 {
869         struct sugov_policy *sg_policy = policy->governor_data;
870         unsigned int cpu;
871
872         for_each_cpu(cpu, policy->cpus)
873                 cpufreq_remove_update_util_hook(cpu);
874
875         synchronize_rcu();
876
877         if (!policy->fast_switch_enabled) {
878                 irq_work_sync(&sg_policy->irq_work);
879                 kthread_cancel_work_sync(&sg_policy->work);
880         }
881 }
882
883 static void sugov_limits(struct cpufreq_policy *policy)
884 {
885         struct sugov_policy *sg_policy = policy->governor_data;
886
887         if (!policy->fast_switch_enabled) {
888                 mutex_lock(&sg_policy->work_lock);
889                 cpufreq_policy_apply_limits(policy);
890                 mutex_unlock(&sg_policy->work_lock);
891         }
892
893         sg_policy->limits_changed = true;
894 }
895
896 struct cpufreq_governor schedutil_gov = {
897         .name                   = "schedutil",
898         .owner                  = THIS_MODULE,
899         .dynamic_switching      = true,
900         .init                   = sugov_init,
901         .exit                   = sugov_exit,
902         .start                  = sugov_start,
903         .stop                   = sugov_stop,
904         .limits                 = sugov_limits,
905 };
906
907 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
908 struct cpufreq_governor *cpufreq_default_governor(void)
909 {
910         return &schedutil_gov;
911 }
912 #endif
913
914 static int __init sugov_register(void)
915 {
916         return cpufreq_register_governor(&schedutil_gov);
917 }
918 core_initcall(sugov_register);
919
920 #ifdef CONFIG_ENERGY_MODEL
921 extern bool sched_energy_update;
922 extern struct mutex sched_energy_mutex;
923
924 static void rebuild_sd_workfn(struct work_struct *work)
925 {
926         mutex_lock(&sched_energy_mutex);
927         sched_energy_update = true;
928         rebuild_sched_domains();
929         sched_energy_update = false;
930         mutex_unlock(&sched_energy_mutex);
931 }
932 static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
933
934 /*
935  * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
936  * on governor changes to make sure the scheduler knows about it.
937  */
938 void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
939                                   struct cpufreq_governor *old_gov)
940 {
941         if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
942                 /*
943                  * When called from the cpufreq_register_driver() path, the
944                  * cpu_hotplug_lock is already held, so use a work item to
945                  * avoid nested locking in rebuild_sched_domains().
946                  */
947                 schedule_work(&rebuild_sd_work);
948         }
949
950 }
951 #endif