nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i];
nr_masks = max(nr_masks, 1);
for (i = 0; i < nr_masks; i++) {
- mask->next = alloc_bootmem(sizeof(struct mask_info));
+ mask->next = alloc_bootmem_align(
+ roundup_pow_of_two(sizeof(struct mask_info)),
+ roundup_pow_of_two(sizeof(struct mask_info)));
mask = mask->next;
}
}
return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group);
}
+ const struct cpumask *cpu_coregroup_mask(int cpu)
+ {
+ return &cpu_topology[cpu].core_mask;
+ }
+
+ static const struct cpumask *cpu_book_mask(int cpu)
+ {
+ return &cpu_topology[cpu].book_mask;
+ }
+
+ static struct sched_domain_topology_level s390_topology[] = {
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_book_mask, SD_INIT_NAME(BOOK) },
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+ };
+
static int __init topology_init(void)
{
if (!MACHINE_HAS_TOPOLOGY) {
}
set_topology_timer();
out:
+
+ set_sched_topology(s390_topology);
+
return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
}
device_initcall(topology_init);
struct kernel_param *kp)
{
int ret;
+
ret = param_set_int(val, kp);
if (binder_stop_on_user_error < 2)
wake_up(&binder_user_error_wait);
struct binder_transaction_log *log)
{
struct binder_transaction_log_entry *e;
+
e = &log->entry[log->next];
memset(e, 0, sizeof(*e));
log->next++;
static void binder_set_nice(long nice)
{
long min_nice;
+
if (can_nice(current, nice)) {
set_user_nice(current, nice);
return;
}
- min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
+ min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
binder_debug(BINDER_DEBUG_PRIORITY_CAP,
"%d: nice value %ld not allowed use %ld instead\n",
current->pid, nice, min_nice);
set_user_nice(current, min_nice);
- if (min_nice < 20)
+ if (min_nice <= MAX_NICE)
return;
binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
}
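For reference, the rlimit_to_nice()/nice_to_rlimit() helpers used by this hunk and by the can_nice() hunk later in the patch implement exactly the mapping of the removed open-coded "20 - x" expressions; a minimal sketch, with MAX_NICE == 19, MIN_NICE == -20 and NICE_WIDTH == 40 taken from the surrounding hunks (the real definitions live in the nice/priority headers and may differ in form):

static inline long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* nice 19 -> rlimit 1, nice -20 -> rlimit 40 */
}

static inline long rlimit_to_nice(long rlim)
{
	return MAX_NICE - rlim + 1;	/* rlimit 1 -> nice 19, rlimit 40 -> nice -20 */
}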
for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
int ret;
struct page **page_array_ptr;
+
page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
BUG_ON(*page);
binder_insert_allocated_buffer(proc, buffer);
if (buffer_size != size) {
struct binder_buffer *new_buffer = (void *)buffer->data + size;
+
list_add(&new_buffer->entry, &buffer->entry);
new_buffer->free = 1;
binder_insert_free_buffer(proc, new_buffer);
if (!list_is_last(&buffer->entry, &proc->buffers)) {
struct binder_buffer *next = list_entry(buffer->entry.next,
struct binder_buffer, entry);
+
if (next->free) {
rb_erase(&next->rb_node, &proc->free_buffers);
binder_delete_free_buffer(proc, next);
if (proc->buffers.next != &buffer->entry) {
struct binder_buffer *prev = list_entry(buffer->entry.prev,
struct binder_buffer, entry);
+
if (prev->free) {
binder_delete_free_buffer(proc, buffer);
rb_erase(&prev->rb_node, &proc->free_buffers);
struct list_head *target_list)
{
int ret;
+
if (strong) {
if (ref->strong == 0) {
ret = binder_inc_node(ref->node, 1, 1, target_list);
ref->strong--;
if (ref->strong == 0) {
int ret;
+
ret = binder_dec_node(ref->node, strong, 1);
if (ret)
return ret;
uint32_t error_code)
{
struct binder_thread *target_thread;
+
BUG_ON(t->flags & TF_ONE_WAY);
while (1) {
target_thread = t->from;
off_end = (void *)offp + buffer->offsets_size;
for (; offp < off_end; offp++) {
struct flat_binder_object *fp;
+
if (*offp > buffer->data_size - sizeof(*fp) ||
buffer->data_size < sizeof(*fp) ||
!IS_ALIGNED(*offp, sizeof(u32))) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
struct binder_node *node = binder_get_node(proc, fp->binder);
+
if (node == NULL) {
pr_err("transaction release %d bad node %016llx\n",
debug_id, (u64)fp->binder);
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
struct binder_ref *ref = binder_get_ref(proc, fp->handle);
+
if (ref == NULL) {
pr_err("transaction release %d bad handle %d\n",
debug_id, fp->handle);
} else {
if (tr->target.handle) {
struct binder_ref *ref;
+
ref = binder_get_ref(proc, tr->target.handle);
if (ref == NULL) {
binder_user_error("%d:%d got transaction to invalid handle\n",
}
if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
struct binder_transaction *tmp;
+
tmp = thread->transaction_stack;
if (tmp->to_thread != thread) {
binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n",
t->from = thread;
else
t->from = NULL;
- t->sender_euid = proc->tsk->cred->euid;
+ t->sender_euid = task_euid(proc->tsk);
t->to_proc = target_proc;
t->to_thread = target_thread;
t->code = tr->code;
off_end = (void *)offp + tr->offsets_size;
for (; offp < off_end; offp++) {
struct flat_binder_object *fp;
+
if (*offp > t->buffer->data_size - sizeof(*fp) ||
t->buffer->data_size < sizeof(*fp) ||
!IS_ALIGNED(*offp, sizeof(u32))) {
case BINDER_TYPE_WEAK_BINDER: {
struct binder_ref *ref;
struct binder_node *node = binder_get_node(proc, fp->binder);
+
if (node == NULL) {
node = binder_new_node(proc, fp->binder, fp->cookie);
if (node == NULL) {
proc->pid, thread->pid,
(u64)fp->binder, node->debug_id,
(u64)fp->cookie, (u64)node->cookie);
+ return_error = BR_FAILED_REPLY;
goto err_binder_get_ref_for_node_failed;
}
ref = binder_get_ref_for_node(target_proc, node);
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
struct binder_ref *ref = binder_get_ref(proc, fp->handle);
+
if (ref == NULL) {
binder_user_error("%d:%d got transaction with invalid handle, %d\n",
proc->pid,
(u64)ref->node->ptr);
} else {
struct binder_ref *new_ref;
+
new_ref = binder_get_ref_for_node(target_proc, ref->node);
if (new_ref == NULL) {
return_error = BR_FAILED_REPLY;
{
struct binder_transaction_log_entry *fe;
+
fe = binder_transaction_log_add(&binder_transaction_log_failed);
*fe = *e;
}
struct binder_work *w;
binder_uintptr_t cookie;
struct binder_ref_death *death = NULL;
+
if (get_user(cookie, (binder_uintptr_t __user *)ptr))
return -EFAULT;
ptr += sizeof(void *);
list_for_each_entry(w, &proc->delivered_death, entry) {
struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work);
+
if (tmp_death->cookie == cookie) {
death = tmp_death;
break;
const char *cmd_name;
int strong = node->internal_strong_refs || node->local_strong_refs;
int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong;
+
if (weak && !node->has_weak_ref) {
cmd = BR_INCREFS;
cmd_name = "BR_INCREFS";
BUG_ON(t->buffer == NULL);
if (t->buffer->target_node) {
struct binder_node *target_node = t->buffer->target_node;
+
tr.target.ptr = target_node->ptr;
tr.cookie = target_node->cookie;
t->saved_priority = task_nice(current);
if (t->from) {
struct task_struct *sender = t->from->proc->tsk;
+
tr.sender_pid = task_tgid_nr_ns(sender,
task_active_pid_ns(current));
} else {
static void binder_release_work(struct list_head *list)
{
struct binder_work *w;
+
while (!list_empty(list)) {
w = list_first_entry(list, struct binder_work, entry);
list_del_init(&w->entry);
struct binder_thread *thread;
unsigned int size = _IOC_SIZE(cmd);
void __user *ubuf = (void __user *)arg;
+ kuid_t curr_euid = current_euid();
/*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/
switch (cmd) {
case BINDER_WRITE_READ: {
struct binder_write_read bwr;
+
if (size != sizeof(struct binder_write_read)) {
ret = -EINVAL;
goto err;
goto err;
}
if (uid_valid(binder_context_mgr_uid)) {
- if (!uid_eq(binder_context_mgr_uid, current->cred->euid)) {
+ if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
- from_kuid(&init_user_ns, current->cred->euid),
+ from_kuid(&init_user_ns, curr_euid),
from_kuid(&init_user_ns, binder_context_mgr_uid));
ret = -EPERM;
goto err;
}
- } else
- binder_context_mgr_uid = current->cred->euid;
+ } else {
+ binder_context_mgr_uid = curr_euid;
+ }
binder_context_mgr_node = binder_new_node(proc, 0, 0);
if (binder_context_mgr_node == NULL) {
ret = -ENOMEM;
binder_free_thread(proc, thread);
thread = NULL;
break;
- case BINDER_VERSION:
+ case BINDER_VERSION: {
+ struct binder_version __user *ver = ubuf;
+
if (size != sizeof(struct binder_version)) {
ret = -EINVAL;
goto err;
}
- if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, &((struct binder_version *)ubuf)->protocol_version)) {
+ if (put_user(BINDER_CURRENT_PROTOCOL_VERSION,
+ &ver->protocol_version)) {
ret = -EINVAL;
goto err;
}
break;
+ }
default:
ret = -EINVAL;
goto err;
static void binder_vma_open(struct vm_area_struct *vma)
{
struct binder_proc *proc = vma->vm_private_data;
+
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
"%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
proc->pid, vma->vm_start, vma->vm_end,
static void binder_vma_close(struct vm_area_struct *vma)
{
struct binder_proc *proc = vma->vm_private_data;
+
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
"%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
proc->pid, vma->vm_start, vma->vm_end,
if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
+
snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO,
binder_debugfs_dir_entry_proc, proc, &binder_proc_fops);
{
struct rb_node *n;
int wake_count = 0;
+
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node);
+
thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN;
if (thread->looper & BINDER_LOOPER_STATE_WAITING) {
wake_up_interruptible(&thread->wait);
static int binder_release(struct inode *nodp, struct file *filp)
{
struct binder_proc *proc = filp->private_data;
+
debugfs_remove(proc->debugfs_entry);
binder_defer_work(proc, BINDER_DEFERRED_RELEASE);
struct files_struct *files;
int defer;
+
do {
binder_lock(__func__);
mutex_lock(&binder_deferred_lock);
loff_t lo_offset;
loff_t lo_sizelimit;
int lo_flags;
- int (*ioctl)(struct lloop_device *, int cmd,
- unsigned long arg);
-
struct file *lo_backing_file;
struct block_device *lo_device;
unsigned lo_blocksize;
int refcheck;
int ret = 0;
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
lo->lo_state = LLOOP_BOUND;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
lo->lo_backing_file = file;
- lo->ioctl = NULL;
lo->lo_sizelimit = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
down(&lo->lo_sem);
lo->lo_backing_file = NULL;
- lo->ioctl = NULL;
lo->lo_device = NULL;
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
case LL_IOC_LLOOP_INFO: {
struct lu_fid fid;
- LASSERT(lo->lo_backing_file != NULL);
+ if (lo->lo_backing_file == NULL) {
+ err = -ENOENT;
+ break;
+ }
if (inode == NULL)
inode = lo->lo_backing_file->f_dentry->d_inode;
if (lo->lo_state == LLOOP_BOUND)
#define TASK_PARKED 512
#define TASK_STATE_MAX 1024
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
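The build-time assertion above still works out with the reordered string: TASK_STATE_MAX is 1024, so ilog2(1024) + 1 == 11, and "RSDTtXZxKWP" is 11 characters (sizeof minus the terminating NUL), so the inequality is false and ___assert_task_state keeps a legal size of 1. If the string and the number of state bits ever drift apart, the array size becomes -1 and compilation fails.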
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
- extern int __weak arch_sd_sibiling_asym_packing(void);
+ #ifdef CONFIG_SCHED_SMT
+ static inline const int cpu_smt_flags(void)
+ {
+ return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+ }
+ #endif
+
+ #ifdef CONFIG_SCHED_MC
+ static inline const int cpu_core_flags(void)
+ {
+ return SD_SHARE_PKG_RESOURCES;
+ }
+ #endif
+
+ #ifdef CONFIG_NUMA
+ static inline const int cpu_numa_flags(void)
+ {
+ return SD_NUMA;
+ }
+ #endif
struct sched_domain_attr {
int relax_domain_level;
bool cpus_share_cache(int this_cpu, int that_cpu);
+ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+ typedef const int (*sched_domain_flags_f)(void);
+
+ #define SDTL_OVERLAP 0x01
+
+ struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
+ };
+
+ struct sched_domain_topology_level {
+ sched_domain_mask_f mask;
+ sched_domain_flags_f sd_flags;
+ int flags;
+ int numa_level;
+ struct sd_data data;
+ #ifdef CONFIG_SCHED_DEBUG
+ char *name;
+ #endif
+ };
+
+ extern struct sched_domain_topology_level *sched_domain_topology;
+
+ extern void set_sched_topology(struct sched_domain_topology_level *tl);
+
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(type) .name = #type
+ #else
+ # define SD_INIT_NAME(type)
+ #endif
+
#else /* CONFIG_SMP */
struct sched_domain_attr;
/*
* Original scheduling parameters. Copied here from sched_attr
- * during sched_setscheduler2(), they will remain the same until
- * the next sched_setscheduler2().
+ * during sched_setattr(), they will remain the same until
+ * the next sched_setattr().
*/
u64 dl_runtime; /* maximum runtime for each instance */
u64 dl_deadline; /* relative deadline of each instance */
/*
* Idle thread specific functions to determine the need_resched
- * polling state. We have two versions, one based on TS_POLLING in
- * thread_info.status and one based on TIF_POLLING_NRFLAG in
- * thread_info.flags
+ * polling state.
*/
- #ifdef TS_POLLING
- static inline int tsk_is_polling(struct task_struct *p)
- {
- return task_thread_info(p)->status & TS_POLLING;
- }
- static inline void __current_set_polling(void)
- {
- current_thread_info()->status |= TS_POLLING;
- }
-
- static inline bool __must_check current_set_polling_and_test(void)
- {
- __current_set_polling();
-
- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- */
- smp_mb();
-
- return unlikely(tif_need_resched());
- }
-
- static inline void __current_clr_polling(void)
- {
- current_thread_info()->status &= ~TS_POLLING;
- }
-
- static inline bool __must_check current_clr_polling_and_test(void)
- {
- __current_clr_polling();
-
- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- */
- smp_mb();
-
- return unlikely(tif_need_resched());
- }
- #elif defined(TIF_POLLING_NRFLAG)
+ #ifdef TIF_POLLING_NRFLAG
static inline int tsk_is_polling(struct task_struct *p)
{
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
/*
* Polling state must be visible before we test NEED_RESCHED,
* paired by resched_task()
- *
- * XXX: assumes set/clear bit are identical barrier wise.
*/
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
return unlikely(tif_need_resched());
}
* Polling state must be visible before we test NEED_RESCHED,
* paired by resched_task()
*/
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
return unlikely(tif_need_resched());
}
};
static struct lock_writer_stress_stats *lwsa;
-#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
+#if defined(MODULE)
#define LOCKTORTURE_RUNNABLE_INIT 1
#else
#define LOCKTORTURE_RUNNABLE_INIT 0
#endif
int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
module_param(locktorture_runnable, int, 0444);
-MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
+MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
/* Forward reference. */
static void lock_torture_cleanup(void);
static DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, 19);
+ set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1);
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_write_lock_fail++;
&lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
};
- torture_init_begin(torture_type, verbose, &locktorture_runnable);
+ if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+ smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
}
#endif /* CONFIG_SCHED_HRTICK */
+ /*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+ #define fetch_or(ptr, val) \
+ ({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+ })
+
+ #ifdef TIF_POLLING_NRFLAG
+ /*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+ static bool set_nr_and_not_polling(struct task_struct *p)
+ {
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ }
+ #else
+ static bool set_nr_and_not_polling(struct task_struct *p)
+ {
+ set_tsk_need_resched(p);
+ return true;
+ }
+ #endif
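A minimal illustration of what the helper relies on (hypothetical snippet, kernel context and the arch _TIF_* flag definitions assumed): fetch_or() hands back the flag word as it was before the OR, so set_nr_and_not_polling() sets TIF_NEED_RESCHED and learns in the same atomic operation whether the target was polling:

unsigned long flags = _TIF_POLLING_NRFLAG;	/* idle task busy-polling on its flags */
unsigned long old;

old = fetch_or(&flags, _TIF_NEED_RESCHED);
/* old still shows _TIF_POLLING_NRFLAG and flags now carries both bits,
 * so !(old & _TIF_POLLING_NRFLAG) is false: resched_task() below skips
 * the IPI and the polling CPU notices NEED_RESCHED on its own. */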
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
+
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
}
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
- if (nice < MIN_NICE)
- nice = MIN_NICE;
- if (nice > MAX_NICE)
- nice = MAX_NICE;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
- out:
- return ret;
+ return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
- ret = -E2BIG;
- goto out;
+ return -E2BIG;
}
/**
for (; addr < end; addr++) {
if (*addr)
- goto err_size;
+ return -EFBIG;
}
attr->size = usize;
if (ret)
return -EFAULT;
- out:
- return ret;
-
- err_size:
- ret = -E2BIG;
- goto out;
+ return 0;
}
/**
int __sched _cond_resched(void)
{
+ rcu_cond_resched();
if (should_resched()) {
__cond_resched();
return 1;
*/
int __cond_resched_lock(spinlock_t *lock)
{
+ bool need_rcu_resched = rcu_should_resched();
int resched = should_resched();
int ret = 0;
lockdep_assert_held(lock);
- if (spin_needbreak(lock) || resched) {
+ if (spin_needbreak(lock) || resched || need_rcu_resched) {
spin_unlock(lock);
if (resched)
__cond_resched();
+ else if (unlikely(need_rcu_resched))
+ rcu_resched();
else
cpu_relax();
ret = 1;
{
BUG_ON(!in_softirq());
+ rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
if (should_resched()) {
local_bh_enable();
__cond_resched();
.priority = CPU_PRI_MIGRATION,
};
+ static void set_cpu_rq_start_time(void)
+ {
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+ }
+
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING);
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
__setup("isolcpus=", isolated_cpu_setup);
- static const struct cpumask *cpu_cpu_mask(int cpu)
- {
- return cpumask_of_node(cpu_to_node(cpu));
- }
-
- struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
- };
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
sa_none,
};
- struct sched_domain_topology_level;
-
- typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
- typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
- #define SDTL_OVERLAP 0x01
-
- struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- int numa_level;
- struct sd_data data;
- };
-
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
continue;
group = get_group(i, sdd, &sg);
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
- int __weak arch_sd_sibling_asym_packing(void)
- {
- return 0*SD_ASYM_PACKING;
- }
-
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
- #ifdef CONFIG_SCHED_DEBUG
- # define SD_INIT_NAME(sd, type) sd->name = #type
- #else
- # define SD_INIT_NAME(sd, type) do { } while (0)
- #endif
-
- #define SD_INIT_FUNC(type) \
- static noinline struct sched_domain * \
- sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
- { \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
- }
-
- SD_INIT_FUNC(CPU)
- #ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
- #endif
- #ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
- #endif
- #ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
- #endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
- #ifdef CONFIG_SCHED_SMT
- static const struct cpumask *cpu_smt_mask(int cpu)
- {
- return topology_thread_cpumask(cpu);
- }
- #endif
-
- /*
- * Topology list, bottom-up.
- */
- static struct sched_domain_topology_level default_topology[] = {
- #ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
- #endif
- #ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
- #endif
- #ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
- #endif
- { sd_init_CPU, cpu_cpu_mask, },
- { NULL, },
- };
-
- static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
- #define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->init; tl++)
-
#ifdef CONFIG_NUMA
-
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+ #endif
- static inline int sd_local_flags(int level)
- {
- if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- return 0;
-
- return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
- }
+ /*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+ #define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUPOWER | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
static struct sched_domain *
- sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+ sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int level = tl->numa_level;
- int sd_weight = cpumask_weight(
- sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ int sd_weight, sd_flags = 0;
+
+ #ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+ #endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
- .cache_nice_tries = 2,
- .busy_idx = 3,
- .idle_idx = 2,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
- | 0*SD_BALANCE_EXEC
- | 0*SD_BALANCE_FORK
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
- | 0*SD_WAKE_AFFINE
+ | 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
- | 1*SD_SERIALIZE
+ | 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
- | sd_local_flags(level)
+ | 0*SD_NUMA
+ | sd_flags
,
+
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
+ #ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+ #endif
};
- SD_INIT_NAME(sd, NUMA);
- sd->private = &tl->data;
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Convert topological properties into behaviour.
*/
- sched_domains_curr_level = tl->numa_level;
+
+ if (sd->flags & SD_SHARE_CPUPOWER) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+ #ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+ #endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
return sd;
}
+ /*
+ * Topology list, bottom-up.
+ */
+ static struct sched_domain_topology_level default_topology[] = {
+ #ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ #endif
+ #ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ #endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+ };
+
+ struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+ #define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+ void set_sched_topology(struct sched_domain_topology_level *tl)
+ {
+ sched_domain_topology = tl;
+ }
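A hypothetical usage sketch (invented names, mirroring the s390 hunk at the top of this patch): an architecture that wants something other than default_topology builds its own bottom-up table, leaving sd_flags unset on levels with no special sharing semantics, and installs it with set_sched_topology() from its topology setup code. cpu_core_flags() assumes CONFIG_SCHED_MC.

static struct sched_domain_topology_level my_arch_topology[] = {
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init my_arch_set_topology(void)
{
	set_sched_topology(my_arch_topology);
}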
+
+ #ifdef CONFIG_NUMA
+
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
}
- tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
/*
* Copy the default topology bits..
*/
- for (i = 0; default_topology[i].init; i++)
- tl[i] = default_topology[i];
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
- .init = sd_numa_init,
.mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
+ SD_INIT_NAME(NUMA)
};
}
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();