kernel/sched/membarrier.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   4  *
   5  * membarrier system call
   6  */
   7 #include "sched.h"
   8
   9 /*
  10  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  11  * except MEMBARRIER_CMD_QUERY.
  12  */
  13 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
  14 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
  15         (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE                     \
  16         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
  17 #else
  18 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
  19 #endif
  20
  21 #define MEMBARRIER_CMD_BITMASK                                          \
  22         (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
  23         | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
  24         | MEMBARRIER_CMD_PRIVATE_EXPEDITED                              \
  25         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
  26         | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  27
  28 static void ipi_mb(void *info)
  29 {
  30         smp_mb();       /* IPIs should be serializing but paranoid. */
  31 }
  32
  33 static void ipi_sync_rq_state(void *info)
  34 {
  35         struct mm_struct *mm = (struct mm_struct *) info;
  36
  37         if (current->mm != mm)
  38                 return;
  39         this_cpu_write(runqueues.membarrier_state,
  40                        atomic_read(&mm->membarrier_state));
  41         /*
  42          * Issue a memory barrier after setting
  43          * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
  44          * guarantee that no memory access following registration is reordered
  45          * before registration.
  46          */
  47         smp_mb();
  48 }
  49
  50 void membarrier_exec_mmap(struct mm_struct *mm)
  51 {
  52         /*
  53          * Issue a memory barrier before clearing membarrier_state to
  54          * guarantee that no memory access prior to exec is reordered after
  55          * clearing this state.
  56          */
  57         smp_mb();
  58         atomic_set(&mm->membarrier_state, 0);
  59         /*
  60          * Keep the runqueue membarrier_state in sync with this mm
  61          * membarrier_state.
  62          */
  63         this_cpu_write(runqueues.membarrier_state, 0);
  64 }
  65
  66 static int membarrier_global_expedited(void)
  67 {
  68         int cpu;
  69         cpumask_var_t tmpmask;
  70
  71         if (num_online_cpus() == 1)
  72                 return 0;
  73
  74         /*
  75          * Matches memory barriers around rq->curr modification in
  76          * scheduler.
  77          */
  78         smp_mb();       /* system call entry is not a mb. */
  79
  80         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
  81                 return -ENOMEM;
  82
  83         cpus_read_lock();
  84         rcu_read_lock();
  85         for_each_online_cpu(cpu) {
  86                 struct task_struct *p;
  87
  88                 /*
  89                  * Skipping the current CPU is OK even through we can be
  90                  * migrated at any point. The current CPU, at the point
  91                  * where we read raw_smp_processor_id(), is ensured to
  92                  * be in program order with respect to the caller
  93                  * thread. Therefore, we can skip this CPU from the
  94                  * iteration.
  95                  */
  96                 if (cpu == raw_smp_processor_id())
  97                         continue;
  98
  99                 if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
 100                     MEMBARRIER_STATE_GLOBAL_EXPEDITED))
 101                         continue;
 102
 103                 /*
 104                  * Skip the CPU if it runs a kernel thread. The scheduler
 105                  * leaves the prior task mm in place as an optimization when
 106                  * scheduling a kthread.
 107                  */
 108                 p = rcu_dereference(cpu_rq(cpu)->curr);
 109                 if (p->flags & PF_KTHREAD)
 110                         continue;
 111
 112                 __cpumask_set_cpu(cpu, tmpmask);
 113         }
 114         rcu_read_unlock();
 115
 116         preempt_disable();
 117         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 118         preempt_enable();
 119
 120         free_cpumask_var(tmpmask);
 121         cpus_read_unlock();
 122
 123         /*
 124          * Memory barrier on the caller thread _after_ we finished
 125          * waiting for the last IPI. Matches memory barriers around
 126          * rq->curr modification in scheduler.
 127          */
 128         smp_mb();       /* exit from system call is not a mb */
 129         return 0;
 130 }
 131
 132 static int membarrier_private_expedited(int flags)
 133 {
 134         int cpu;
 135         cpumask_var_t tmpmask;
 136         struct mm_struct *mm = current->mm;
 137
 138         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 139                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 140                         return -EINVAL;
 141                 if (!(atomic_read(&mm->membarrier_state) &
 142                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 143                         return -EPERM;
 144         } else {
 145                 if (!(atomic_read(&mm->membarrier_state) &
 146                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 147                         return -EPERM;
 148         }
 149
 150         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
 151                 return 0;
 152
 153         /*
 154          * Matches memory barriers around rq->curr modification in
 155          * scheduler.
 156          */
 157         smp_mb();       /* system call entry is not a mb. */
 158
 159         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 160                 return -ENOMEM;
 161
 162         cpus_read_lock();
 163         rcu_read_lock();
 164         for_each_online_cpu(cpu) {
 165                 struct task_struct *p;
 166
 167                 /*
 168                  * Skipping the current CPU is OK even through we can be
 169                  * migrated at any point. The current CPU, at the point
 170                  * where we read raw_smp_processor_id(), is ensured to
 171                  * be in program order with respect to the caller
 172                  * thread. Therefore, we can skip this CPU from the
 173                  * iteration.
 174                  */
 175                 if (cpu == raw_smp_processor_id())
 176                         continue;
 177                 rcu_read_lock();
 178                 p = rcu_dereference(cpu_rq(cpu)->curr);
 179                 if (p && p->mm == mm)
 180                         __cpumask_set_cpu(cpu, tmpmask);
 181         }
 182         rcu_read_unlock();
 183
 184         preempt_disable();
 185         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 186         preempt_enable();
 187
 188         free_cpumask_var(tmpmask);
 189         cpus_read_unlock();
 190
 191         /*
 192          * Memory barrier on the caller thread _after_ we finished
 193          * waiting for the last IPI. Matches memory barriers around
 194          * rq->curr modification in scheduler.
 195          */
 196         smp_mb();       /* exit from system call is not a mb */
 197
 198         return 0;
 199 }
 200
 201 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
 202 {
 203         int membarrier_state = atomic_read(&mm->membarrier_state);
 204         cpumask_var_t tmpmask;
 205         int cpu;
 206
 207         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
 208                 this_cpu_write(runqueues.membarrier_state, membarrier_state);
 209
 210                 /*
 211                  * For single mm user, we can simply issue a memory barrier
 212                  * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
 213                  * mm and in the current runqueue to guarantee that no memory
 214                  * access following registration is reordered before
 215                  * registration.
 216                  */
 217                 smp_mb();
 218                 return 0;
 219         }
 220
 221         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 222                 return -ENOMEM;
 223
 224         /*
 225          * For mm with multiple users, we need to ensure all future
 226          * scheduler executions will observe @mm's new membarrier
 227          * state.
 228          */
 229         synchronize_rcu();
 230
 231         /*
 232          * For each cpu runqueue, if the task's mm match @mm, ensure that all
 233          * @mm's membarrier state set bits are also set in in the runqueue's
 234          * membarrier state. This ensures that a runqueue scheduling
 235          * between threads which are users of @mm has its membarrier state
 236          * updated.
 237          */
 238         cpus_read_lock();
 239         rcu_read_lock();
 240         for_each_online_cpu(cpu) {
 241                 struct rq *rq = cpu_rq(cpu);
 242                 struct task_struct *p;
 243
 244                 p = rcu_dereference(rq->curr);
 245                 if (p && p->mm == mm)
 246                         __cpumask_set_cpu(cpu, tmpmask);
 247         }
 248         rcu_read_unlock();
 249
 250         preempt_disable();
 251         smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
 252         preempt_enable();
 253
 254         free_cpumask_var(tmpmask);
 255         cpus_read_unlock();
 256
 257         return 0;
 258 }
 259
 260 static int membarrier_register_global_expedited(void)
 261 {
 262         struct task_struct *p = current;
 263         struct mm_struct *mm = p->mm;
 264         int ret;
 265
 266         if (atomic_read(&mm->membarrier_state) &
 267             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 268                 return 0;
 269         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 270         ret = sync_runqueues_membarrier_state(mm);
 271         if (ret)
 272                 return ret;
 273         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 274                   &mm->membarrier_state);
 275
 276         return 0;
 277 }
 278
 279 static int membarrier_register_private_expedited(int flags)
 280 {
 281         struct task_struct *p = current;
 282         struct mm_struct *mm = p->mm;
 283         int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
 284             set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 285             ret;
 286
 287         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 288                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 289                         return -EINVAL;
 290                 ready_state =
 291                         MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 292         }
 293
 294         /*
 295          * We need to consider threads belonging to different thread
 296          * groups, which use the same mm. (CLONE_VM but not
 297          * CLONE_THREAD).
 298          */
 299         if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 300                 return 0;
 301         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 302                 set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
 303         atomic_or(set_state, &mm->membarrier_state);
 304         ret = sync_runqueues_membarrier_state(mm);
 305         if (ret)
 306                 return ret;
 307         atomic_or(ready_state, &mm->membarrier_state);
 308
 309         return 0;
 310 }
 311
 312 /**
 313  * sys_membarrier - issue memory barriers on a set of threads
 314  * @cmd:   Takes command values defined in enum membarrier_cmd.
 315  * @flags: Currently needs to be 0. For future extensions.
 316  *
 317  * If this system call is not implemented, -ENOSYS is returned. If the
 318  * command specified does not exist, not available on the running
 319  * kernel, or if the command argument is invalid, this system call
 320  * returns -EINVAL. For a given command, with flags argument set to 0,
 321  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 322  * always return the same value until reboot. In addition, it can return
 323  * -ENOMEM if there is not enough memory available to perform the system
 324  * call.
 325  *
 326  * All memory accesses performed in program order from each targeted thread
 327  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 328  * the semantic "barrier()" to represent a compiler barrier forcing memory
 329  * accesses to be performed in program order across the barrier, and
 330  * smp_mb() to represent explicit memory barriers forcing full memory
 331  * ordering across the barrier, we have the following ordering table for
 332  * each pair of barrier(), sys_membarrier() and smp_mb():
 333  *
 334  * The pair ordering is detailed as (O: ordered, X: not ordered):
 335  *
 336  *                        barrier()   smp_mb() sys_membarrier()
 337  *        barrier()          X           X            O
 338  *        smp_mb()           X           O            O
 339  *        sys_membarrier()   O           O            O
 340  */
 341 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 342 {
 343         if (unlikely(flags))
 344                 return -EINVAL;
 345         switch (cmd) {
 346         case MEMBARRIER_CMD_QUERY:
 347         {
 348                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 349
 350                 if (tick_nohz_full_enabled())
 351                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 352                 return cmd_mask;
 353         }
 354         case MEMBARRIER_CMD_GLOBAL:
 355                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 356                 if (tick_nohz_full_enabled())
 357                         return -EINVAL;
 358                 if (num_online_cpus() > 1)
 359                         synchronize_rcu();
 360                 return 0;
 361         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 362                 return membarrier_global_expedited();
 363         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 364                 return membarrier_register_global_expedited();
 365         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 366                 return membarrier_private_expedited(0);
 367         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 368                 return membarrier_register_private_expedited(0);
 369         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 370                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 371         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 372                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 373         default:
 374                 return -EINVAL;
 375         }
 376 }