x86/mm: Don't reenter flush_tlb_func_common()

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af78d4fd9fec1f4144542b91ce5892b..1cc47838d1e817b3579f7b78c64ee5ead263ffb4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
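As context for the hunks that follow: flush_tlb_func_common() (introduced in this change) must not be reentered by a remote flush IPI, so it warns when invoked with interrupts enabled, and its local callers bracket it with local_irq_disable()/local_irq_enable(). The snippet below is a small standalone userspace model of that contract, not kernel code; the assert() stands in for the patch's VM_WARN_ON(!irqs_disabled()), and the interrupt flag is simulated.

/*
 * Userspace model of the reentrancy contract enforced by this patch:
 * flush_tlb_func_common() only runs with interrupts off, so a flush IPI
 * cannot arrive mid-flush and reenter it.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct flush_tlb_info {
	unsigned long start;
	unsigned long end;
};

static bool irqs_enabled = true;	/* stand-in for the CPU interrupt flag */

static void local_irq_disable(void) { irqs_enabled = false; }
static void local_irq_enable(void)  { irqs_enabled = true; }

static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local)
{
	/* Models VM_WARN_ON(!irqs_disabled()) from the hunk below. */
	assert(!irqs_enabled);
	printf("flush %#lx-%#lx (%s)\n", f->start, f->end,
	       local ? "local" : "remote");
}

int main(void)
{
	struct flush_tlb_info info = { .start = 0x1000, .end = 0x2000 };

	/* Callers disable interrupts around the local flush, as the patch does. */
	local_irq_disable();
	flush_tlb_func_common(&info, true);
	local_irq_enable();
	return 0;
}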
@@ -15,7 +15,7 @@
 #include <linux/debugfs.h>
 
 /*
- *     Smarter SMP flushing macros.
+ *     TLB flushing, formerly SMP-only
  *             c/o Linus Torvalds.
  *
  *     These mean you can really definitely utterly forget about
  *     Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
-#ifdef CONFIG_SMP
-
-struct flush_tlb_info {
-       struct mm_struct *flush_mm;
-       unsigned long flush_start;
-       unsigned long flush_end;
-};
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
 void leave_mm(int cpu)
 {
-       struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+       struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+       /*
+        * It's plausible that we're in lazy TLB mode while our mm is init_mm.
+        * If so, our callers still expect us to flush the TLB, but there
+        * aren't any user TLB entries in init_mm to worry about.
+        *
+        * This needs to happen before any other sanity checks due to
+        * intel_idle's shenanigans.
+        */
+       if (loaded_mm == &init_mm)
+               return;
+
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();
-       if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
-               cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
-               load_cr3(swapper_pg_dir);
-               /*
-                * This gets called in the idle path where RCU
-                * functions differently.  Tracing normally
-                * uses RCU, so we have to call the tracepoint
-                * specially here.
-                */
-               trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-       }
+
+       switch_mm(NULL, &init_mm, NULL);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
-#endif /* CONFIG_SMP */
-
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
 {
@@ -75,116 +64,94 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
 {
        unsigned cpu = smp_processor_id();
+       struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 
-       if (likely(prev != next)) {
-               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-                       /*
-                        * If our current stack is in vmalloc space and isn't
-                        * mapped in the new pgd, we'll double-fault.  Forcibly
-                        * map it.
-                        */
-                       unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-                       pgd_t *pgd = next->pgd + stack_pgd_index;
-
-                       if (unlikely(pgd_none(*pgd)))
-                               set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-               }
+       /*
+        * NB: The scheduler will call us with prev == next when
+        * switching from lazy TLB mode to normal mode if active_mm
+        * isn't changing.  When this happens, there is no guarantee
+        * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+        *
+        * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+        */
 
-#ifdef CONFIG_SMP
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-               this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
+       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 
-               cpumask_set_cpu(cpu, mm_cpumask(next));
+       if (real_prev == next) {
+               /*
+                * There's nothing to do: we always keep the per-mm control
+                * regs in sync with cpu_tlbstate.loaded_mm.  Just
+                * sanity-check mm_cpumask.
+                */
+               if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
+                       cpumask_set_cpu(cpu, mm_cpumask(next));
+               return;
+       }
 
+       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /*
-                * Re-load page tables.
-                *
-                * This logic has an ordering constraint:
-                *
-                *  CPU 0: Write to a PTE for 'next'
-                *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-                *  CPU 1: set bit 1 in next's mm_cpumask
-                *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-                *
-                * We need to prevent an outcome in which CPU 1 observes
-                * the new PTE value and CPU 0 observes bit 1 clear in
-                * mm_cpumask.  (If that occurs, then the IPI will never
-                * be sent, and CPU 0's TLB will contain a stale entry.)
-                *
-                * The bad outcome can occur if either CPU's load is
-                * reordered before that CPU's store, so both CPUs must
-                * execute full barriers to prevent this from happening.
-                *
-                * Thus, switch_mm needs a full barrier between the
-                * store to mm_cpumask and any operation that could load
-                * from next->pgd.  TLB fills are special and can happen
-                * due to instruction fetches or for no reason at all,
-                * and neither LOCK nor MFENCE orders them.
-                * Fortunately, load_cr3() is serializing and gives the
-                * ordering guarantee we need.
-                *
+                * If our current stack is in vmalloc space and isn't
+                * mapped in the new pgd, we'll double-fault.  Forcibly
+                * map it.
                 */
-               load_cr3(next->pgd);
+               unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
 
-               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+               pgd_t *pgd = next->pgd + stack_pgd_index;
 
-               /* Stop flush ipis for the previous mm */
-               cpumask_clear_cpu(cpu, mm_cpumask(prev));
+               if (unlikely(pgd_none(*pgd)))
+                       set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+       }
 
-               /* Load per-mm CR4 state */
-               load_mm_cr4(next);
+       this_cpu_write(cpu_tlbstate.loaded_mm, next);
 
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
-               /*
-                * Load the LDT, if the LDT is different.
-                *
-                * It's possible that prev->context.ldt doesn't match
-                * the LDT register.  This can happen if leave_mm(prev)
-                * was called and then modify_ldt changed
-                * prev->context.ldt but suppressed an IPI to this CPU.
-                * In this case, prev->context.ldt != NULL, because we
-                * never set context.ldt to NULL while the mm still
-                * exists.  That means that next->context.ldt !=
-                * prev->context.ldt, because mms never share an LDT.
-                */
-               if (unlikely(prev->context.ldt != next->context.ldt))
-                       load_mm_ldt(next);
-#endif
-       }
-#ifdef CONFIG_SMP
-         else {
-               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-               BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
-               if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
-                       /*
-                        * On established mms, the mm_cpumask is only changed
-                        * from irq context, from ptep_clear_flush() while in
-                        * lazy tlb mode, and here. Irqs are blocked during
-                        * schedule, protecting us from simultaneous changes.
-                        */
-                       cpumask_set_cpu(cpu, mm_cpumask(next));
+       WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+       cpumask_set_cpu(cpu, mm_cpumask(next));
 
-                       /*
-                        * We were in lazy tlb mode and leave_mm disabled
-                        * tlb flush IPI delivery. We must reload CR3
-                        * to make sure to use no freed page tables.
-                        *
-                        * As above, load_cr3() is serializing and orders TLB
-                        * fills with respect to the mm_cpumask write.
-                        */
-                       load_cr3(next->pgd);
-                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-                       load_mm_cr4(next);
-                       load_mm_ldt(next);
-               }
-       }
-#endif
-}
+       /*
+        * Re-load page tables.
+        *
+        * This logic has an ordering constraint:
+        *
+        *  CPU 0: Write to a PTE for 'next'
+        *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+        *  CPU 1: set bit 1 in next's mm_cpumask
+        *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+        *
+        * We need to prevent an outcome in which CPU 1 observes
+        * the new PTE value and CPU 0 observes bit 1 clear in
+        * mm_cpumask.  (If that occurs, then the IPI will never
+        * be sent, and CPU 0's TLB will contain a stale entry.)
+        *
+        * The bad outcome can occur if either CPU's load is
+        * reordered before that CPU's store, so both CPUs must
+        * execute full barriers to prevent this from happening.
+        *
+        * Thus, switch_mm needs a full barrier between the
+        * store to mm_cpumask and any operation that could load
+        * from next->pgd.  TLB fills are special and can happen
+        * due to instruction fetches or for no reason at all,
+        * and neither LOCK nor MFENCE orders them.
+        * Fortunately, load_cr3() is serializing and gives the
+        * ordering guarantee we need.
+        */
+       load_cr3(next->pgd);
 
-#ifdef CONFIG_SMP
+       /*
+        * This gets called via leave_mm() in the idle path where RCU
+        * functions differently.  Tracing normally uses RCU, so we have to
+        * call the tracepoint specially here.
+        */
+       trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+       /* Stop flush ipis for the previous mm */
+       WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+                    real_prev != &init_mm);
+       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+
+       /* Load per-mm CR4 and LDTR state */
+       load_mm_cr4(next);
+       switch_ldt(real_prev, next);
+}
 
 /*
  * The flush IPI assumes that a thread switch happens in this order:
@@ -222,69 +189,78 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
  * write/read ordering problems.
  */
 
-/*
- * TLB flush funcation:
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-static void flush_tlb_func(void *info)
+static void flush_tlb_func_common(const struct flush_tlb_info *f,
+                                 bool local, enum tlb_flush_reason reason)
 {
-       struct flush_tlb_info *f = info;
+       /* This code cannot presently handle being reentered. */
+       VM_WARN_ON(!irqs_disabled());
 
-       inc_irq_stat(irq_tlb_count);
-
-       if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+       if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+               leave_mm(smp_processor_id());
                return;
+       }
 
-       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-       if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-               if (f->flush_end == TLB_FLUSH_ALL) {
-                       local_flush_tlb();
-                       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
-               } else {
-                       unsigned long addr;
-                       unsigned long nr_pages =
-                               (f->flush_end - f->flush_start) / PAGE_SIZE;
-                       addr = f->flush_start;
-                       while (addr < f->flush_end) {
-                               __flush_tlb_single(addr);
-                               addr += PAGE_SIZE;
-                       }
-                       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
+       if (f->end == TLB_FLUSH_ALL) {
+               local_flush_tlb();
+               if (local)
+                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+               trace_tlb_flush(reason, TLB_FLUSH_ALL);
+       } else {
+               unsigned long addr;
+               unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+               addr = f->start;
+               while (addr < f->end) {
+                       __flush_tlb_single(addr);
+                       addr += PAGE_SIZE;
                }
-       } else
-               leave_mm(smp_processor_id());
+               if (local)
+                       count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
+               trace_tlb_flush(reason, nr_pages);
+       }
+}
+
+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+{
+       const struct flush_tlb_info *f = info;
 
+       flush_tlb_func_common(f, true, reason);
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
-                                struct mm_struct *mm, unsigned long start,
-                                unsigned long end)
+static void flush_tlb_func_remote(void *info)
 {
-       struct flush_tlb_info info;
+       const struct flush_tlb_info *f = info;
 
-       info.flush_mm = mm;
-       info.flush_start = start;
-       info.flush_end = end;
+       inc_irq_stat(irq_tlb_count);
 
+       if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
+               return;
+
+       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+       flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+                            const struct flush_tlb_info *info)
+{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
-       if (end == TLB_FLUSH_ALL)
+       if (info->end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
        else
                trace_tlb_flush(TLB_REMOTE_SEND_IPI,
-                               (end - start) >> PAGE_SHIFT);
+                               (info->end - info->start) >> PAGE_SHIFT);
 
        if (is_uv_system()) {
                unsigned int cpu;
 
                cpu = smp_processor_id();
-               cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+               cpumask = uv_flush_tlb_others(cpumask, info);
                if (cpumask)
-                       smp_call_function_many(cpumask, flush_tlb_func,
-                                                               &info, 1);
+                       smp_call_function_many(cpumask, flush_tlb_func_remote,
+                                              (void *)info, 1);
                return;
        }
-       smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+       smp_call_function_many(cpumask, flush_tlb_func_remote,
+                              (void *)info, 1);
 }
 
 /*
@@ -302,85 +278,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned long vmflag)
 {
-       unsigned long addr;
-       /* do a global flush by default */
-       unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
+       int cpu;
 
-       preempt_disable();
+       struct flush_tlb_info info = {
+               .mm = mm,
+       };
 
-       if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
-               base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-       if (base_pages_to_flush > tlb_single_page_flush_ceiling)
-               base_pages_to_flush = TLB_FLUSH_ALL;
+       cpu = get_cpu();
 
-       if (current->active_mm != mm) {
-               /* Synchronize with switch_mm. */
-               smp_mb();
+       /* Synchronize with switch_mm. */
+       smp_mb();
 
-               goto out;
-       }
-
-       if (!current->mm) {
-               leave_mm(smp_processor_id());
-
-               /* Synchronize with switch_mm. */
-               smp_mb();
-
-               goto out;
-       }
-
-       /*
-        * Both branches below are implicit full barriers (MOV to CR or
-        * INVLPG) that synchronize with switch_mm.
-        */
-       if (base_pages_to_flush == TLB_FLUSH_ALL) {
-               count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-               local_flush_tlb();
+       /* Should we flush just the requested range? */
+       if ((end != TLB_FLUSH_ALL) &&
+           !(vmflag & VM_HUGETLB) &&
+           ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+               info.start = start;
+               info.end = end;
        } else {
-               /* flush range by one by one 'invlpg' */
-               for (addr = start; addr < end;  addr += PAGE_SIZE) {
-                       count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
-                       __flush_tlb_single(addr);
-               }
+               info.start = 0UL;
+               info.end = TLB_FLUSH_ALL;
        }
-       trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
-out:
-       if (base_pages_to_flush == TLB_FLUSH_ALL) {
-               start = 0UL;
-               end = TLB_FLUSH_ALL;
-       }
-       if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-               flush_tlb_others(mm_cpumask(mm), mm, start, end);
-       preempt_enable();
-}
 
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
-       struct mm_struct *mm = vma->vm_mm;
-
-       preempt_disable();
-
-       if (current->active_mm == mm) {
-               if (current->mm) {
-                       /*
-                        * Implicit full barrier (INVLPG) that synchronizes
-                        * with switch_mm.
-                        */
-                       __flush_tlb_one(start);
-               } else {
-                       leave_mm(smp_processor_id());
-
-                       /* Synchronize with switch_mm. */
-                       smp_mb();
-               }
+       if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+               VM_WARN_ON(irqs_disabled());
+               local_irq_disable();
+               flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+               local_irq_enable();
        }
 
-       if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-               flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
-       preempt_enable();
+       if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+               flush_tlb_others(mm_cpumask(mm), &info);
+       put_cpu();
 }
 
+
 static void do_flush_tlb_all(void *info)
 {
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +333,7 @@ static void do_kernel_range_flush(void *info)
        unsigned long addr;
 
        /* flush range by one by one 'invlpg' */
-       for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+       for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
                __flush_tlb_single(addr);
 }
 
@@ -410,16 +342,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 
        /* Balance as user space task's flush, a bit conservative */
        if (end == TLB_FLUSH_ALL ||
-           (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+           (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
        } else {
                struct flush_tlb_info info;
-               info.flush_start = start;
-               info.flush_end = end;
+               info.start = start;
+               info.end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
 }
 
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+       struct flush_tlb_info info = {
+               .mm = NULL,
+               .start = 0UL,
+               .end = TLB_FLUSH_ALL,
+       };
+
+       int cpu = get_cpu();
+
+       if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+               VM_WARN_ON(irqs_disabled());
+               local_irq_disable();
+               flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+               local_irq_enable();
+       }
+
+       if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+               flush_tlb_others(&batch->cpumask, &info);
+       cpumask_clear(&batch->cpumask);
+
+       put_cpu();
+}
+
 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
 {
@@ -465,5 +421,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
        return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
-
-#endif /* CONFIG_SMP */
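A note on the range-versus-full-flush decision visible in the flush_tlb_mm_range() hunk above: the range is flushed page by page only when it is not a hugetlb mapping and spans at most tlb_single_page_flush_ceiling (33) pages; otherwise info.end is set to TLB_FLUSH_ALL and the whole TLB is flushed. Below is a minimal standalone sketch of that selection; the constants are reproduced here for illustration and otherwise live in kernel headers.

#include <stdio.h>

#define PAGE_SHIFT	12
#define TLB_FLUSH_ALL	(-1UL)
#define VM_HUGETLB	0x00400000UL

static unsigned long tlb_single_page_flush_ceiling = 33;

struct flush_tlb_info {
	unsigned long start;
	unsigned long end;
};

/* Mirrors the "Should we flush just the requested range?" logic above. */
static void pick_flush_range(struct flush_tlb_info *info, unsigned long start,
			     unsigned long end, unsigned long vmflag)
{
	if (end != TLB_FLUSH_ALL && !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info->start = start;		/* small range: per-page INVLPG */
		info->end = end;
	} else {
		info->start = 0UL;		/* large or huge range: full flush */
		info->end = TLB_FLUSH_ALL;
	}
}

int main(void)
{
	struct flush_tlb_info info;

	pick_flush_range(&info, 0x400000, 0x400000 + 8 * 4096, 0);
	printf("8 pages  -> %#lx-%#lx\n", info.start, info.end);

	pick_flush_range(&info, 0x400000, 0x400000 + 64 * 4096, 0);
	printf("64 pages -> end=%#lx\n", info.end);
	return 0;
}

With the default ceiling the first call keeps the 8-page range, while the second falls back to a full flush; the debugfs knob set up by create_tlb_single_page_flush_ceiling() at the end of this file makes the threshold tunable.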