goto out;
WRITE_ONCE(uwq->waken, true);
/*
- * The implicit smp_mb__before_spinlock in try_to_wake_up()
- * renders uwq->waken visible to other CPUs before the task is
- * waken.
+ * The Program-Order guarantees provided by the scheduler
+ * ensure uwq->waken is visible before the task is woken.
*/
ret = wake_up_state(wq->private, mode);
- if (ret)
+ if (ret) {
/*
* Wake only once, autoremove behavior.
*
- * After the effect of list_del_init is visible to the
- * other CPUs, the waitqueue may disappear from under
- * us, see the !list_empty_careful() in
- * handle_userfault(). try_to_wake_up() has an
- * implicit smp_mb__before_spinlock, and the
- * wq->private is read before calling the extern
- * function "wake_up_state" (which in turns calls
- * try_to_wake_up). While the spin_lock;spin_unlock;
- * wouldn't be enough, the smp_mb__before_spinlock is
- * enough to avoid an explicit smp_mb() here.
+ * After the effect of list_del_init is visible to the other
+ * CPUs, the waitqueue may disappear from under us, see the
+ * !list_empty_careful() in handle_userfault().
+ *
+ * try_to_wake_up() has an implicit smp_mb(), and the
+ * wq->private is read before calling the extern function
+ * "wake_up_state" (which in turns calls try_to_wake_up).
*/
list_del_init(&wq->entry);
+ }
out:
return ret;
}
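
For context, the ordering above pairs with the waiter side in handle_userfault(). A simplified sketch of that pattern, assuming ctx/uwq locals and eliding the re-check loop of the real code (which re-reads READ_ONCE(uwq.waken) before sleeping):

	spin_lock(&ctx->fault_pending_wqh.lock);
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	set_current_state(TASK_KILLABLE);
	spin_unlock(&ctx->fault_pending_wqh.lock);

	schedule();			/* woken by wake_up_state() above */
	__set_current_state(TASK_RUNNING);

	/*
	 * After the waker may have done list_del_init(), only touch the
	 * entry again under the lock, and only if it is still queued --
	 * hence the list_empty_careful() check.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock(&ctx->fault_pending_wqh.lock);
		list_del_init(&uwq.wq.entry);
		spin_unlock(&ctx->fault_pending_wqh.lock);
	}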
uffdio_copy.len);
mmput(ctx->mm);
} else {
- return -ENOSPC;
+ return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
return -EFAULT;
uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
- return -ENOSPC;
+ return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
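
With -ESRCH, a non-cooperative monitor can tell "the target mm is already gone" apart from a genuine copy failure. A hedged userspace sketch; uffd, dst_addr, src_addr and page_size are assumed to be set up elsewhere, and <linux/userfaultfd.h>, <sys/ioctl.h> and <errno.h> are needed:

	struct uffdio_copy copy = {
		.dst  = dst_addr,
		.src  = src_addr,
		.len  = page_size,
		.mode = 0,
	};

	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
		if (errno == ESRCH) {
			/* target process is exiting: stop servicing it */
		} else {
			/* real failure, e.g. EAGAIN, EFAULT or EINVAL */
		}
	}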
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
- #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
/*
* An operation with batched TLB flushing is going on. Anything that
* can move process memory needs to flush the TLB when moving a
* PROT_NONE or PROT_NUMA mapped page.
*/
- bool tlb_flush_pending;
- #endif
+ atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
return mm->cpu_vm_mask_var;
}
- #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+ struct mmu_gather;
+ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+ extern void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end);
+
/*
* Memory barriers to keep this state in sync are graciously provided by
* the page table locks, outside of which no page table modifications happen.
- * The barriers below prevent the compiler from re-ordering the instructions
- * around the memory barriers that are already present in the code.
+ * The barriers serialize the tlb_flush_pending updates, which happen
+ * without the page table lock held, against the PTE updates, which
+ * happen with it held.
*/
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
- return mm->tlb_flush_pending;
+ /*
+ * Must be called with the PTL held, so that our PTL acquire will have
+ * observed the store from inc_tlb_flush_pending().
+ */
+ return atomic_read(&mm->tlb_flush_pending) > 0;
+ }
+
+ /*
+ * Returns true if two or more of the above TLB batching operations are
+ * running in parallel.
+ */
+ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+ {
+ return atomic_read(&mm->tlb_flush_pending) > 1;
+ }
+
+ static inline void init_tlb_flush_pending(struct mm_struct *mm)
+ {
+ atomic_set(&mm->tlb_flush_pending, 0);
}
- static inline void set_tlb_flush_pending(struct mm_struct *mm)
+
+ static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
- mm->tlb_flush_pending = true;
+ atomic_inc(&mm->tlb_flush_pending);
+
/*
- * Guarantee that the tlb_flush_pending increase does not leak into the
- * critical section updating the page tables
+ * The only time this value is relevant is when there are indeed pages
+ * to flush. And we'll only flush pages after changing them, which
+ * requires the PTL.
+ *
+ * So the ordering here is:
+ *
+ * atomic_inc(&mm->tlb_flush_pending);
+ * spin_lock(&ptl);
+ * ...
+ * set_pte_at();
+ * spin_unlock(&ptl);
+ *
+ * spin_lock(&ptl)
+ * mm_tlb_flush_pending();
+ * ....
+ * spin_unlock(&ptl);
+ *
+ * flush_tlb_range();
+ * atomic_dec(&mm->tlb_flush_pending);
+ *
+ * So the increment is constrained by the PTL unlock, and the decrement
+ * is constrained by the TLB invalidate.
*/
- smp_mb__before_spinlock();
}
+
/* Clearing is done after a TLB flush, which also provides a barrier. */
- static inline void clear_tlb_flush_pending(struct mm_struct *mm)
- {
- /* see set_tlb_flush_pending */
- mm->tlb_flush_pending = false;
- }
- #else
- static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
- {
- return false;
- }
- static inline void set_tlb_flush_pending(struct mm_struct *mm)
- {
- }
- static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+ static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
+ /*
+ * Guarantee that the tlb_flush_pending update does not leak into the
+ * critical section, since we must order the PTE change and changes to
+ * the pending TLB flush indication. We could have relied on TLB flush
+ * as a memory barrier, but this behavior is not clearly documented.
+ */
+ smp_mb__before_atomic();
+ atomic_dec(&mm->tlb_flush_pending);
}
- #endif
struct vm_fault;
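
A minimal sketch of how a batched-flush user such as change_protection() is expected to use the new helpers, mirroring the ordering diagram above (the mm, vma, ptl, start and end locals are illustrative):

	inc_tlb_flush_pending(mm);		/* flush is now pending */

	spin_lock(ptl);
	/* ... set_pte_at() / PTE permission updates ... */
	spin_unlock(ptl);

	flush_tlb_range(vma, start, end);	/* complete the deferred flush */
	dec_tlb_flush_pending(mm);		/* ordered after the flush by the
						 * smp_mb__before_atomic() inside */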
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
#endif
+
+ lockdep_init_task(&init_task);
}
int __weak arch_dup_task_struct(struct task_struct *dst,
mm_init_aio(mm);
mm_init_owner(mm, p);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
+ lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
+ lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
+ bool need_flush = false;
bool page_locked;
bool migrated = false;
bool was_writable;
goto clear_pmdnuma;
}
+ /*
+ * The page_table_lock above provides a memory barrier
+ * with change_protection_range.
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
+ * Since we took the NUMA fault, we must have observed the !accessible
+ * bit. Make sure all other CPUs agree with that, to avoid them
+ * modifying the page we're about to migrate.
+ *
+ * Must be done under PTL such that we'll observe the relevant
+ * inc_tlb_flush_pending().
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm))
+ need_flush = true;
+
/*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
spin_unlock(vmf->ptl);
+
+ /*
+ * We are not sure whether a pending TLB flush here is for a huge page
+ * mapping or not, hence use the TLB range variant.
+ */
+ if (need_flush)
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
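
The interleaving the flushes above guard against, sketched as a two-CPU timeline (illustrative, using the helpers introduced in this series):

	CPU0: change_protection_range()		CPU1: do_huge_pmd_numa_page()

	inc_tlb_flush_pending(mm)
	spin_lock(ptl)
	pmd made !accessible (PROT_NONE)
	spin_unlock(ptl)
						NUMA fault on that pmd
						spin_lock(vmf->ptl)
						mm_tlb_flush_pending() == true:
						other CPUs may still hold stale,
						writable TLB entries, so
						flush_tlb_range() before the
						page is migrated
	flush_tlb_range(...)
	dec_tlb_flush_pending(mm)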
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested in __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
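
To illustrate what the map buys us, a hypothetical example of the recursion it catches; fs->lock and both code paths are made up for illustration:

	/*
	 * Allocation path: a GFP_FS allocation under a filesystem lock.
	 * fs_reclaim_acquire()/release() inside the allocator record the
	 * fs->lock -> fs_reclaim dependency.
	 */
	mutex_lock(&fs->lock);
	page = alloc_page(GFP_KERNEL);
	mutex_unlock(&fs->lock);

	/*
	 * Reclaim path, e.g. a shrinker running from direct reclaim while
	 * __perform_reclaim() "holds" fs_reclaim; this records the
	 * fs_reclaim -> fs->lock dependency.
	 */
	mutex_lock(&fs->lock);
	mutex_unlock(&fs->lock);

Lockdep then reports the fs->lock <-> fs_reclaim cycle -- a possible deadlock where direct reclaim re-enters the filesystem under its own lock -- without the deadlock ever having to occur at runtime.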
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
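
Acquiring and immediately releasing the map on every allocation is enough: the acquire records a dependency from every lock the caller currently holds to fs_reclaim, which is the check the removed lockdep_trace_alloc() used to approximate, without keeping the map held across the allocation fast path.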
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
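
For example, with 200 MiB of reclaimable slab and a 50 MiB low watermark this adds 200 - min(100, 50) = 150 MiB to the estimate; with only 60 MiB of slab it adds 60 - min(30, 50) = 30 MiB, i.e. at most half of a small reclaimable slab is counted as available.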
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check_tx) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
static struct static_key udp_encap_needed __read_mostly;
void udp_encap_enable(void)
{
- if (!static_key_enabled(&udp_encap_needed))
- static_key_slow_inc(&udp_encap_needed);
+ static_key_enable(&udp_encap_needed);
}
EXPORT_SYMBOL(udp_encap_enable);