Merge tag 'for-linus-5.2b-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux.git] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 2f901a6e13d23f3e54795084d9a8534068b661c6..ac843d32b0193924bd90dc96afc5aae6d35b4102 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
  /*
   * Generic hugetlb support.
   * (C) Nadia Yvette Chambers, April 2004
@@ -740,7 +741,15 @@ void resv_map_release(struct kref *ref)
  
  static inline struct resv_map *inode_resv_map(struct inode *inode)
  {
-       return inode->i_mapping->private_data;
+       /*
+        * At inode evict time, i_mapping may not point to the original
+        * address space within the inode.  This original address space
+        * contains the pointer to the resv_map.  So, always use the
+        * address space embedded within the inode.
+        * The VERY common case is inode->i_mapping == &inode->i_data, but
+        * this may not be true for device special inodes.
+        */
+       return (struct resv_map *)(&inode->i_data)->private_data;
  }
  
  static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -1268,12 +1277,23 @@ void free_huge_page(struct page *page)
         ClearPagePrivate(page);
  
         /*
-        * A return code of zero implies that the subpool will be under its
-        * minimum size if the reservation is not restored after page is free.
-        * Therefore, force restore_reserve operation.
+        * If PagePrivate() was set on page, page allocation consumed a
+        * reservation.  If the page was associated with a subpool, there
+        * would have been a page reserved in the subpool before allocation
+        * via hugepage_subpool_get_pages().  Since we are 'restoring' the
+        * reservation, do not call hugepage_subpool_put_pages() as this will
+        * remove the reserved page from the subpool.
          */
-       if (hugepage_subpool_put_pages(spool, 1) == 0)
-               restore_reserve = true;
+       if (!restore_reserve) {
+               /*
+                * A return code of zero implies that the subpool will be
+                * under its minimum size if the reservation is not restored
+                * after page is free.  Therefore, force restore_reserve
+                * operation.
+                */
+               if (hugepage_subpool_put_pages(spool, 1) == 0)
+                       restore_reserve = true;
+       }
  
         spin_lock(&hugetlb_lock);
         clear_page_huge_active(page);
@@ -2288,13 +2308,33 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
  }
  
  #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static int set_max_huge_pages(struct hstate *h, unsigned long count,
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                               nodemask_t *nodes_allowed)
  {
         unsigned long min_count, ret;
  
         spin_lock(&hugetlb_lock);
  
+       /*
+        * Check for a node specific request.
+        * Changing node specific huge page count may require a corresponding
+        * change to the global count.  In any case, the passed node mask
+        * (nodes_allowed) will restrict alloc/free to the specified node.
+        */
+       if (nid != NUMA_NO_NODE) {
+               unsigned long old_count = count;
+
+               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+               /*
+                * User may have specified a large count value which caused the
+                * above calculation to overflow.  In this case, they wanted
+                * to allocate as many huge pages as possible.  Set count to
+                * largest possible value to align with their intention.
+                */
+               if (count < old_count)
+                       count = ULONG_MAX;
+       }
+
         /*
          * Gigantic pages runtime allocation depend on the capability for large
          * page range allocation.
@@ -2428,37 +2468,30 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                                            unsigned long count, size_t len)
  {
         int err;
-       NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+       nodemask_t nodes_allowed, *n_mask;
  
-       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) {
-               err = -EINVAL;
-               goto out;
-       }
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return -EINVAL;
  
         if (nid == NUMA_NO_NODE) {
                 /*
                  * global hstate attribute
                  */
                 if (!(obey_mempolicy &&
-                               init_nodemask_of_mempolicy(nodes_allowed))) {
-                       NODEMASK_FREE(nodes_allowed);
-                       nodes_allowed = &node_states[N_MEMORY];
-               }
-       } else if (nodes_allowed) {
+                               init_nodemask_of_mempolicy(&nodes_allowed)))
+                       n_mask = &node_states[N_MEMORY];
+               else
+                       n_mask = &nodes_allowed;
+       } else {
                 /*
-                * per node hstate attribute: adjust count to global,
-                * but restrict alloc/free to the specified node.
+                * Node specific request.  count adjustment happens in
+                * set_max_huge_pages() after acquiring hugetlb_lock.
                  */
-               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
-               init_nodemask_of_node(nodes_allowed, nid);
-       } else
-               nodes_allowed = &node_states[N_MEMORY];
+               init_nodemask_of_node(&nodes_allowed, nid);
+               n_mask = &nodes_allowed;
+       }
  
-       err = set_max_huge_pages(h, count, nodes_allowed);
-
-out:
-       if (nodes_allowed != &node_states[N_MEMORY])
-               NODEMASK_FREE(nodes_allowed);
+       err = set_max_huge_pages(h, count, nid, n_mask);
  
         return err ? err : len;
  }
@@ -3270,7 +3303,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  
         if (cow) {
-               mmu_notifier_range_init(&range, src, vma->vm_start,
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+                                       vma->vm_start,
                                         vma->vm_end);
                 mmu_notifier_invalidate_range_start(&range);
         }
@@ -3382,7 +3416,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
         /*
          * If sharing possible, alert mmu notifiers of worst case.
          */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+                               end);
         adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
         mmu_notifier_invalidate_range_start(&range);
         address = start;
@@ -3649,7 +3684,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                             pages_per_huge_page(h));
         __SetPageUptodate(new_page);
  
-       mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+                               haddr + huge_page_size(h));
         mmu_notifier_invalidate_range_start(&range);
  
         /*
@@ -3800,8 +3836,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                          * handling userfault.  Reacquire after handling
                          * fault to make calling code simpler.
                          */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, haddr);
+                       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                         ret = handle_userfault(&vmf, VM_UFFD_MISSING);
                         mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3909,21 +3944,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
  }
  
  #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                             pgoff_t idx, unsigned long address)
  {
         unsigned long key[2];
         u32 hash;
  
-       if (vma->vm_flags & VM_SHARED) {
-               key[0] = (unsigned long) mapping;
-               key[1] = idx;
-       } else {
-               key[0] = (unsigned long) mm;
-               key[1] = address >> huge_page_shift(h);
-       }
+       key[0] = (unsigned long) mapping;
+       key[1] = idx;
  
         hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
  
@@ -3934,9 +3962,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
   * For uniprocesor systems we always use a single mutex, so just
   * return 0 and avoid the hashing overhead.
   */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                             pgoff_t idx, unsigned long address)
  {
         return 0;
@@ -3981,7 +4007,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          * get spurious allocation failures if two CPUs race to instantiate
          * the same page in the page cache.
          */
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
         mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
         entry = huge_ptep_get(ptep);
@@ -4394,7 +4420,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * start/end.  Set range.start/range.end to cover the maximum possible
          * range if PMD sharing is possible.
          */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+                               0, vma, mm, start, end);
         adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
  
         BUG_ON(address >= end);
@@ -4500,6 +4527,11 @@ int hugetlb_reserve_pages(struct inode *inode,
          * called to make the mapping read-write. Assume !vma is a shm mapping
          */
         if (!vma || vma->vm_flags & VM_MAYSHARE) {
+               /*
+                * resv_map can not be NULL as hugetlb_reserve_pages is only
+                * called for inodes for which resv_maps were created (see
+                * hugetlbfs_get_inode).
+                */
                 resv_map = inode_resv_map(inode);
  
                 chg = region_chg(resv_map, from, to);
@@ -4591,6 +4623,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
         struct hugepage_subpool *spool = subpool_inode(inode);
         long gbl_reserve;
  
+       /*
+        * Since this routine can be called in the evict inode path for all
+        * hugetlbfs inodes, resv_map could be NULL.
+        */
         if (resv_map) {
                 chg = region_del(resv_map, start, end);
                 /*