mm, hotplug: fix page online with DEBUG_PAGEALLOC compiled but not enabled

[linux.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 5adb8141d271adb6d27ec4549dda080b45ad679a..c54fb96cb1e6cc0fd85ab8cd337fe3b1b3f274b3 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -70,11 +70,6 @@ static inline void totalram_pages_add(long count)
         atomic_long_add(count, &_totalram_pages);
  }
  
-static inline void totalram_pages_set(long val)
-{
-       atomic_long_set(&_totalram_pages, val);
-}
-
  extern void * high_memory;
  extern int page_cluster;
  
@@ -625,24 +620,19 @@ unsigned long vmalloc_to_pfn(const void *addr);
   * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
   * is no special casing required.
   */
-static inline bool is_vmalloc_addr(const void *x)
-{
-#ifdef CONFIG_MMU
-       unsigned long addr = (unsigned long)x;
-
-       return addr >= VMALLOC_START && addr < VMALLOC_END;
-#else
-       return false;
-#endif
-}
  
  #ifndef is_ioremap_addr
  #define is_ioremap_addr(x) is_vmalloc_addr(x)
  #endif
  
  #ifdef CONFIG_MMU
+extern bool is_vmalloc_addr(const void *x);
  extern int is_vmalloc_or_module_addr(const void *x);
  #else
+static inline bool is_vmalloc_addr(const void *x)
+{
+       return false;
+}
  static inline int is_vmalloc_or_module_addr(const void *x)
  {
         return 0;
@@ -921,10 +911,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  
  #define ZONEID_PGSHIFT         (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
  
-#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
-#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
-#endif
-
  #define ZONES_MASK             ((1UL << ZONES_WIDTH) - 1)
  #define NODES_MASK             ((1UL << NODES_WIDTH) - 1)
  #define SECTIONS_MASK          ((1UL << SECTIONS_WIDTH) - 1)
@@ -952,9 +938,10 @@ static inline bool is_zone_device_page(const struct page *page)
  #endif
  
  #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page);
+void free_devmap_managed_page(struct page *page);
  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-static inline bool put_devmap_managed_page(struct page *page)
+
+static inline bool page_is_devmap_managed(struct page *page)
  {
         if (!static_branch_unlikely(&devmap_managed_key))
                 return false;
@@ -963,7 +950,6 @@ static inline bool put_devmap_managed_page(struct page *page)
         switch (page->pgmap->type) {
         case MEMORY_DEVICE_PRIVATE:
         case MEMORY_DEVICE_FS_DAX:
-               __put_devmap_managed_page(page);
                 return true;
         default:
                 break;
@@ -971,11 +957,17 @@ static inline bool put_devmap_managed_page(struct page *page)
         return false;
  }
  
+void put_devmap_managed_page(struct page *page);
+
  #else /* CONFIG_DEV_PAGEMAP_OPS */
-static inline bool put_devmap_managed_page(struct page *page)
+static inline bool page_is_devmap_managed(struct page *page)
  {
         return false;
  }
+
+static inline void put_devmap_managed_page(struct page *page)
+{
+}
  #endif /* CONFIG_DEV_PAGEMAP_OPS */
  
  static inline bool is_device_private_page(const struct page *page)
@@ -1028,37 +1020,37 @@ static inline void put_page(struct page *page)
          * need to inform the device driver through callback. See
          * include/linux/memremap.h and HMM for details.
          */
-       if (put_devmap_managed_page(page))
+       if (page_is_devmap_managed(page)) {
+               put_devmap_managed_page(page);
                 return;
+       }
  
         if (put_page_testzero(page))
                 __put_page(page);
  }
  
  /**
- * put_user_page() - release a gup-pinned page
+ * unpin_user_page() - release a gup-pinned page
   * @page:            pointer to page to be released
   *
- * Pages that were pinned via get_user_pages*() must be released via
- * either put_user_page(), or one of the put_user_pages*() routines
- * below. This is so that eventually, pages that are pinned via
- * get_user_pages*() can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special
- * handling.
+ * Pages that were pinned via pin_user_pages*() must be released via either
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
+ * that eventually such pages can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special handling.
   *
- * put_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. put_user_page() calls must
- * be perfectly matched up with get_user_page() calls.
+ * unpin_user_page() and put_page() are not interchangeable, despite this early
+ * implementation that makes them look the same. unpin_user_page() calls must
+ * be perfectly matched up with pin*() calls.
   */
-static inline void put_user_page(struct page *page)
+static inline void unpin_user_page(struct page *page)
  {
         put_page(page);
  }
  
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
-                              bool make_dirty);
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+                                bool make_dirty);
  
-void put_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_pages(struct page **pages, unsigned long npages);
  
  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  #define SECTION_IN_PAGE_FLAGS
@@ -1506,9 +1498,16 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
                             unsigned long start, unsigned long nr_pages,
                             unsigned int gup_flags, struct page **pages,
                             struct vm_area_struct **vmas, int *locked);
+long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+                          unsigned long start, unsigned long nr_pages,
+                          unsigned int gup_flags, struct page **pages,
+                          struct vm_area_struct **vmas, int *locked);
  long get_user_pages(unsigned long start, unsigned long nr_pages,
                             unsigned int gup_flags, struct page **pages,
                             struct vm_area_struct **vmas);
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+                   unsigned int gup_flags, struct page **pages,
+                   struct vm_area_struct **vmas);
  long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                     unsigned int gup_flags, struct page **pages, int *locked);
  long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
@@ -1516,6 +1515,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
  
  int get_user_pages_fast(unsigned long start, int nr_pages,
                         unsigned int gup_flags, struct page **pages);
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages);
  
  int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
  int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
@@ -2181,12 +2182,6 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
                                         struct mminit_pfnnid_cache *state);
  #endif
  
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-void zero_resv_unavail(void);
-#else
-static inline void zero_resv_unavail(void) {}
-#endif
-
  extern void set_dma_reserve(unsigned long new_dma_reserve);
  extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
                 enum memmap_context, struct vmem_altmap *);
@@ -2328,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
                        struct list_head *uf, bool downgrade);
  extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                      struct list_head *uf);
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
  
  static inline unsigned long
  do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -2581,13 +2577,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
  #define FOLL_ANON      0x8000  /* don't do file mappings */
  #define FOLL_LONGTERM  0x10000 /* mapping lifetime is indefinite: see below */
  #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
+#define FOLL_PIN       0x40000 /* pages must be released via unpin_user_page */
  
  /*
- * NOTE on FOLL_LONGTERM:
+ * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
+ * other. Here is what they mean, and how to use them:
   *
   * FOLL_LONGTERM indicates that the page will be held for an indefinite time
- * period _often_ under userspace control.  This is contrasted with
- * iov_iter_get_pages() where usages which are transient.
+ * period _often_ under userspace control.  This is in contrast to
+ * iov_iter_get_pages(), whose usages are transient.
   *
   * FIXME: For pages which are part of a filesystem, mappings are subject to the
   * lifetime enforced by the filesystem and we need guarantees that longterm
@@ -2602,11 +2600,39 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
   * Currently only get_user_pages() and get_user_pages_fast() support this flag
   * and calls to get_user_pages_[un]locked are specifically not allowed.  This
   * is due to an incompatibility with the FS DAX check and
- * FAULT_FLAG_ALLOW_RETRY
+ * FAULT_FLAG_ALLOW_RETRY.
   *
- * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
- * that region.  And so CMA attempts to migrate the page before pinning when
+ * In the CMA case: long term pins in a CMA region would unnecessarily fragment
+ * that region.  And so, CMA attempts to migrate the page before pinning, when
   * FOLL_LONGTERM is specified.
+ *
+ * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
+ * but an additional pin counting system) will be invoked. This is intended for
+ * anything that gets a page reference and then touches page data (for example,
+ * Direct IO). This lets the filesystem know that some non-file-system entity is
+ * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
+ * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
+ * a call to unpin_user_page().
+ *
+ * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
+ * and separate refcounting mechanisms, however, and that means that each has
+ * its own acquire and release mechanisms:
+ *
+ *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
+ *
+ *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
+ *
+ * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
+ * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
+ * calls applied to them, and that's perfectly OK. This is a constraint on the
+ * callers, not on the pages.)
+ *
+ * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
+ * directly by the caller. That's in order to help avoid mismatches when
+ * releasing pages: get_user_pages*() pages must be released via put_page(),
+ * while pin_user_pages*() pages must be released via unpin_user_page().
+ *
+ * Please see Documentation/vm/pin_user_pages.rst for more information.
   */
  
  static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
@@ -2689,6 +2715,10 @@ static inline bool debug_pagealloc_enabled_static(void)
  #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
  extern void __kernel_map_pages(struct page *page, int numpages, int enable);
  
+/*
+ * When called in DEBUG_PAGEALLOC context, the call should most likely be
+ * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
+ */
  static inline void
  kernel_map_pages(struct page *page, int numpages, int enable)
  {