scripts/spelling.txt: add "algined" pattern and fix typo instances

[linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6da3169d3750cf5876ef9c03f7e57ed5cf5aa61d..a7a6aac95a6d158690e1ca0981b1b652820248d4 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -59,7 +59,6 @@
  #include <linux/prefetch.h>
  #include <linux/mm_inline.h>
  #include <linux/migrate.h>
-#include <linux/page_ext.h>
  #include <linux/hugetlb.h>
  #include <linux/sched/rt.h>
  #include <linux/page_owner.h>
@@ -92,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  int _node_numa_mem_[MAX_NUMNODES];
  #endif
  
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
  #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
  volatile unsigned long latent_entropy __latent_entropy;
  EXPORT_SYMBOL(latent_entropy);
@@ -1085,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
  {
         int migratetype = 0;
         int batch_free = 0;
-       unsigned long nr_scanned;
+       unsigned long nr_scanned, flags;
         bool isolated_pageblocks;
  
-       spin_lock(&zone->lock);
+       spin_lock_irqsave(&zone->lock, flags);
         isolated_pageblocks = has_isolate_pageblock(zone);
         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
         if (nr_scanned)
@@ -1137,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                         trace_mm_page_pcpu_drain(page, 0, mt);
                 } while (--count && --batch_free && !list_empty(list));
         }
-       spin_unlock(&zone->lock);
+       spin_unlock_irqrestore(&zone->lock, flags);
  }
  
  static void free_one_page(struct zone *zone,
@@ -1145,8 +1148,9 @@ static void free_one_page(struct zone *zone,
                                 unsigned int order,
                                 int migratetype)
  {
-       unsigned long nr_scanned;
-       spin_lock(&zone->lock);
+       unsigned long nr_scanned, flags;
+       spin_lock_irqsave(&zone->lock, flags);
+       __count_vm_events(PGFREE, 1 << order);
         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
         if (nr_scanned)
                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1156,7 +1160,7 @@ static void free_one_page(struct zone *zone,
                 migratetype = get_pfnblock_migratetype(page, pfn);
         }
         __free_one_page(page, pfn, zone, order, migratetype);
-       spin_unlock(&zone->lock);
+       spin_unlock_irqrestore(&zone->lock, flags);
  }
  
  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1234,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
  
  static void __free_pages_ok(struct page *page, unsigned int order)
  {
-       unsigned long flags;
         int migratetype;
         unsigned long pfn = page_to_pfn(page);
  
@@ -1242,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
                 return;
  
         migratetype = get_pfnblock_migratetype(page, pfn);
-       local_irq_save(flags);
-       __count_vm_events(PGFREE, 1 << order);
         free_one_page(page_zone(page), page, pfn, order, migratetype);
-       local_irq_restore(flags);
  }
  
  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2217,8 +2217,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                         int migratetype, bool cold)
  {
         int i, alloced = 0;
+       unsigned long flags;
  
-       spin_lock(&zone->lock);
+       spin_lock_irqsave(&zone->lock, flags);
         for (i = 0; i < count; ++i) {
                 struct page *page = __rmqueue(zone, order, migratetype);
                 if (unlikely(page == NULL))
@@ -2254,7 +2255,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
          * pages added to the pcp list.
          */
         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-       spin_unlock(&zone->lock);
+       spin_unlock_irqrestore(&zone->lock, flags);
         return alloced;
  }
  
@@ -2339,16 +2340,26 @@ void drain_local_pages(struct zone *zone)
                 drain_pages(cpu);
  }
  
+static void drain_local_pages_wq(struct work_struct *work)
+{
+       /*
+        * drain_all_pages doesn't use proper cpu hotplug protection so
+        * we can race with cpu offline when the WQ can move this from
+        * a cpu pinned worker to an unbound one. We can operate on a different
+        * cpu which is all right but we also have to make sure to not move to
+        * a different one.
+        */
+       preempt_disable();
+       drain_local_pages(NULL);
+       preempt_enable();
+}
+
  /*
   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
   *
   * When zone parameter is non-NULL, spill just the single zone's pages.
   *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
   */
  void drain_all_pages(struct zone *zone)
  {
@@ -2360,6 +2371,21 @@ void drain_all_pages(struct zone *zone)
          */
         static cpumask_t cpus_with_pcps;
  
+       /* Workqueues cannot recurse */
+       if (current->flags & PF_WQ_WORKER)
+               return;
+
+       /*
+        * Do not drain if one is already in progress unless it's specific to
+        * a zone. Such callers are primarily CMA and memory hotplug and need
+        * the drain to be complete when the call returns.
+        */
+       if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+               if (!zone)
+                       return;
+               mutex_lock(&pcpu_drain_mutex);
+       }
+
         /*
          * We don't care about racing with CPU hotplug event
          * as offline notification will cause the notified
@@ -2390,8 +2416,16 @@ void drain_all_pages(struct zone *zone)
                 else
                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
         }
-       on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-                                                               zone, 1);
+
+       for_each_cpu(cpu, &cpus_with_pcps) {
+               struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+               INIT_WORK(work, drain_local_pages_wq);
+               schedule_work_on(cpu, work);
+       }
+       for_each_cpu(cpu, &cpus_with_pcps)
+               flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+       mutex_unlock(&pcpu_drain_mutex);
  }
  
  #ifdef CONFIG_HIBERNATION
@@ -2442,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold)
  {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
-       unsigned long flags;
         unsigned long pfn = page_to_pfn(page);
         int migratetype;
  
+       if (in_interrupt()) {
+               __free_pages_ok(page, 0);
+               return;
+       }
+
         if (!free_pcp_prepare(page))
                 return;
  
         migratetype = get_pfnblock_migratetype(page, pfn);
         set_pcppage_migratetype(page, migratetype);
-       local_irq_save(flags);
-       __count_vm_event(PGFREE);
+       preempt_disable();
  
         /*
          * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2469,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                 migratetype = MIGRATE_MOVABLE;
         }
  
+       __count_vm_event(PGFREE);
         pcp = &this_cpu_ptr(zone->pageset)->pcp;
         if (!cold)
                 list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2482,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold)
         }
  
  out:
-       local_irq_restore(flags);
+       preempt_enable();
  }
  
  /*
@@ -2600,74 +2638,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
  #endif
  }
  
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+                       bool cold, struct per_cpu_pages *pcp,
+                       struct list_head *list)
+{
+       struct page *page;
+
+       VM_BUG_ON(in_interrupt());
+
+       do {
+               if (list_empty(list)) {
+                       pcp->count += rmqueue_bulk(zone, 0,
+                                       pcp->batch, list,
+                                       migratetype, cold);
+                       if (unlikely(list_empty(list)))
+                               return NULL;
+               }
+
+               if (cold)
+                       page = list_last_entry(list, struct page, lru);
+               else
+                       page = list_first_entry(list, struct page, lru);
+
+               list_del(&page->lru);
+               pcp->count--;
+       } while (check_new_pcp(page));
+
+       return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+                       struct zone *zone, unsigned int order,
+                       gfp_t gfp_flags, int migratetype)
+{
+       struct per_cpu_pages *pcp;
+       struct list_head *list;
+       bool cold = ((gfp_flags & __GFP_COLD) != 0);
+       struct page *page;
+
+       preempt_disable();
+       pcp = &this_cpu_ptr(zone->pageset)->pcp;
+       list = &pcp->lists[migratetype];
+       page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
+       if (page) {
+               __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+               zone_statistics(preferred_zone, zone);
+       }
+       preempt_enable();
+       return page;
+}
+
  /*
   * Allocate a page from the given zone. Use pcplists for order-0 allocations.
   */
  static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
                         struct zone *zone, unsigned int order,
                         gfp_t gfp_flags, unsigned int alloc_flags,
                         int migratetype)
  {
         unsigned long flags;
         struct page *page;
-       bool cold = ((gfp_flags & __GFP_COLD) != 0);
-
-       if (likely(order == 0)) {
-               struct per_cpu_pages *pcp;
-               struct list_head *list;
-
-               local_irq_save(flags);
-               do {
-                       pcp = &this_cpu_ptr(zone->pageset)->pcp;
-                       list = &pcp->lists[migratetype];
-                       if (list_empty(list)) {
-                               pcp->count += rmqueue_bulk(zone, 0,
-                                               pcp->batch, list,
-                                               migratetype, cold);
-                               if (unlikely(list_empty(list)))
-                                       goto failed;
-                       }
-
-                       if (cold)
-                               page = list_last_entry(list, struct page, lru);
-                       else
-                               page = list_first_entry(list, struct page, lru);
  
-                       list_del(&page->lru);
-                       pcp->count--;
+       if (likely(order == 0) && !in_interrupt()) {
+               page = rmqueue_pcplist(preferred_zone, zone, order,
+                               gfp_flags, migratetype);
+               goto out;
+       }
  
-               } while (check_new_pcp(page));
-       } else {
-               /*
-                * We most definitely don't want callers attempting to
-                * allocate greater than order-1 page units with __GFP_NOFAIL.
-                */
-               WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-               spin_lock_irqsave(&zone->lock, flags);
+       /*
+        * We most definitely don't want callers attempting to
+        * allocate greater than order-1 page units with __GFP_NOFAIL.
+        */
+       WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+       spin_lock_irqsave(&zone->lock, flags);
  
-               do {
-                       page = NULL;
-                       if (alloc_flags & ALLOC_HARDER) {
-                               page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-                               if (page)
-                                       trace_mm_page_alloc_zone_locked(page, order, migratetype);
-                       }
-                       if (!page)
-                               page = __rmqueue(zone, order, migratetype);
-               } while (page && check_new_pages(page, order));
-               spin_unlock(&zone->lock);
+       do {
+               page = NULL;
+               if (alloc_flags & ALLOC_HARDER) {
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (page)
+                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
+               }
                 if (!page)
-                       goto failed;
-               __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_pcppage_migratetype(page));
-       }
+                       page = __rmqueue(zone, order, migratetype);
+       } while (page && check_new_pages(page, order));
+       spin_unlock(&zone->lock);
+       if (!page)
+               goto failed;
+       __mod_zone_freepage_state(zone, -(1 << order),
+                                 get_pcppage_migratetype(page));
  
         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
         zone_statistics(preferred_zone, zone);
         local_irq_restore(flags);
  
-       VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+       VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
         return page;
  
  failed:
@@ -2875,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
  #ifdef CONFIG_NUMA
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
-       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
                                 RECLAIM_DISTANCE;
  }
  #else  /* CONFIG_NUMA */
@@ -2972,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                 }
  
  try_this_zone:
-               page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+               page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                                 gfp_mask, alloc_flags, ac->migratetype);
                 if (page) {
                         prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3005,7 +3074,7 @@ static inline bool should_suppress_show_mem(void)
         return ret;
  }
  
-static void warn_alloc_show_mem(gfp_t gfp_mask)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
  {
         unsigned int filter = SHOW_MEM_FILTER_NODES;
         static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
@@ -3025,10 +3094,10 @@ static void warn_alloc_show_mem(gfp_t gfp_mask)
         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                 filter &= ~SHOW_MEM_FILTER_NODES;
  
-       show_mem(filter);
+       show_mem(filter, nodemask);
  }
  
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
  {
         struct va_format vaf;
         va_list args;
@@ -3047,10 +3116,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
         pr_cont("%pV", &vaf);
         va_end(args);
  
-       pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+       pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
+       if (nodemask)
+               pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
+       else
+               pr_cont("(null)\n");
+
+       cpuset_print_current_mems_allowed();
  
         dump_stack();
-       warn_alloc_show_mem(gfp_mask);
+       warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+                             unsigned int alloc_flags,
+                             const struct alloc_context *ac)
+{
+       struct page *page;
+
+       page = get_page_from_freelist(gfp_mask, order,
+                       alloc_flags|ALLOC_CPUSET, ac);
+       /*
+        * fallback to ignore cpuset restriction if our nodes
+        * are depleted
+        */
+       if (!page)
+               page = get_page_from_freelist(gfp_mask, order,
+                               alloc_flags, ac);
+
+       return page;
  }
  
  static inline struct page *
@@ -3088,47 +3183,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
         if (page)
                 goto out;
  
-       if (!(gfp_mask & __GFP_NOFAIL)) {
-               /* Coredumps can quickly deplete all memory reserves */
-               if (current->flags & PF_DUMPCORE)
-                       goto out;
-               /* The OOM killer will not help higher order allocs */
-               if (order > PAGE_ALLOC_COSTLY_ORDER)
-                       goto out;
-               /* The OOM killer does not needlessly kill tasks for lowmem */
-               if (ac->high_zoneidx < ZONE_NORMAL)
-                       goto out;
-               if (pm_suspended_storage())
-                       goto out;
-               /*
-                * XXX: GFP_NOFS allocations should rather fail than rely on
-                * other request to make a forward progress.
-                * We are in an unfortunate situation where out_of_memory cannot
-                * do much for this context but let's try it to at least get
-                * access to memory reserved if the current task is killed (see
-                * out_of_memory). Once filesystems are ready to handle allocation
-                * failures more gracefully we should just bail out here.
-                */
+       /* Coredumps can quickly deplete all memory reserves */
+       if (current->flags & PF_DUMPCORE)
+               goto out;
+       /* The OOM killer will not help higher order allocs */
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               goto out;
+       /* The OOM killer does not needlessly kill tasks for lowmem */
+       if (ac->high_zoneidx < ZONE_NORMAL)
+               goto out;
+       if (pm_suspended_storage())
+               goto out;
+       /*
+        * XXX: GFP_NOFS allocations should rather fail than rely on
+        * other request to make a forward progress.
+        * We are in an unfortunate situation where out_of_memory cannot
+        * do much for this context but let's try it to at least get
+        * access to memory reserved if the current task is killed (see
+        * out_of_memory). Once filesystems are ready to handle allocation
+        * failures more gracefully we should just bail out here.
+        */
+
+       /* The OOM killer may not free memory on a specific node */
+       if (gfp_mask & __GFP_THISNODE)
+               goto out;
  
-               /* The OOM killer may not free memory on a specific node */
-               if (gfp_mask & __GFP_THISNODE)
-                       goto out;
-       }
         /* Exhausted what can be done so it's blamo time */
         if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
                 *did_some_progress = 1;
  
-               if (gfp_mask & __GFP_NOFAIL) {
-                       page = get_page_from_freelist(gfp_mask, order,
-                                       ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
-                       /*
-                        * fallback to ignore cpuset restriction if our nodes
-                        * are depleted
-                        */
-                       if (!page)
-                               page = get_page_from_freelist(gfp_mask, order,
+               /*
+                * Help non-failing allocations by giving them access to memory
+                * reserves
+                */
+               if (gfp_mask & __GFP_NOFAIL)
+                       page = __alloc_pages_cpuset_fallback(gfp_mask, order,
                                         ALLOC_NO_WATERMARKS, ac);
-               }
         }
  out:
         mutex_unlock(&oom_lock);
@@ -3575,6 +3665,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         no_progress_loops = 0;
         compact_priority = DEF_COMPACT_PRIORITY;
         cpuset_mems_cookie = read_mems_allowed_begin();
+
+       /*
+        * The fast path uses conservative alloc_flags to succeed only until
+        * kswapd needs to be woken up, and to avoid the cost of setting up
+        * alloc_flags precisely. So we do that now.
+        */
+       alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
         /*
          * We need to recalculate the starting point for the zonelist iterator
          * because we might have used different nodemask in the fast path, or
@@ -3586,14 +3684,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         if (!ac->preferred_zoneref->zone)
                 goto nopage;
  
-
-       /*
-        * The fast path uses conservative alloc_flags to succeed only until
-        * kswapd needs to be woken up, and to avoid the cost of setting up
-        * alloc_flags precisely. So we do that now.
-        */
-       alloc_flags = gfp_to_alloc_flags(gfp_mask);
-
         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
@@ -3670,35 +3760,21 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                 goto got_pg;
  
         /* Caller is not willing to reclaim, we can't balance anything */
-       if (!can_direct_reclaim) {
-               /*
-                * All existing users of the __GFP_NOFAIL are blockable, so warn
-                * of any new users that actually allow this type of allocation
-                * to fail.
-                */
-               WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+       if (!can_direct_reclaim)
                 goto nopage;
-       }
  
-       /* Avoid recursion of direct reclaim */
-       if (current->flags & PF_MEMALLOC) {
-               /*
-                * __GFP_NOFAIL request from this context is rather bizarre
-                * because we cannot reclaim anything and only can loop waiting
-                * for somebody to do a work for us.
-                */
-               if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
-                       cond_resched();
-                       goto retry;
-               }
-               goto nopage;
+       /* Make sure we know about allocations which stall for too long */
+       if (time_after(jiffies, alloc_start + stall_timeout)) {
+               warn_alloc(gfp_mask, ac->nodemask,
+                       "page allocation stalls for %ums, order:%u",
+                       jiffies_to_msecs(jiffies-alloc_start), order);
+               stall_timeout += 10 * HZ;
         }
  
-       /* Avoid allocations with no watermarks from looping endlessly */
-       if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+       /* Avoid recursion of direct reclaim */
+       if (current->flags & PF_MEMALLOC)
                 goto nopage;
  
-
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
                                                         &did_some_progress);
@@ -3722,14 +3798,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
                 goto nopage;
  
-       /* Make sure we know about allocations which stall for too long */
-       if (time_after(jiffies, alloc_start + stall_timeout)) {
-               warn_alloc(gfp_mask,
-                       "page allocation stalls for %ums, order:%u",
-                       jiffies_to_msecs(jiffies-alloc_start), order);
-               stall_timeout += 10 * HZ;
-       }
-
         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                                  did_some_progress > 0, &no_progress_loops))
                 goto retry;
@@ -3758,6 +3826,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         if (page)
                 goto got_pg;
  
+       /* Avoid allocations with no watermarks from looping endlessly */
+       if (test_thread_flag(TIF_MEMDIE))
+               goto nopage;
+
         /* Retry as long as the OOM killer is making progress */
         if (did_some_progress) {
                 no_progress_loops = 0;
@@ -3775,82 +3847,123 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         if (read_mems_allowed_retry(cpuset_mems_cookie))
                 goto retry_cpuset;
  
-       warn_alloc(gfp_mask,
+       /*
+        * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+        * we always retry
+        */
+       if (gfp_mask & __GFP_NOFAIL) {
+               /*
+                * All existing users of the __GFP_NOFAIL are blockable, so warn
+                * of any new users that actually require GFP_NOWAIT
+                */
+               if (WARN_ON_ONCE(!can_direct_reclaim))
+                       goto fail;
+
+               /*
+                * PF_MEMALLOC request from this context is rather bizarre
+                * because we cannot reclaim anything and only can loop waiting
+                * for somebody to do the work for us
+                */
+               WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+               /*
+                * non failing costly orders are a hard requirement which we
+                * are not prepared for much so let's warn about these users
+                * so that we can identify them and convert them to something
+                * else.
+                */
+               WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+               /*
+                * Help non-failing allocations by giving them access to memory
+                * reserves but do not use ALLOC_NO_WATERMARKS because this
+                * could deplete whole memory reserves which would just make
+                * the situation worse
+                */
+               page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+               if (page)
+                       goto got_pg;
+
+               cond_resched();
+               goto retry;
+       }
+fail:
+       warn_alloc(gfp_mask, ac->nodemask,
                         "page allocation failure: order:%u", order);
  got_pg:
         return page;
  }
  
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-                       struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+               struct zonelist *zonelist, nodemask_t *nodemask,
+               struct alloc_context *ac, gfp_t *alloc_mask,
+               unsigned int *alloc_flags)
  {
-       struct page *page;
-       unsigned int alloc_flags = ALLOC_WMARK_LOW;
-       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
-       struct alloc_context ac = {
-               .high_zoneidx = gfp_zone(gfp_mask),
-               .zonelist = zonelist,
-               .nodemask = nodemask,
-               .migratetype = gfpflags_to_migratetype(gfp_mask),
-       };
+       ac->high_zoneidx = gfp_zone(gfp_mask);
+       ac->zonelist = zonelist;
+       ac->nodemask = nodemask;
+       ac->migratetype = gfpflags_to_migratetype(gfp_mask);
  
         if (cpusets_enabled()) {
-               alloc_mask |= __GFP_HARDWALL;
-               alloc_flags |= ALLOC_CPUSET;
-               if (!ac.nodemask)
-                       ac.nodemask = &cpuset_current_mems_allowed;
+               *alloc_mask |= __GFP_HARDWALL;
+               if (!ac->nodemask)
+                       ac->nodemask = &cpuset_current_mems_allowed;
+               else
+                       *alloc_flags |= ALLOC_CPUSET;
         }
  
-       gfp_mask &= gfp_allowed_mask;
-
         lockdep_trace_alloc(gfp_mask);
  
         might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
  
         if (should_fail_alloc_page(gfp_mask, order))
-               return NULL;
+               return false;
  
-       /*
-        * Check the zones suitable for the gfp_mask contain at least one
-        * valid zone. It's possible to have an empty zonelist as a result
-        * of __GFP_THISNODE and a memoryless node
-        */
-       if (unlikely(!zonelist->_zonerefs->zone))
-               return NULL;
+       if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+               *alloc_flags |= ALLOC_CMA;
  
-       if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
-               alloc_flags |= ALLOC_CMA;
+       return true;
+}
  
+/* Determine whether to spread dirty pages and what the first usable zone is */
+static inline void finalise_ac(gfp_t gfp_mask,
+               unsigned int order, struct alloc_context *ac)
+{
         /* Dirty zone balancing only done in the fast path */
-       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+       ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
  
         /*
          * The preferred zone is used for statistics but crucially it is
          * also used as the starting point for the zonelist iterator. It
          * may get reset for allocations that ignore memory policies.
          */
-       ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-                                       ac.high_zoneidx, ac.nodemask);
-       if (!ac.preferred_zoneref->zone) {
-               page = NULL;
-               /*
-                * This might be due to race with cpuset_current_mems_allowed
-                * update, so make sure we retry with original nodemask in the
-                * slow path.
-                */
-               goto no_zone;
-       }
+       ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+                       struct zonelist *zonelist, nodemask_t *nodemask)
+{
+       struct page *page;
+       unsigned int alloc_flags = ALLOC_WMARK_LOW;
+       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+       struct alloc_context ac = { };
+
+       gfp_mask &= gfp_allowed_mask;
+       if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+               return NULL;
+
+       finalise_ac(gfp_mask, order, &ac);
  
         /* First allocation attempt */
         page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
         if (likely(page))
                 goto out;
  
-no_zone:
         /*
          * Runtime PM, block IO and its error handling path can deadlock
          * because I/O on the device might not complete.
@@ -4272,20 +4385,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
   * Determine whether the node should be displayed or not, depending on whether
   * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
   */
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
  {
-       bool ret = false;
-       unsigned int cpuset_mems_cookie;
-
         if (!(flags & SHOW_MEM_FILTER_NODES))
-               goto out;
+               return false;
  
-       do {
-               cpuset_mems_cookie = read_mems_allowed_begin();
-               ret = !node_isset(nid, cpuset_current_mems_allowed);
-       } while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
-       return ret;
+       /*
+        * No nodemask was passed, i.e. implicit NUMA memory policy: use
+        * cpuset_current_mems_allowed. Skip the read_mems_allowed_begin()
+        * synchronization because the result need not be precise here.
+        */
+       if (!nodemask)
+               nodemask = &cpuset_current_mems_allowed;
+
+       return !node_isset(nid, *nodemask);
  }
  
  #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4326,7 +4439,7 @@ static void show_migration_types(unsigned char type)
   * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
   *   cpuset.
   */
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
  {
         unsigned long free_pcp = 0;
         int cpu;
@@ -4334,7 +4447,7 @@ void show_free_areas(unsigned int filter)
         pg_data_t *pgdat;
  
         for_each_populated_zone(zone) {
-               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+               if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
                         continue;
  
                 for_each_online_cpu(cpu)
@@ -4368,6 +4481,9 @@ void show_free_areas(unsigned int filter)
                 global_page_state(NR_FREE_CMA_PAGES));
  
         for_each_online_pgdat(pgdat) {
+               if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+                       continue;
+
                 printk("Node %d"
                         " active_anon:%lukB"
                         " inactive_anon:%lukB"
@@ -4417,7 +4533,7 @@ void show_free_areas(unsigned int filter)
         for_each_populated_zone(zone) {
                 int i;
  
-               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+               if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
                         continue;
  
                 free_pcp = 0;
@@ -4482,7 +4598,7 @@ void show_free_areas(unsigned int filter)
                 unsigned long nr[MAX_ORDER], flags, total = 0;
                 unsigned char types[MAX_ORDER];
  
-               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+               if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
                         continue;
                 show_node(zone);
                 printk(KERN_CONT "%s: ", zone->name);
@@ -5809,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
          * the zone and SPARSEMEM is in use. If there are holes within the
          * zone, each populated memory region may cost us one or two extra
          * memmap pages due to alignment because memmap pages for each
-        * populated regions may not naturally algined on page boundary.
+        * populated regions may not be naturally aligned on page boundary.
          * So the (present_pages >> 4) heuristic is a tradeoff for that.
          */
         if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6373,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  
                 start_pfn = end_pfn;
         }
-       arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
-       arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
  
         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -7110,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename,
   * If @count is not zero, it is okay to include less @count unmovable pages
   *
   * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. Likewise, the
+ * __PageMovable check without lock_page may race and miss some movable
+ * non-lru pages. So do not expect this function to be exact.
   */
  bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                          bool skip_hwpoisoned_pages)
@@ -7167,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                 if (skip_hwpoisoned_pages && PageHWPoison(page))
                         continue;
  
+               if (__PageMovable(page))
+                       continue;
+
                 if (!PageLRU(page))
                         found++;
                 /*
@@ -7278,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
   *                     #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
   *                     in range must have the same migratetype and it must
   *                     be either of the two.
+ * @gfp_mask:  GFP mask to use during compaction
   *
   * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
   * aligned, however it's the caller's responsibility to guarantee that
@@ -7291,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
   * need to be freed with free_contig_range().
   */
  int alloc_contig_range(unsigned long start, unsigned long end,
-                      unsigned migratetype)
+                      unsigned migratetype, gfp_t gfp_mask)
  {
         unsigned long outer_start, outer_end;
         unsigned int order;
@@ -7303,7 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                 .zone = page_zone(pfn_to_page(start)),
                 .mode = MIGRATE_SYNC,
                 .ignore_skip_hint = true,
-               .gfp_mask = GFP_KERNEL,
+               .gfp_mask = memalloc_noio_flags(gfp_mask),
         };
         INIT_LIST_HEAD(&cc.migratepages);