diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7889f583ced9fef1319cadf94a8e2ca7d9d4c40e..44df66a98f2adbbd2c18c38dec8eed9e45b4a6ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
                unsigned int file_taken;
                unsigned int taken;
        } nr;
+
+       /* for recording the slab pages reclaimed so far */
+       struct reclaim_state reclaim_state;
 };
 
 #ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
+static void set_task_reclaim_state(struct task_struct *task,
+                                  struct reclaim_state *rs)
+{
+       /* Check for an overwrite */
+       WARN_ON_ONCE(rs && task->reclaim_state);
+
+       /* Check for the nulling of an already-nulled member */
+       WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+       task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
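
The helper is used in set/clear pairs around each reclaim pass, with the reclaim_state embedded in the on-stack scan_control (see the try_to_free_pages(), mem_cgroup_shrink_node(), balance_pgdat(), shrink_all_memory() and __node_reclaim() hunks below). A minimal sketch of the calling pattern, with the scan_control setup elided:

	struct scan_control sc = { /* ... */ };

	set_task_reclaim_state(current, &sc.reclaim_state);
	/* the actual reclaim work, e.g. do_try_to_free_pages(zonelist, &sc) */
	set_task_reclaim_state(current, NULL);

The two WARN_ON_ONCE() checks catch the misuse cases: installing a reclaim_state over one that is still live, and clearing a state that was never set.
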
@@ -1118,6 +1133,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               unsigned int nr_pages;
 
                cond_resched();
 
@@ -1129,7 +1145,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                VM_BUG_ON_PAGE(PageActive(page), page);
 
-               sc->nr_scanned++;
+               nr_pages = 1 << compound_order(page);
+
+               /* Account the number of base pages, even for THP */
+               sc->nr_scanned += nr_pages;
 
                if (unlikely(!page_evictable(page)))
                        goto activate_locked;
@@ -1137,11 +1156,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (!sc->may_unmap && page_mapped(page))
                        goto keep_locked;
 
-               /* Double the slab pressure for mapped and swapcache pages */
-               if ((page_mapped(page) || PageSwapCache(page)) &&
-                   !(PageAnon(page) && !PageSwapBacked(page)))
-                       sc->nr_scanned++;
-
                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -1255,7 +1269,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
                case PAGEREF_KEEP:
-                       stat->nr_ref_keep++;
+                       stat->nr_ref_keep += nr_pages;
                        goto keep_locked;
                case PAGEREF_RECLAIM:
                case PAGEREF_RECLAIM_CLEAN:
@@ -1287,7 +1301,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                }
                                if (!add_to_swap(page)) {
                                        if (!PageTransHuge(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                        /* Fallback to swap normal pages */
                                        if (split_huge_page_to_list(page,
                                                                    page_list))
@@ -1296,7 +1310,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                        count_vm_event(THP_SWPOUT_FALLBACK);
 #endif
                                        if (!add_to_swap(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                }
 
                                may_enter_fs = 1;
@@ -1310,6 +1324,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                }
 
+               /*
+                * The THP may have been split above; subtract the tail pages
+                * and update nr_pages to avoid accounting the tail pages twice.
+                *
+                * The tail pages that were successfully added to the swap
+                * cache reach here.
+                */
+               if ((nr_pages > 1) && !PageTransHuge(page)) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
+
                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
@@ -1320,7 +1346,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (unlikely(PageTransHuge(page)))
                                flags |= TTU_SPLIT_HUGE_PMD;
                        if (!try_to_unmap(page, flags)) {
-                               stat->nr_unmap_fail++;
+                               stat->nr_unmap_fail += nr_pages;
                                goto activate_locked;
                        }
                }
@@ -1447,7 +1473,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                unlock_page(page);
 free_it:
-               nr_reclaimed++;
+               /*
+                * The THP may get swapped out as a whole, so account
+                * all of its base pages.
+                */
+               nr_reclaimed += nr_pages;
 
                /*
                 * Is there need to periodically free_page_list? It would
@@ -1460,6 +1490,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        list_add(&page->lru, &free_pages);
                continue;
 
+activate_locked_split:
+               /*
+                * The tail pages that failed to be added to the swap cache
+                * reach here.  Fix up nr_scanned and nr_pages.
+                */
+               if (nr_pages > 1) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
 activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
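
The nr_scanned arithmetic above (account every base page of a THP up front, then subtract the tail pages once the THP has actually been split, on either the swap-cache success path or the activate_locked_split path) can be checked in isolation. A small userspace sketch of the bookkeeping, assuming an order-9 (2 MiB) THP; this is illustrative only, not kernel code:

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_scanned = 0;
		unsigned int thp_pages = 1U << 9;	/* 512 base pages in a 2 MiB THP */

		/* The head page enters shrink_page_list() as a THP: account every base page. */
		nr_scanned += thp_pages;

		/* The THP is split; 511 tail pages are queued back on page_list. */
		nr_scanned -= thp_pages - 1;		/* the fixup added by this patch */

		/* Each tail page is later scanned as an ordinary base page. */
		for (unsigned int i = 0; i < thp_pages - 1; i++)
			nr_scanned += 1;

		printf("nr_scanned = %lu (expected %u)\n", nr_scanned, thp_pages);
		return 0;
	}

The tail pages that split_huge_page_to_list() puts back on page_list are each accounted as a single page when the loop reaches them, so the net count stays at 512 base pages and nothing is counted twice.
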
@@ -1469,8 +1508,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (!PageMlocked(page)) {
                        int type = page_is_file_cache(page);
                        SetPageActive(page);
-                       pgactivate++;
-                       stat->nr_activate[type] += hpage_nr_pages(page);
+                       stat->nr_activate[type] += nr_pages;
                        count_memcg_page_event(page, PGACTIVATE);
                }
 keep_locked:
@@ -1480,6 +1518,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
+       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
        mem_cgroup_uncharge_list(&free_pages);
        try_to_unmap_flush();
        free_unref_page_list(&free_pages);
@@ -1651,10 +1691,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        LIST_HEAD(pages_skipped);
        isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
 
+       total_scan = 0;
        scan = 0;
-       for (total_scan = 0;
-            scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
-            total_scan++) {
+       while (scan < nr_to_scan && !list_empty(src)) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1662,9 +1701,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+               nr_pages = 1 << compound_order(page);
+               total_scan += nr_pages;
+
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
-                       nr_skipped[page_zonenum(page)]++;
+                       nr_skipped[page_zonenum(page)] += nr_pages;
                        continue;
                }
 
@@ -1673,11 +1715,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 * return with no isolated pages if the LRU mostly contains
                 * ineligible pages.  This causes the VM to not reclaim any
                 * pages, triggering a premature OOM.
+                *
+                * Account all tail pages of a THP.  This does not cause
+                * a premature OOM since __isolate_lru_page() returns -EBUSY
+                * only when the page is being freed somewhere else.
                 */
-               scan++;
+               scan += nr_pages;
                switch (__isolate_lru_page(page, mode)) {
                case 0:
-                       nr_pages = hpage_nr_pages(page);
                        nr_taken += nr_pages;
                        nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);
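
As a worked example of the new accounting, assume an order-9 THP (512 base pages) at the head of the LRU and nr_to_scan = 32: total_scan and scan both advance by 512 in a single iteration, nr_taken becomes 512 if isolation succeeds, and the loop exits because scan >= nr_to_scan. The old loop would have reported total_scan = 1 and scan = 1 for the same work, under-counting what was actually scanned.
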
@@ -2125,7 +2170,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                struct scan_control *sc, bool actual_reclaim)
+                                struct scan_control *sc, bool trace)
 {
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2196,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
         * rid of the stale workingset quickly.
         */
        refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-       if (file && actual_reclaim && lruvec->refaults != refaults) {
+       if (file && lruvec->refaults != refaults) {
                inactive_ratio = 0;
        } else {
                gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2206,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
                        inactive_ratio = 1;
        }
 
-       if (actual_reclaim)
+       if (trace)
                trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
                        lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
                        lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -3161,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                return 1;
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
        trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+       set_task_reclaim_state(current, NULL);
 
        return nr_reclaimed;
 }
@@ -3188,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
        };
        unsigned long lru_pages;
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
@@ -3205,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
+       set_task_reclaim_state(current, NULL);
        *nr_scanned = sc.nr_scanned;
+
        return sc.nr_reclaimed;
 }
 
@@ -3232,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .may_shrinkslab = 1,
        };
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        /*
         * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
         * take care of from where we get pages. So the node where we start the
@@ -3252,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        psi_memstall_leave(&pflags);
 
        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+       set_task_reclaim_state(current, NULL);
 
        return nr_reclaimed;
 }
@@ -3453,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                .may_unmap = 1,
        };
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        psi_memstall_enter(&pflags);
        __fs_reclaim_acquire();
 
@@ -3634,6 +3687,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
        snapshot_refaults(NULL, pgdat);
        __fs_reclaim_release();
        psi_memstall_leave(&pflags);
+       set_task_reclaim_state(current, NULL);
+
        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3644,19 +3699,18 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 }
 
 /*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * pgdat->kswapd_classzone_idx is used by the waker to pass the highest zone
+ * index to be reclaimed by kswapd. If the value is MAX_NR_ZONES, which is not
+ * a valid index, then either kswapd is running for the first time or it could
+ * not sleep after the previous reclaim attempt (the node is still unbalanced).
+ * In that case, return the zone index of the previous kswapd reclaim cycle.
  */
 static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
-                                          enum zone_type classzone_idx)
+                                          enum zone_type prev_classzone_idx)
 {
        if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
-               return classzone_idx;
-
-       return max(pgdat->kswapd_classzone_idx, classzone_idx);
+               return prev_classzone_idx;
+       return pgdat->kswapd_classzone_idx;
 }
 
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3758,15 +3812,10 @@ static int kswapd(void *p)
        unsigned int classzone_idx = MAX_NR_ZONES - 1;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
-
-       struct reclaim_state reclaim_state = {
-               .reclaimed_slab = 0,
-       };
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
-       current->reclaim_state = &reclaim_state;
 
        /*
         * Tell the memory management that we're a "memory allocator",
@@ -3797,7 +3846,7 @@ static int kswapd(void *p)
 
                /* Read the new order and classzone_idx */
                alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = kswapd_classzone_idx(pgdat, 0);
+               classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
                pgdat->kswapd_order = 0;
                pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 
@@ -3828,7 +3877,6 @@ static int kswapd(void *p)
        }
 
        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-       current->reclaim_state = NULL;
 
        return 0;
 }
@@ -3851,8 +3899,12 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
        if (!cpuset_zone_allowed(zone, gfp_flags))
                return;
        pgdat = zone->zone_pgdat;
-       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
-                                                          classzone_idx);
+
+       if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+               pgdat->kswapd_classzone_idx = classzone_idx;
+       else
+               pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+                                                 classzone_idx);
        pgdat->kswapd_order = max(pgdat->kswapd_order, order);
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
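
A concrete example of the waker/kswapd handshake after this change: suppose kswapd has just balanced the node for classzone_idx = 2, cannot sleep because the node is still unbalanced, and no new wakeup has arrived, so pgdat->kswapd_classzone_idx still holds the MAX_NR_ZONES sentinel. kswapd_classzone_idx(pgdat, classzone_idx) now falls back to the previous cycle's value of 2, whereas the old kswapd_classzone_idx(pgdat, 0) call would have silently dropped the request to zone index 0. When a waker does race in, the open-coded update above behaves as before: it stores the requested index if the sentinel is present, otherwise the maximum of the stored and requested indices.
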
@@ -3889,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  */
 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-       struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .nr_to_reclaim = nr_to_reclaim,
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3901,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
                .hibernation_mode = 1,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-       struct task_struct *p = current;
        unsigned long nr_reclaimed;
        unsigned int noreclaim_flag;
 
        fs_reclaim_acquire(sc.gfp_mask);
        noreclaim_flag = memalloc_noreclaim_save();
-       reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       set_task_reclaim_state(current, &sc.reclaim_state);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
-       p->reclaim_state = NULL;
+       set_task_reclaim_state(current, NULL);
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);
 
@@ -4077,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
-       struct reclaim_state reclaim_state;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4102,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         */
        noreclaim_flag = memalloc_noreclaim_save();
        p->flags |= PF_SWAPWRITE;
-       reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       set_task_reclaim_state(p, &sc.reclaim_state);
 
        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
@@ -4115,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }
 
-       p->reclaim_state = NULL;
+       set_task_reclaim_state(p, NULL);
        current->flags &= ~PF_SWAPWRITE;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);