diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7889f583ced9fef1319cadf94a8e2ca7d9d4c40e..44df66a98f2adbbd2c18c38dec8eed9e45b4a6ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
                unsigned int file_taken;
                unsigned int taken;
        } nr;
+
+       /* for recording the slab pages reclaimed so far */
+       struct reclaim_state reclaim_state;
 };
 
 #ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
+static void set_task_reclaim_state(struct task_struct *task,
+                                  struct reclaim_state *rs)
+{
+       /* Check for an overwrite */
+       WARN_ON_ONCE(rs && task->reclaim_state);
+
+       /* Check for the nulling of an already-nulled member */
+       WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+       task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
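
The helper is used in set/clear pairs around each reclaim pass, with the reclaim_state embedded in the on-stack scan_control (see the try_to_free_pages(), mem_cgroup_shrink_node(), balance_pgdat(), shrink_all_memory() and __node_reclaim() hunks below). A minimal sketch of the calling pattern, with the scan_control setup elided:

	struct scan_control sc = { /* ... */ };

	set_task_reclaim_state(current, &sc.reclaim_state);
	/* the actual reclaim work, e.g. do_try_to_free_pages(zonelist, &sc) */
	set_task_reclaim_state(current, NULL);

The two WARN_ON_ONCE() checks catch the misuse cases: installing a reclaim_state over one that is still live, and clearing a state that was never set.
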
@@ -1118,6 +1133,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               unsigned int nr_pages;
 
                cond_resched();
 
@@ -1129,7 +1145,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                VM_BUG_ON_PAGE(PageActive(page), page);
 
-               sc->nr_scanned++;
+               nr_pages = 1 << compound_order(page);
+
+               /* Account the number of base pages, even for THP */
+               sc->nr_scanned += nr_pages;
 
                if (unlikely(!page_evictable(page)))
                        goto activate_locked;
@@ -1137,11 +1156,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (!sc->may_unmap && page_mapped(page))
                        goto keep_locked;
 
-               /* Double the slab pressure for mapped and swapcache pages */
-               if ((page_mapped(page) || PageSwapCache(page)) &&
-                   !(PageAnon(page) && !PageSwapBacked(page)))
-                       sc->nr_scanned++;
-
                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -1255,7 +1269,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
                case PAGEREF_KEEP:
-                       stat->nr_ref_keep++;
+                       stat->nr_ref_keep += nr_pages;
                        goto keep_locked;
                case PAGEREF_RECLAIM:
                case PAGEREF_RECLAIM_CLEAN:
@@ -1287,7 +1301,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                }
                                if (!add_to_swap(page)) {
                                        if (!PageTransHuge(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                        /* Fallback to swap normal pages */
                                        if (split_huge_page_to_list(page,
                                                                    page_list))
@@ -1296,7 +1310,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                        count_vm_event(THP_SWPOUT_FALLBACK);
 #endif
                                        if (!add_to_swap(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                }
 
                                may_enter_fs = 1;
@@ -1310,6 +1324,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                }
 
+               /*
+                * The THP may have been split above; subtract the tail pages
+                * and update nr_pages to avoid accounting the tail pages twice.
+                *
+                * The tail pages that were successfully added to the swap
+                * cache reach here.
+                */
+               if ((nr_pages > 1) && !PageTransHuge(page)) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
+
                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
@@ -1320,7 +1346,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (unlikely(PageTransHuge(page)))
                                flags |= TTU_SPLIT_HUGE_PMD;
                        if (!try_to_unmap(page, flags)) {
-                               stat->nr_unmap_fail++;
+                               stat->nr_unmap_fail += nr_pages;
                                goto activate_locked;
                        }
                }
@@ -1447,7 +1473,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                unlock_page(page);
 free_it:
-               nr_reclaimed++;
+               /*
+                * The THP may get swapped out as a whole, so account
+                * all of its base pages.
+                */
+               nr_reclaimed += nr_pages;
 
                /*
                 * Is there need to periodically free_page_list? It would
@@ -1460,6 +1490,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        list_add(&page->lru, &free_pages);
                continue;
 
+activate_locked_split:
+               /*
+                * The tail pages that failed to be added to the swap cache
+                * reach here.  Fix up nr_scanned and nr_pages.
+                */
+               if (nr_pages > 1) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
 activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
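
The nr_scanned arithmetic above (account every base page of a THP up front, then subtract the tail pages once the THP has actually been split, on either the swap-cache success path or the activate_locked_split path) can be checked in isolation. A small userspace sketch of the bookkeeping, assuming an order-9 (2 MiB) THP; this is illustrative only, not kernel code:

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_scanned = 0;
		unsigned int thp_pages = 1U << 9;	/* 512 base pages in a 2 MiB THP */

		/* The head page enters shrink_page_list() as a THP: account every base page. */
		nr_scanned += thp_pages;

		/* The THP is split; 511 tail pages are queued back on page_list. */
		nr_scanned -= thp_pages - 1;		/* the fixup added by this patch */

		/* Each tail page is later scanned as an ordinary base page. */
		for (unsigned int i = 0; i < thp_pages - 1; i++)
			nr_scanned += 1;

		printf("nr_scanned = %lu (expected %u)\n", nr_scanned, thp_pages);
		return 0;
	}

The tail pages that split_huge_page_to_list() puts back on page_list are each accounted as a single page when the loop reaches them, so the net count stays at 512 base pages and nothing is counted twice.
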
@@ -1469,8 +1508,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (!PageMlocked(page)) {
                        int type = page_is_file_cache(page);
                        SetPageActive(page);
-                       pgactivate++;
-                       stat->nr_activate[type] += hpage_nr_pages(page);
+                       stat->nr_activate[type] += nr_pages;
                        count_memcg_page_event(page, PGACTIVATE);
                }
 keep_locked:
@@ -1480,6 +1518,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
+       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
        mem_cgroup_uncharge_list(&free_pages);
        try_to_unmap_flush();
        free_unref_page_list(&free_pages);
@@ -1651,10 +1691,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        LIST_HEAD(pages_skipped);
        isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
 
+       total_scan = 0;
        scan = 0;
-       for (total_scan = 0;
-            scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
-            total_scan++) {
+       while (scan < nr_to_scan && !list_empty(src)) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1662,9 +1701,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+               nr_pages = 1 << compound_order(page);
+               total_scan += nr_pages;
+
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
-                       nr_skipped[page_zonenum(page)]++;
+                       nr_skipped[page_zonenum(page)] += nr_pages;
                        continue;
                }
 
@@ -1673,11 +1715,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 * return with no isolated pages if the LRU mostly contains
                 * ineligible pages.  This causes the VM to not reclaim any
                 * pages, triggering a premature OOM.
+                *
+                * Account all tail pages of a THP.  This does not cause
+                * a premature OOM since __isolate_lru_page() returns -EBUSY
+                * only when the page is being freed somewhere else.
                 */
-               scan++;
+               scan += nr_pages;
                switch (__isolate_lru_page(page, mode)) {
                case 0:
-                       nr_pages = hpage_nr_pages(page);
                        nr_taken += nr_pages;
                        nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);
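
As a worked example of the new accounting, assume an order-9 THP (512 base pages) at the head of the LRU and nr_to_scan = 32: total_scan and scan both advance by 512 in a single iteration, nr_taken becomes 512 if isolation succeeds, and the loop exits because scan >= nr_to_scan. The old loop would have reported total_scan = 1 and scan = 1 for the same work, under-counting what was actually scanned.
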
@@ -2125,7 +2170,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                struct scan_control *sc, bool actual_reclaim)
+                                struct scan_control *sc, bool trace)
 {
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2196,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
         * rid of the stale workingset quickly.
         */
        refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-       if (file && actual_reclaim && lruvec->refaults != refaults) {
+       if (file && lruvec->refaults != refaults) {
                inactive_ratio = 0;
        } else {
                gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2206,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
                        inactive_ratio = 1;
        }
 
-       if (actual_reclaim)
+       if (trace)
                trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
                        lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
                        lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -3161,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                return 1;
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
        trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+       set_task_reclaim_state(current, NULL);
 
        return nr_reclaimed;
 }
@@ -3188,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
        };
        unsigned long lru_pages;
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
@@ -3205,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
+       set_task_reclaim_state(current, NULL);
        *nr_scanned = sc.nr_scanned;
+
        return sc.nr_reclaimed;
 }
 
@@ -3232,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .may_shrinkslab = 1,
        };
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        /*
         * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
         * take care of from where we get pages. So the node where we start the
@@ -3252,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        psi_memstall_leave(&pflags);
 
        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+       set_task_reclaim_state(current, NULL);
 
        return nr_reclaimed;
 }
@@ -3453,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                .may_unmap = 1,
        };
 
+       set_task_reclaim_state(current, &sc.reclaim_state);
        psi_memstall_enter(&pflags);
        __fs_reclaim_acquire();
 
@@ -3634,6 +3687,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
        snapshot_refaults(NULL, pgdat);
        __fs_reclaim_release();
        psi_memstall_leave(&pflags);
+       set_task_reclaim_state(current, NULL);
+
        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3644,19 +3699,18 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 }
 
 /*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * pgdat->kswapd_classzone_idx is used by the waker to pass the highest zone
+ * index to be reclaimed by kswapd. If the value is MAX_NR_ZONES, which is not
+ * a valid index, then either kswapd is running for the first time or it could
+ * not sleep after the previous reclaim attempt (the node is still unbalanced).
+ * In that case, return the zone index of the previous kswapd reclaim cycle.
  */
 static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
-                                          enum zone_type classzone_idx)
+                                          enum zone_type prev_classzone_idx)
 {
        if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
-               return classzone_idx;
-
-       return max(pgdat->kswapd_classzone_idx, classzone_idx);
+               return prev_classzone_idx;
+       return pgdat->kswapd_classzone_idx;
 }
 
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3758,15 +3812,10 @@ static int kswapd(void *p)
        unsigned int classzone_idx = MAX_NR_ZONES - 1;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
-
-       struct reclaim_state reclaim_state = {
-               .reclaimed_slab = 0,
-       };
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
-       current->reclaim_state = &reclaim_state;
 
        /*
         * Tell the memory management that we're a "memory allocator",
@@ -3797,7 +3846,7 @@ static int kswapd(void *p)
 
                /* Read the new order and classzone_idx */
                alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = kswapd_classzone_idx(pgdat, 0);
+               classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
                pgdat->kswapd_order = 0;
                pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 
@@ -3828,7 +3877,6 @@ static int kswapd(void *p)
        }
 
        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-       current->reclaim_state = NULL;
 
        return 0;
 }
@@ -3851,8 +3899,12 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
        if (!cpuset_zone_allowed(zone, gfp_flags))
                return;
        pgdat = zone->zone_pgdat;
-       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
-                                                          classzone_idx);
+
+       if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+               pgdat->kswapd_classzone_idx = classzone_idx;
+       else
+               pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+                                                 classzone_idx);
        pgdat->kswapd_order = max(pgdat->kswapd_order, order);
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
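
A concrete example of the waker/kswapd handshake after this change: suppose kswapd has just balanced the node for classzone_idx = 2, cannot sleep because the node is still unbalanced, and no new wakeup has arrived, so pgdat->kswapd_classzone_idx still holds the MAX_NR_ZONES sentinel. kswapd_classzone_idx(pgdat, classzone_idx) now falls back to the previous cycle's value of 2, whereas the old kswapd_classzone_idx(pgdat, 0) call would have silently dropped the request to zone index 0. When a waker does race in, the open-coded update above behaves as before: it stores the requested index if the sentinel is present, otherwise the maximum of the stored and requested indices.
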
@@ -3889,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
  */
 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-       struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .nr_to_reclaim = nr_to_reclaim,
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3901,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
                .hibernation_mode = 1,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-       struct task_struct *p = current;
        unsigned long nr_reclaimed;
        unsigned int noreclaim_flag;
 
        fs_reclaim_acquire(sc.gfp_mask);
        noreclaim_flag = memalloc_noreclaim_save();
-       reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       set_task_reclaim_state(current, &sc.reclaim_state);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
-       p->reclaim_state = NULL;
+       set_task_reclaim_state(current, NULL);
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);
 
@@ -4077,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
-       struct reclaim_state reclaim_state;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4102,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         */
        noreclaim_flag = memalloc_noreclaim_save();
        p->flags |= PF_SWAPWRITE;
-       reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       set_task_reclaim_state(p, &sc.reclaim_state);
 
        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
@@ -4115,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }
 
-       p->reclaim_state = NULL;
+       set_task_reclaim_state(p, NULL);
        current->flags &= ~PF_SWAPWRITE;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);