Merge tag 'ceph-for-4.20-rc1' of git://github.com/ceph/ceph-client
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cc2972eedaf1088e8a1ea0017f701b18c729494..644f746e167acd65e8244088f483e50c71afbf20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent)
        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
 
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY            0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED          0x2
+/* Reclaim the swap entry if swap is getting full */
+#define TTRS_FULL              0x4
+
 /* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+                                unsigned long offset, unsigned long flags)
 {
        swp_entry_t entry = swp_entry(si->type, offset);
        struct page *page;
        int ret = 0;
 
-       page = find_get_page(swap_address_space(entry), swp_offset(entry));
+       page = find_get_page(swap_address_space(entry), offset);
        if (!page)
                return 0;
        /*
-        * This function is called from scan_swap_map() and it's called
-        * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
-        * We have to use trylock for avoiding deadlock. This is a special
+        * This function is called from scan_swap_map_slots() and by vmscan.c
+        * when reclaiming pages, so a lock may already be held on the page
+        * here. We have to use trylock to avoid deadlock. This is a special
         * case and you should use try_to_free_swap() with explicit lock_page()
         * in usual operations.
         */
        if (trylock_page(page)) {
-               ret = try_to_free_swap(page);
+               if ((flags & TTRS_ANYWAY) ||
+                   ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+                   ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+                       ret = try_to_free_swap(page);
                unlock_page(page);
        }
        put_page(page);
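
The three TTRS_* flags make the reclaim policy explicit at each call site. A minimal sketch of the two combinations used later in this diff; the wrapper function itself is illustrative, not part of the patch:

/* Illustrative only: how the TTRS_* flags steer __try_to_reclaim_swap(). */
static void ttrs_examples(struct swap_info_struct *si, unsigned long offset)
{
        /* Free the swap-cache page unconditionally (scan_swap_map_slots()). */
        __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);

        /*
         * Free it only if no process maps the page any more, or if swap is
         * getting full (free_swap_and_cache()).
         */
        __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL);
}
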
@@ -204,8 +217,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER       HPAGE_PMD_NR
+
+#define swap_entry_size(size)  (size)
 #else
 #define SWAPFILE_CLUSTER       256
+
+/*
+ * Define swap_entry_size() as a constant to let the compiler optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size)  1
 #endif
 #define LATENCY_LIMIT          256
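
With CONFIG_THP_SWAP disabled, swap_entry_size() folds to the constant 1, so any branch guarded by "size == SWAPFILE_CLUSTER" becomes dead code the compiler can drop while the code is still type-checked. A sketch of the pattern, assuming a hypothetical caller:

/* Sketch (hypothetical caller): the huge-cluster branch compiles away
 * when !CONFIG_THP_SWAP because swap_entry_size() is the constant 1.
 */
static void entry_size_example(struct page *page)
{
        int size = swap_entry_size(hpage_nr_pages(page));

        if (size == SWAPFILE_CLUSTER) {
                /* PMD-sized THP path, only reachable with CONFIG_THP_SWAP */
        } else {
                /* order-0 path */
        }
}
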
 
@@ -269,7 +290,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-       return info->flags & CLUSTER_FLAG_HUGE;
+       if (IS_ENABLED(CONFIG_THP_SWAP))
+               return info->flags & CLUSTER_FLAG_HUGE;
+       return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -296,13 +319,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
                spin_unlock(&ci->lock);
 }
 
+/*
+ * Determine the locking method in use for this device.  Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
-       struct swap_info_struct *si,
-       unsigned long offset)
+               struct swap_info_struct *si, unsigned long offset)
 {
        struct swap_cluster_info *ci;
 
+       /* Try to use fine-grained SSD-style locking if available: */
        ci = lock_cluster(si, offset);
+       /* Otherwise, fall back to traditional, coarse locking: */
        if (!ci)
                spin_lock(&si->lock);
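
The helper pairs with unlock_cluster_or_swap_info(); whether the fine-grained cluster lock or the coarse si->lock was taken is encoded in the returned pointer. A sketch of the usual pattern, mirroring the swap-map helpers later in this file (the function name here is made up):

/* Sketch: read a swap count under whichever lock the device uses. */
static unsigned char read_swap_count(struct swap_info_struct *si,
                                     unsigned long offset)
{
        struct swap_cluster_info *ci;
        unsigned char count;

        ci = lock_cluster_or_swap_info(si, offset);  /* cluster lock or si->lock */
        count = swap_count(si->swap_map[offset]);
        unlock_cluster_or_swap_info(si, ci);         /* NULL ci => drop si->lock */
        return count;
}
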
 
@@ -765,7 +793,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
                int swap_was_freed;
                unlock_cluster(ci);
                spin_unlock(&si->lock);
-               swap_was_freed = __try_to_reclaim_swap(si, offset);
+               swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                spin_lock(&si->lock);
                /* entry was freed successfully, try to use this again */
                if (swap_was_freed)
@@ -863,7 +891,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
        return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
        unsigned long idx;
@@ -871,6 +898,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
        unsigned long offset, i;
        unsigned char *map;
 
+       /*
+        * Should not even be attempting cluster allocations when huge
+        * page swap is disabled.  Warn and fail the allocation.
+        */
+       if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+               VM_WARN_ON_ONCE(1);
+               return 0;
+       }
+
        if (cluster_list_empty(&si->free_clusters))
                return 0;
 
@@ -896,18 +932,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
        struct swap_cluster_info *ci;
 
        ci = lock_cluster(si, offset);
+       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
        cluster_set_count_flag(ci, 0, 0);
        free_cluster(si, idx);
        unlock_cluster(ci);
        swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-       VM_WARN_ON_ONCE(1);
-       return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
                                   unsigned char usage)
@@ -924,18 +954,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 {
-       unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+       unsigned long size = swap_entry_size(entry_size);
        struct swap_info_struct *si, *next;
        long avail_pgs;
        int n_ret = 0;
        int node;
 
        /* Only single cluster request supported */
-       WARN_ON_ONCE(n_goal > 1 && cluster);
+       WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
-       avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+       avail_pgs = atomic_long_read(&nr_swap_pages) / size;
        if (avail_pgs <= 0)
                goto noswap;
 
@@ -945,7 +975,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
        if (n_goal > avail_pgs)
                n_goal = avail_pgs;
 
-       atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+       atomic_long_sub(n_goal * size, &nr_swap_pages);
 
        spin_lock(&swap_avail_lock);
 
@@ -972,14 +1002,14 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-               if (cluster) {
-                       if (!(si->flags & SWP_FILE))
+               if (size == SWAPFILE_CLUSTER) {
+                       if (!(si->flags & SWP_FS))
                                n_ret = swap_alloc_cluster(si, swp_entries);
                } else
                        n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                                    n_goal, swp_entries);
                spin_unlock(&si->lock);
-               if (n_ret || cluster)
+               if (n_ret || size == SWAPFILE_CLUSTER)
                        goto check_out;
                pr_debug("scan_swap_map of si %d failed to find offset\n",
                        si->type);
@@ -1005,7 +1035,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 
 check_out:
        if (n_ret < n_goal)
-               atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+               atomic_long_add((long)(n_goal - n_ret) * size,
                                &nr_swap_pages);
 noswap:
        return n_ret;
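
get_swap_pages() now takes the entry size in pages instead of a bool; the swap-slot cache in mm/swap_slots.c is the real caller. A hedged sketch of the two calling conventions under the new signature:

/* Sketch only: entry_size is 1 for an order-0 page and HPAGE_PMD_NR for a
 * PMD-sized THP (only single-entry requests are supported for THP).
 */
static void get_swap_pages_example(void)
{
        swp_entry_t huge_entry, slots[16];
        int n_huge, n_small;

        n_huge  = get_swap_pages(1, &huge_entry, HPAGE_PMD_NR); /* 0 or 1 */
        n_small = get_swap_pages(16, slots, 1);                  /* 0..16 */

        pr_debug("allocated %d huge and %d small swap entries\n",
                 n_huge, n_small);
}
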
@@ -1107,16 +1137,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
        return p;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-                                      swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+                                             unsigned long offset,
+                                             unsigned char usage)
 {
-       struct swap_cluster_info *ci;
-       unsigned long offset = swp_offset(entry);
        unsigned char count;
        unsigned char has_cache;
 
-       ci = lock_cluster_or_swap_info(p, offset);
-
        count = p->swap_map[offset];
 
        has_cache = count & SWAP_HAS_CACHE;
@@ -1144,7 +1171,20 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
        usage = count | has_cache;
        p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+       return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+                                      swp_entry_t entry, unsigned char usage)
+{
+       struct swap_cluster_info *ci;
+       unsigned long offset = swp_offset(entry);
+
+       ci = lock_cluster_or_swap_info(p, offset);
+       usage = __swap_entry_free_locked(p, offset, usage);
        unlock_cluster_or_swap_info(p, ci);
+       if (!usage)
+               free_swap_slot(entry);
 
        return usage;
 }
@@ -1175,28 +1215,14 @@ void swap_free(swp_entry_t entry)
        struct swap_info_struct *p;
 
        p = _swap_info_get(entry);
-       if (p) {
-               if (!__swap_entry_free(p, entry, 1))
-                       free_swap_slot(entry);
-       }
+       if (p)
+               __swap_entry_free(p, entry, 1);
 }
 
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-       struct swap_info_struct *p;
-
-       p = _swap_info_get(entry);
-       if (p) {
-               if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-                       free_swap_slot(entry);
-       }
-}
-
-#ifdef CONFIG_THP_SWAP
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
        unsigned long offset = swp_offset(entry);
        unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1205,42 +1231,45 @@ static void swapcache_free_cluster(swp_entry_t entry)
        unsigned char *map;
        unsigned int i, free_entries = 0;
        unsigned char val;
+       int size = swap_entry_size(hpage_nr_pages(page));
 
        si = _swap_info_get(entry);
        if (!si)
                return;
 
-       ci = lock_cluster(si, offset);
-       VM_BUG_ON(!cluster_is_huge(ci));
-       map = si->swap_map + offset;
-       for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-               val = map[i];
-               VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-               if (val == SWAP_HAS_CACHE)
-                       free_entries++;
-       }
-       if (!free_entries) {
-               for (i = 0; i < SWAPFILE_CLUSTER; i++)
-                       map[i] &= ~SWAP_HAS_CACHE;
+       ci = lock_cluster_or_swap_info(si, offset);
+       if (size == SWAPFILE_CLUSTER) {
+               VM_BUG_ON(!cluster_is_huge(ci));
+               map = si->swap_map + offset;
+               for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+                       val = map[i];
+                       VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+                       if (val == SWAP_HAS_CACHE)
+                               free_entries++;
+               }
+               cluster_clear_huge(ci);
+               if (free_entries == SWAPFILE_CLUSTER) {
+                       unlock_cluster_or_swap_info(si, ci);
+                       spin_lock(&si->lock);
+                       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+                       swap_free_cluster(si, idx);
+                       spin_unlock(&si->lock);
+                       return;
+               }
        }
-       cluster_clear_huge(ci);
-       unlock_cluster(ci);
-       if (free_entries == SWAPFILE_CLUSTER) {
-               spin_lock(&si->lock);
-               ci = lock_cluster(si, offset);
-               memset(map, 0, SWAPFILE_CLUSTER);
-               unlock_cluster(ci);
-               mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-               swap_free_cluster(si, idx);
-               spin_unlock(&si->lock);
-       } else if (free_entries) {
-               for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
-                       if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-                               free_swap_slot(entry);
+       for (i = 0; i < size; i++, entry.val++) {
+               if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+                       unlock_cluster_or_swap_info(si, ci);
+                       free_swap_slot(entry);
+                       if (i == size - 1)
+                               return;
+                       lock_cluster_or_swap_info(si, offset);
                }
        }
+       unlock_cluster_or_swap_info(si, ci);
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
        struct swap_info_struct *si;
@@ -1255,19 +1284,7 @@ int split_swap_cluster(swp_entry_t entry)
        unlock_cluster(ci);
        return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
-
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-       if (!PageTransHuge(page))
-               swapcache_free(entry);
-       else
-               swapcache_free_cluster(entry);
-}
+#endif
 
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
@@ -1409,7 +1426,6 @@ int swp_swapcount(swp_entry_t entry)
        return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
                                         swp_entry_t entry)
 {
@@ -1422,12 +1438,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 
        ci = lock_cluster_or_swap_info(si, offset);
        if (!ci || !cluster_is_huge(ci)) {
-               if (map[roffset] != SWAP_HAS_CACHE)
+               if (swap_count(map[roffset]))
                        ret = true;
                goto unlock_out;
        }
        for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-               if (map[offset + i] != SWAP_HAS_CACHE) {
+               if (swap_count(map[offset + i])) {
                        ret = true;
                        break;
                }
@@ -1442,7 +1458,7 @@ static bool page_swapped(struct page *page)
        swp_entry_t entry;
        struct swap_info_struct *si;
 
-       if (likely(!PageTransCompound(page)))
+       if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
                return page_swapcount(page) != 0;
 
        page = compound_head(page);
@@ -1466,10 +1482,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
        /* hugetlbfs shouldn't call it */
        VM_BUG_ON_PAGE(PageHuge(page), page);
 
-       if (likely(!PageTransCompound(page))) {
-               mapcount = atomic_read(&page->_mapcount) + 1;
-               if (total_mapcount)
-                       *total_mapcount = mapcount;
+       if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+               mapcount = page_trans_huge_mapcount(page, total_mapcount);
                if (PageSwapCache(page))
                        swapcount = page_swapcount(page);
                if (total_swapcount)
@@ -1516,26 +1530,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
 
        return map_swapcount;
 }
-#else
-#define swap_page_trans_huge_swapped(si, entry)        swap_swapcount(si, entry)
-#define page_swapped(page)                     (page_swapcount(page) != 0)
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
-                                        int *total_swapcount)
-{
-       int mapcount, swapcount = 0;
-
-       /* hugetlbfs shouldn't call it */
-       VM_BUG_ON_PAGE(PageHuge(page), page);
-
-       mapcount = page_trans_huge_mapcount(page, total_mapcount);
-       if (PageSwapCache(page))
-               swapcount = page_swapcount(page);
-       if (total_swapcount)
-               *total_swapcount = swapcount;
-       return mapcount + swapcount;
-}
-#endif
 
 /*
  * We can write to an anon page without COW if there are no other references
@@ -1629,7 +1623,6 @@ int try_to_free_swap(struct page *page)
 int free_swap_and_cache(swp_entry_t entry)
 {
        struct swap_info_struct *p;
-       struct page *page = NULL;
        unsigned char count;
 
        if (non_swap_entry(entry))
@@ -1639,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry)
        if (p) {
                count = __swap_entry_free(p, entry, 1);
                if (count == SWAP_HAS_CACHE &&
-                   !swap_page_trans_huge_swapped(p, entry)) {
-                       page = find_get_page(swap_address_space(entry),
-                                            swp_offset(entry));
-                       if (page && !trylock_page(page)) {
-                               put_page(page);
-                               page = NULL;
-                       }
-               } else if (!count)
-                       free_swap_slot(entry);
-       }
-       if (page) {
-               /*
-                * Not mapped elsewhere, or swap space full? Free it!
-                * Also recheck PageSwapCache now page is locked (above).
-                */
-               if (PageSwapCache(page) && !PageWriteback(page) &&
-                   (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
-                   !swap_page_trans_huge_swapped(p, entry)) {
-                       page = compound_head(page);
-                       delete_from_swap_cache(page);
-                       SetPageDirty(page);
-               }
-               unlock_page(page);
-               put_page(page);
+                   !swap_page_trans_huge_swapped(p, entry))
+                       __try_to_reclaim_swap(p, swp_offset(entry),
+                                             TTRS_UNMAPPED | TTRS_FULL);
        }
        return p != NULL;
 }
@@ -2327,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
                kfree(se);
        }
 
-       if (sis->flags & SWP_FILE) {
+       if (sis->flags & SWP_ACTIVATED) {
                struct file *swap_file = sis->swap_file;
                struct address_space *mapping = swap_file->f_mapping;
 
-               sis->flags &= ~SWP_FILE;
-               mapping->a_ops->swap_deactivate(swap_file);
+               sis->flags &= ~SWP_ACTIVATED;
+               if (mapping->a_ops->swap_deactivate)
+                       mapping->a_ops->swap_deactivate(swap_file);
        }
 }
 
@@ -2381,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
        list_add_tail(&new_se->list, &sis->first_swap_extent.list);
        return 1;
 }
+EXPORT_SYMBOL_GPL(add_swap_extent);
 
 /*
  * A `swap extent' is a simple thing which maps a contiguous range of pages
@@ -2428,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 
        if (mapping->a_ops->swap_activate) {
                ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+               if (ret >= 0)
+                       sis->flags |= SWP_ACTIVATED;
                if (!ret) {
-                       sis->flags |= SWP_FILE;
+                       sis->flags |= SWP_FS;
                        ret = add_swap_extent(sis, 0, sis->max, 0);
                        *span = sis->pages;
                }
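
With SWP_FILE split into SWP_ACTIVATED and SWP_FS, calling swap_deactivate() is tied to the former while the IO path is tied to the latter. A sketch of the three ->swap_activate() outcomes under the new scheme; the filesystem and its extent-mapping helper are hypothetical:

/*
 * Sketch of the return-value convention a filesystem's ->swap_activate()
 * follows after this change:
 *   < 0  error: swapon fails, SWP_ACTIVATED is not set
 *   == 0 SWP_ACTIVATED | SWP_FS: the core adds one whole-file extent and
 *        swap IO goes through the filesystem (e.g. the NFS-style path)
 *   > 0  SWP_ACTIVATED only: the fs mapped the file itself with the now
 *        exported add_swap_extent() and IO goes straight to the block device
 */
static int examplefs_swap_activate(struct swap_info_struct *sis,
                                   struct file *file, sector_t *span)
{
        int nr_extents = examplefs_map_extents(sis, file, span); /* hypothetical */

        return nr_extents;  /* <0 error, 0 use fs IO path, >0 extents added */
}
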
@@ -2909,6 +2885,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
        return 0;
 }
 
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+       return swp_offset(pte_to_swp_entry(
+                       swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+       return generic_max_swapfile_size();
+}
+
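
Because max_swapfile_size() is declared __weak, an architecture can clamp the generic limit further; x86 uses this hook for its L1TF mitigation. A hypothetical override might look like the sketch below, where example_arch_swap_limit() is an assumed helper, not a real API:

/* Hypothetical architecture override (sketch only). */
unsigned long max_swapfile_size(void)
{
        unsigned long pages = generic_max_swapfile_size();

        /* example_arch_swap_limit(): assumed per-arch cap, e.g. MAX_PA/2 */
        return min(pages, example_arch_swap_limit());
}
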
 static unsigned long read_swap_header(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        struct inode *inode)
@@ -2944,22 +2949,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
        p->cluster_next = 1;
        p->cluster_nr = 0;
 
-       /*
-        * Find out how many pages are allowed for a single swap
-        * device. There are two limiting factors: 1) the number
-        * of bits for the swap offset in the swp_entry_t type, and
-        * 2) the number of bits in the swap pte as defined by the
-        * different architectures. In order to find the
-        * largest possible bit mask, a swap entry with swap type 0
-        * and swap offset ~0UL is created, encoded to a swap pte,
-        * decoded to a swp_entry_t again, and finally the swap
-        * offset is extracted. This will mask all the bits from
-        * the initial ~0UL mask that can't be encoded in either
-        * the swp_entry_t or the architecture definition of a
-        * swap pte.
-        */
-       maxpages = swp_offset(pte_to_swp_entry(
-                       swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+       maxpages = max_swapfile_size();
        last_page = swap_header->info.last_page;
        if (!last_page) {
                pr_warn("Empty swap-file\n");
@@ -3731,6 +3721,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
        }
 }
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+                                 gfp_t gfp_mask)
+{
+       struct swap_info_struct *si, *next;
+       if (!(gfp_mask & __GFP_IO) || !memcg)
+               return;
+
+       if (!blk_cgroup_congested())
+               return;
+
+       /*
+        * We've already scheduled a throttle, avoid taking the global swap
+        * lock.
+        */
+       if (current->throttle_queue)
+               return;
+
+       spin_lock(&swap_avail_lock);
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+                                 avail_lists[node]) {
+               if (si->bdev) {
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+                                               true);
+                       break;
+               }
+       }
+       spin_unlock(&swap_avail_lock);
+}
+#endif
+
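
A hedged sketch of where this helper is meant to sit: a swap path calls it before issuing IO so a task whose block cgroup is already congested gets throttled. The call site below is illustrative only, not taken from this diff:

/* Illustrative call site only. */
static void swap_io_example(struct page *page, struct mem_cgroup *memcg,
                            gfp_t gfp_mask)
{
        mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
        /* ... go on to allocate a swap entry / submit swap IO ... */
}
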
 static int __init swapfile_init(void)
 {
        int nid;