rtc: m41t80: remove excess kerneldoc

[linux.git] / mm / huge_memory.c
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index de1f15969e2782edd648a43fc5e7c6d7edacb38b..c5cb6dcd6c69664c4e9c71d02c20ecae53362e9b 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
         return pmd;
  }
  
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
  {
-       /* ->lru in the tail pages is occupied by compound_head. */
-       return &page[2].deferred_list;
+       struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+       if (memcg)
+               return &memcg->deferred_split_queue;
+       else
+               return &pgdat->deferred_split_queue;
+}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+       return &pgdat->deferred_split_queue;
  }
+#endif
  
  void prep_transhuge_page(struct page *page)
  {
@@ -645,40 +659,30 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
   *         available
   * never: never stall for any thp allocation
   */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
  {
         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-       gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
-       struct mempolicy *pol;
-       /*
-        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-        * specified, to express a general desire to stay on the current
-        * node for optimistic allocation attempts. If the defrag mode
-        * and/or madvise hint requires the direct reclaim then we prefer
-        * to fallback to other node rather than node reclaim because that
-        * can lead to excessive reclaim even though there is free memory
-        * on other nodes. We expect that NUMA preferences are specified
-        * by memory policies.
-        */
-       pol = get_vma_policy(vma, addr);
-       if (pol->mode != MPOL_BIND)
-               this_node = __GFP_THISNODE;
-       mpol_cond_put(pol);
-#endif
  
+       /* Always do synchronous compaction */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+       /* Kick kcompactd and fail quickly */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+       /* Synchronous compaction if madvised, otherwise kick kcompactd */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM | this_node);
+               return GFP_TRANSHUGE_LIGHT |
+                       (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                       __GFP_KSWAPD_RECLAIM);
+
+       /* Only do synchronous compaction if madvised */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            this_node);
-       return GFP_TRANSHUGE_LIGHT | this_node;
+               return GFP_TRANSHUGE_LIGHT |
+                      (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+       return GFP_TRANSHUGE_LIGHT;
  }
  
  /* Caller must hold page table lock. */
@@ -750,8 +754,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                         pte_free(vma->vm_mm, pgtable);
                 return ret;
         }
-       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+       gfp = alloc_hugepage_direct_gfpmask(vma);
+       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
         if (unlikely(!page)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -1358,9 +1362,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
  alloc:
         if (__transparent_hugepage_enabled(vma) &&
             !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-                               haddr, numa_node_id());
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
         } else
                 new_page = NULL;
  
@@ -2497,6 +2500,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         struct page *head = compound_head(page);
         pg_data_t *pgdat = page_pgdat(head);
         struct lruvec *lruvec;
+       struct address_space *swap_cache = NULL;
+       unsigned long offset = 0;
         int i;
  
         lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2504,6 +2509,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         /* complete memcg works before add pages to LRU */
         mem_cgroup_split_huge_fixup(head);
  
+       if (PageAnon(head) && PageSwapCache(head)) {
+               swp_entry_t entry = { .val = page_private(head) };
+
+               offset = swp_offset(entry);
+               swap_cache = swap_address_space(entry);
+               xa_lock(&swap_cache->i_pages);
+       }
+
         for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                 __split_huge_page_tail(head, i, lruvec, list);
                 /* Some pages can be beyond i_size: drop them from page cache */
@@ -2513,6 +2526,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                         if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
                                 shmem_uncharge(head->mapping->host, 1);
                         put_page(head + i);
+               } else if (!PageAnon(page)) {
+                       __xa_store(&head->mapping->i_pages, head[i].index,
+                                       head + i, 0);
+               } else if (swap_cache) {
+                       __xa_store(&swap_cache->i_pages, offset + i,
+                                       head + i, 0);
                 }
         }
  
@@ -2523,10 +2542,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         /* See comment in __split_huge_page_tail() */
         if (PageAnon(head)) {
                 /* Additional pin to swap cache */
-               if (PageSwapCache(head))
+               if (PageSwapCache(head)) {
                         page_ref_add(head, 2);
-               else
+                       xa_unlock(&swap_cache->i_pages);
+               } else {
                         page_ref_inc(head);
+               }
         } else {
                 /* Additional pin to page cache */
                 page_ref_add(head, 2);
@@ -2673,6 +2694,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
         struct page *head = compound_head(page);
         struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
         struct anon_vma *anon_vma = NULL;
         struct address_space *mapping = NULL;
         int count, mapcount, extra_pins, ret;
@@ -2759,17 +2781,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         }
  
         /* Prevent deferred_split_scan() touching ->_refcount */
-       spin_lock(&pgdata->split_queue_lock);
+       spin_lock(&ds_queue->split_queue_lock);
         count = page_count(head);
         mapcount = total_mapcount(head);
         if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
                 if (!list_empty(page_deferred_list(head))) {
-                       pgdata->split_queue_len--;
+                       ds_queue->split_queue_len--;
                         list_del(page_deferred_list(head));
                 }
                 if (mapping)
                         __dec_node_page_state(page, NR_SHMEM_THPS);
-               spin_unlock(&pgdata->split_queue_lock);
+               spin_unlock(&ds_queue->split_queue_lock);
                 __split_huge_page(page, list, end, flags);
                 if (PageSwapCache(head)) {
                         swp_entry_t entry = { .val = page_private(head) };
@@ -2786,7 +2808,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                         dump_page(page, "total_mapcount(head) > 0");
                         BUG();
                 }
-               spin_unlock(&pgdata->split_queue_lock);
+               spin_unlock(&ds_queue->split_queue_lock);
  fail:          if (mapping)
                         xa_unlock(&mapping->i_pages);
                 spin_unlock_irqrestore(&pgdata->lru_lock, flags);
@@ -2808,53 +2830,86 @@ fail:           if (mapping)
  
  void free_transhuge_page(struct page *page)
  {
-       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
         unsigned long flags;
  
-       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         if (!list_empty(page_deferred_list(page))) {
-               pgdata->split_queue_len--;
+               ds_queue->split_queue_len--;
                 list_del(page_deferred_list(page));
         }
-       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
         free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
-       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+       struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
         unsigned long flags;
  
         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
-       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+       /*
+        * The try_to_unmap() in page reclaim path might reach here too,
+        * this may cause a race condition to corrupt deferred split queue.
+        * And, if page reclaim is already handling the same page, it is
+        * unnecessary to handle it again in shrinker.
+        *
+        * Check PageSwapCache to determine if the page is being
+        * handled by page reclaim since THP swap would add the page into
+        * swap cache before calling try_to_unmap().
+        */
+       if (PageSwapCache(page))
+               return;
+
+       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         if (list_empty(page_deferred_list(page))) {
                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-               list_add_tail(page_deferred_list(page), &pgdata->split_queue);
-               pgdata->split_queue_len++;
+               list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+               ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+               if (memcg)
+                       memcg_set_shrinker_bit(memcg, page_to_nid(page),
+                                              deferred_split_shrinker.id);
+#endif
         }
-       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
                 struct shrink_control *sc)
  {
         struct pglist_data *pgdata = NODE_DATA(sc->nid);
-       return READ_ONCE(pgdata->split_queue_len);
+       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+       if (sc->memcg)
+               ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+       return READ_ONCE(ds_queue->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
                 struct shrink_control *sc)
  {
         struct pglist_data *pgdata = NODE_DATA(sc->nid);
+       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
         unsigned long flags;
         LIST_HEAD(list), *pos, *next;
         struct page *page;
         int split = 0;
  
-       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+       if (sc->memcg)
+               ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         /* Take pin on all head pages to avoid freeing them under us */
-       list_for_each_safe(pos, next, &pgdata->split_queue) {
+       list_for_each_safe(pos, next, &ds_queue->split_queue) {
                 page = list_entry((void *)pos, struct page, mapping);
                 page = compound_head(page);
                 if (get_page_unless_zero(page)) {
@@ -2862,12 +2917,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                 } else {
                         /* We lost race with put_compound_page() */
                         list_del_init(page_deferred_list(page));
-                       pgdata->split_queue_len--;
+                       ds_queue->split_queue_len--;
                 }
                 if (!--sc->nr_to_scan)
                         break;
         }
-       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  
         list_for_each_safe(pos, next, &list) {
                 page = list_entry((void *)pos, struct page, mapping);
@@ -2881,15 +2936,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                 put_page(page);
         }
  
-       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
-       list_splice_tail(&list, &pgdata->split_queue);
-       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+       list_splice_tail(&list, &ds_queue->split_queue);
+       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  
         /*
          * Stop shrinker if we didn't split any page, but the queue is empty.
          * This can happen if pages were freed under us.
          */
-       if (!split && list_empty(&pgdata->split_queue))
+       if (!split && list_empty(&ds_queue->split_queue))
                 return SHRINK_STOP;
         return split;
  }
@@ -2898,7 +2953,8 @@ static struct shrinker deferred_split_shrinker = {
         .count_objects = deferred_split_count,
         .scan_objects = deferred_split_scan,
         .seeks = DEFAULT_SEEKS,
-       .flags = SHRINKER_NUMA_AWARE,
+       .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+                SHRINKER_NONSLAB,
  };
  
  #ifdef CONFIG_DEBUG_FS