diff --git a/mm/filemap.c b/mm/filemap.c
index 50b52fe51937ca70e62a33ab1553aef9b77ad1a0..3f9afded581be1a013bda4db2c0ec3a721323364 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -132,44 +132,28 @@ static int page_cache_tree_insert(struct address_space *mapping,
                if (!dax_mapping(mapping)) {
                        if (shadowp)
                                *shadowp = p;
-                       if (node)
-                               workingset_node_shadows_dec(node);
                } else {
                        /* DAX can replace empty locked entry with a hole */
                        WARN_ON_ONCE(p !=
-                               (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-                                        RADIX_DAX_ENTRY_LOCK));
-                       /* DAX accounts exceptional entries as normal pages */
-                       if (node)
-                               workingset_node_pages_dec(node);
+                               dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
                        /* Wakeup waiters for exceptional entry lock */
-                       dax_wake_mapping_entry_waiter(mapping, page->index,
-                                                     false);
+                       dax_wake_mapping_entry_waiter(mapping, page->index, p,
+                                                     true);
                }
        }
-       radix_tree_replace_slot(slot, page);
+       __radix_tree_replace(&mapping->page_tree, node, slot, page,
+                            workingset_update_node, mapping);
        mapping->nrpages++;
-       if (node) {
-               workingset_node_pages_inc(node);
-               /*
-                * Don't track node that contains actual pages.
-                *
-                * Avoid acquiring the list_lru lock if already
-                * untracked.  The list_empty() test is safe as
-                * node->private_list is protected by
-                * mapping->tree_lock.
-                */
-               if (!list_empty(&node->private_list))
-                       list_lru_del(&workingset_shadow_nodes,
-                                    &node->private_list);
-       }
        return 0;
 }
 
 static void page_cache_tree_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
 {
-       int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+       int i, nr;
+
+       /* hugetlb pages are represented by one entry in the radix tree */
+       nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
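This hunk and the next stop doing the workingset shadow-node bookkeeping by hand: instead of radix_tree_replace_slot() plus open-coded workingset_node_*() counters and list_lru_add()/list_lru_del() calls, slots are now updated through __radix_tree_replace(), which invokes the workingset_update_node callback whenever a node's population changes. A rough sketch of what such a callback does follows; the real workingset_update_node() lives in mm/workingset.c, the function name below is made up, and the sketch leaves out the DAX/shmem checks the real one also makes:

    static void update_node_sketch(struct radix_tree_node *node, void *private)
    {
            struct address_space *mapping = private;

            /*
             * Track nodes that hold nothing but shadow (exceptional) entries
             * on the shadow-node LRU so they can be reclaimed; unlink them
             * again as soon as they hold real pages.  The list_empty() test
             * is stable here because node->private_list is protected by
             * mapping->tree_lock, which the caller holds.
             */
            if (node->count && node->count == node->exceptional) {
                    if (list_empty(&node->private_list)) {
                            node->private_data = mapping;
                            list_lru_add(&workingset_shadow_nodes,
                                         &node->private_list);
                    }
            } else {
                    if (!list_empty(&node->private_list))
                            list_lru_del(&workingset_shadow_nodes,
                                         &node->private_list);
            }
    }

With the accounting centralised in the callback, page_cache_tree_insert() above and page_cache_tree_delete() in the next hunk only have to replace the slot and let the callback keep the shadow-node LRU consistent.
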
@@ -182,44 +166,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
                __radix_tree_lookup(&mapping->page_tree, page->index + i,
                                    &node, &slot);
 
-               radix_tree_clear_tags(&mapping->page_tree, node, slot);
-
-               if (!node) {
-                       VM_BUG_ON_PAGE(nr != 1, page);
-                       /*
-                        * We need a node to properly account shadow
-                        * entries. Don't plant any without. XXX
-                        */
-                       shadow = NULL;
-               }
-
-               radix_tree_replace_slot(slot, shadow);
-
-               if (!node)
-                       break;
-
-               workingset_node_pages_dec(node);
-               if (shadow)
-                       workingset_node_shadows_inc(node);
-               else
-                       if (__radix_tree_delete_node(&mapping->page_tree, node))
-                               continue;
+               VM_BUG_ON_PAGE(!node && nr != 1, page);
 
-               /*
-                * Track node that only contains shadow entries. DAX mappings
-                * contain no shadow entries and may contain other exceptional
-                * entries so skip those.
-                *
-                * Avoid acquiring the list_lru lock if already tracked.
-                * The list_empty() test is safe as node->private_list is
-                * protected by mapping->tree_lock.
-                */
-               if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
-                               list_empty(&node->private_list)) {
-                       node->private_data = mapping;
-                       list_lru_add(&workingset_shadow_nodes,
-                                       &node->private_list);
-               }
+               radix_tree_clear_tags(&mapping->page_tree, node, slot);
+               __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+                                    workingset_update_node, mapping);
        }
 
        if (shadow) {
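The next hunk replaces the per-bit waitqueues from bit_waitqueue() with a fixed 256-entry hash table of wait queues plus a new PG_waiters page flag. Because unrelated pages can hash to the same queue head, wake_page_function() filters each queued entry by page and bit, and the page_match flag tells the waker whether any waiter for this particular page was found, which is what gates clearing PG_waiters. Below is a compilable user-space analogue of the data structure, to illustrate why collisions are tolerable; the names and the crude pointer hash are made up for the illustration and are not part of the patch:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>

    #define TABLE_BITS 8
    #define TABLE_SIZE (1u << TABLE_BITS)

    struct waitq {
            pthread_mutex_t lock;
            pthread_cond_t cond;
    };

    static struct waitq wait_table[TABLE_SIZE];

    static void wait_table_init(void)              /* compare pagecache_init() */
    {
            for (unsigned int i = 0; i < TABLE_SIZE; i++) {
                    pthread_mutex_init(&wait_table[i].lock, NULL);
                    pthread_cond_init(&wait_table[i].cond, NULL);
            }
    }

    static struct waitq *obj_waitqueue(const void *obj)
    {
            uintptr_t h = (uintptr_t)obj;          /* crude stand-in for hash_ptr() */

            h ^= h >> 9;
            return &wait_table[h & (TABLE_SIZE - 1)];
    }

    /* Sleep until bit 'nr' of *flags is clear; tolerate wakeups that were
     * really meant for another object hashing to the same queue. */
    static void wait_for_bit_clear(atomic_uint *flags, unsigned int nr, const void *obj)
    {
            struct waitq *q = obj_waitqueue(obj);

            pthread_mutex_lock(&q->lock);
            while (atomic_load(flags) & (1u << nr))
                    pthread_cond_wait(&q->cond, &q->lock);
            pthread_mutex_unlock(&q->lock);
    }

    /* Clear bit 'nr' and wake everyone on the (shared) queue; waiters for
     * unrelated objects simply recheck their own flag and sleep again. */
    static void clear_bit_and_wake(atomic_uint *flags, unsigned int nr, const void *obj)
    {
            struct waitq *q = obj_waitqueue(obj);

            pthread_mutex_lock(&q->lock);
            atomic_fetch_and(flags, ~(1u << nr));
            pthread_cond_broadcast(&q->cond);
            pthread_mutex_unlock(&q->lock);
    }

As in the kernel code, a wakeup can be spurious for a given object when two objects share a slot; correctness comes from every waiter rechecking its own flag after waking, which is exactly what wait_on_page_bit_common() does with test_bit() and test_and_set_bit_lock().
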
@@ -788,45 +739,159 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-wait_queue_head_t *page_waitqueue(struct page *page)
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
 {
-       return bit_waitqueue(page, 0);
+       return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
 }
-EXPORT_SYMBOL(page_waitqueue);
 
-void wait_on_page_bit(struct page *page, int bit_nr)
+void __init pagecache_init(void)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       int i;
 
-       if (test_bit(bit_nr, &page->flags))
-               __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
-                                                       TASK_UNINTERRUPTIBLE);
+       for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+               init_waitqueue_head(&page_wait_table[i]);
+
+       page_writeback_init();
 }
-EXPORT_SYMBOL(wait_on_page_bit);
 
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+struct wait_page_key {
+       struct page *page;
+       int bit_nr;
+       int page_match;
+};
+
+struct wait_page_queue {
+       struct page *page;
+       int bit_nr;
+       wait_queue_t wait;
+};
+
+static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       struct wait_page_key *key = arg;
+       struct wait_page_queue *wait_page
+               = container_of(wait, struct wait_page_queue, wait);
+
+       if (wait_page->page != key->page)
+               return 0;
+       key->page_match = 1;
 
-       if (!test_bit(bit_nr, &page->flags))
+       if (wait_page->bit_nr != key->bit_nr)
+               return 0;
+       if (test_bit(key->bit_nr, &key->page->flags))
                return 0;
 
-       return __wait_on_bit(page_waitqueue(page), &wait,
-                            bit_wait_io, TASK_KILLABLE);
+       return autoremove_wake_function(wait, mode, sync, key);
 }
 
-int wait_on_page_bit_killable_timeout(struct page *page,
-                                      int bit_nr, unsigned long timeout)
+void wake_up_page_bit(struct page *page, int bit_nr)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+       wait_queue_head_t *q = page_waitqueue(page);
+       struct wait_page_key key;
+       unsigned long flags;
 
-       wait.key.timeout = jiffies + timeout;
-       if (!test_bit(bit_nr, &page->flags))
-               return 0;
-       return __wait_on_bit(page_waitqueue(page), &wait,
-                            bit_wait_io_timeout, TASK_KILLABLE);
+       key.page = page;
+       key.bit_nr = bit_nr;
+       key.page_match = 0;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __wake_up_locked_key(q, TASK_NORMAL, &key);
+       /*
+        * It is possible for other pages to have collided on the waitqueue
+        * hash, so in that case check for a page match. That prevents a long-
+        * term waiter from being stranded with PG_waiters prematurely cleared.
+        *
+        * It is still possible to miss a case here, when we woke page waiters
+        * and removed them from the waitqueue, but there are still other
+        * page waiters.
+        */
+       if (!waitqueue_active(q) || !key.page_match) {
+               ClearPageWaiters(page);
+               /*
+                * It's possible to miss clearing Waiters here, when we woke
+                * our page waiters, but the hashed waitqueue has waiters for
+                * other pages on it.
+                *
+                * That's okay, it's a rare case. The next waker will clear it.
+                */
+       }
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(wake_up_page_bit);
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+               struct page *page, int bit_nr, int state, bool lock)
+{
+       struct wait_page_queue wait_page;
+       wait_queue_t *wait = &wait_page.wait;
+       int ret = 0;
+
+       init_wait(wait);
+       wait->func = wake_page_function;
+       wait_page.page = page;
+       wait_page.bit_nr = bit_nr;
+
+       for (;;) {
+               spin_lock_irq(&q->lock);
+
+               if (likely(list_empty(&wait->task_list))) {
+                       if (lock)
+                               __add_wait_queue_tail_exclusive(q, wait);
+                       else
+                               __add_wait_queue(q, wait);
+                       SetPageWaiters(page);
+               }
+
+               set_current_state(state);
+
+               spin_unlock_irq(&q->lock);
+
+               if (likely(test_bit(bit_nr, &page->flags))) {
+                       io_schedule();
+                       if (unlikely(signal_pending_state(state, current))) {
+                               ret = -EINTR;
+                               break;
+                       }
+               }
+
+               if (lock) {
+                       if (!test_and_set_bit_lock(bit_nr, &page->flags))
+                               break;
+               } else {
+                       if (!test_bit(bit_nr, &page->flags))
+                               break;
+               }
+       }
+
+       finish_wait(q, wait);
+
+       /*
+        * A signal could leave PageWaiters set. Clearing it here if
+        * !waitqueue_active would be possible (by open-coding finish_wait),
+        * but still fail to catch it in the case of wait hash collision. We
+        * already can fail to clear wait hash collision cases, so don't
+        * bother with signals either.
+        */
+
+       return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
 }
-EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
 
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -842,10 +907,34 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
 
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue(q, waiter);
+       SetPageWaiters(page);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_page_wait_queue);
 
+#ifndef clear_bit_unlock_is_negative_byte
+
+/*
+ * PG_waiters is the high bit in the same byte as PG_locked.
+ *
+ * On x86 (and on many other architectures), we can clear PG_locked and
+ * test the sign bit at the same time. But if the architecture does
+ * not support that special operation, we just do this all by hand
+ * instead.
+ *
+ * The read of PG_waiters has to be after (or concurrently with) PG_locked
+ * being cleared, but a memory barrier should be unnecessary since it is
+ * in the same byte as PG_locked.
+ */
+static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
+{
+       clear_bit_unlock(nr, mem);
+       /* smp_mb__after_atomic(); */
+       return test_bit(PG_waiters, mem);
+}
+
+#endif
+
 /**
  * unlock_page - unlock a locked page
  * @page: the page
@@ -855,16 +944,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
  * mechanism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
  */
 void unlock_page(struct page *page)
 {
+       BUILD_BUG_ON(PG_waiters != 7);
        page = compound_head(page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
-       clear_bit_unlock(PG_locked, &page->flags);
-       smp_mb__after_atomic();
-       wake_up_page(page, PG_locked);
+       if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+               wake_up_page_bit(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
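In the hunk above, unlock_page() pairs the PG_locked clear with a test of PG_waiters, which this series places in the high (sign) bit of the same byte, hence the BUILD_BUG_ON(PG_waiters != 7). The common case of an uncontended unlock therefore never looks up or locks the hashed waitqueue; only when a waiter has set PG_waiters does wake_up_page_bit() run. Architectures can do the clear-and-test as a single atomic operation (on x86 a locked byte AND whose sign flag is the answer); the generic clear_bit_unlock_is_negative_byte() fallback added earlier simply clears and then reads back. A user-space model of the intended semantics, using the GCC/Clang __atomic builtins, is sketched below; the function name is made up and this is illustration, not kernel code:

    #include <stdbool.h>

    /*
     * Atomically clear bit 'nr' of the byte with release ordering and report
     * whether the byte is negative afterwards, i.e. whether bit 7 (the
     * PG_waiters position assumed by this patch) is still set.
     */
    static bool clear_bit_unlock_is_negative_byte_model(unsigned int nr,
                                                        unsigned char *byte)
    {
            unsigned char old = __atomic_fetch_and(byte,
                                                   (unsigned char)~(1u << nr),
                                                   __ATOMIC_RELEASE);

            return (signed char)(old & (unsigned char)~(1u << nr)) < 0;
    }

The next hunk routes __lock_page() and __lock_page_killable() through the same wait_on_page_bit_common() helper, with lock=true selecting an exclusive wait plus test_and_set_bit_lock(), so each wakeup hands the lock to at most one contender.
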
 
@@ -923,23 +1015,19 @@ EXPORT_SYMBOL_GPL(page_endio);
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
  */
-void __lock_page(struct page *page)
+void __lock_page(struct page *__page)
 {
-       struct page *page_head = compound_head(page);
-       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-       __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
-                                                       TASK_UNINTERRUPTIBLE);
+       struct page *page = compound_head(__page);
+       wait_queue_head_t *q = page_waitqueue(page);
+       wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
 }
 EXPORT_SYMBOL(__lock_page);
 
-int __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *__page)
 {
-       struct page *page_head = compound_head(page);
-       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-       return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
-                                       bit_wait_io, TASK_KILLABLE);
+       struct page *page = compound_head(__page);
+       wait_queue_head_t *q = page_waitqueue(page);
+       return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
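The two hunks that follow make small behavioural fixes to the buffered read path, do_generic_file_read(). First, a read that starts at or beyond the filesystem's s_maxbytes limit now returns 0 (plain EOF) instead of failing with -EINVAL. Second, the per-page loop checks fatal_signal_pending(), so a task hit by SIGKILL (for instance by the OOM killer) can abandon a long read instead of faulting in the rest of the range. A sketch of the user-visible effect of the first change, where fd and off are placeholders and off is assumed to be at or past s_maxbytes:

    #include <sys/types.h>
    #include <unistd.h>

    static ssize_t read_past_fs_limit(int fd, off_t off)
    {
            char buf[4096];

            /* Before this change: returns -1 with errno == EINVAL.
             * After it: returns 0, i.e. ordinary end-of-file.      */
            return pread(fd, buf, sizeof(buf), off);
    }

For the signal check, -EINTR is only what the caller sees when nothing has been copied yet; once some bytes have been transferred, the function's existing exit path returns the partial count instead.
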
 
@@ -1686,7 +1774,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
        int error = 0;
 
        if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
-               return -EINVAL;
+               return 0;
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
        index = *ppos >> PAGE_SHIFT;
@@ -1703,6 +1791,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 
                cond_resched();
 find_page:
+               if (fatal_signal_pending(current)) {
+                       error = -EINTR;
+                       goto out;
+               }
+
                page = find_get_page(mapping, index);
                if (!page) {
                        page_cache_sync_readahead(mapping,
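The remaining hunks are this file's share of the tree-wide conversion away from struct fault_env: fault handlers are handed the struct vm_fault directly, so filemap_map_pages() takes vmf and uses vmf->address, vmf->pte and vmf->pmd where it previously went through fe. For reference, a sketch of the resulting ->map_pages hook, with the other vm_operations_struct members omitted:

    struct vm_operations_struct {
            /* ... */
            void (*map_pages)(struct vm_fault *vmf,
                              pgoff_t start_pgoff, pgoff_t end_pgoff);
            /* ... */
    };
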
@@ -2213,12 +2306,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
        struct radix_tree_iter iter;
        void **slot;
-       struct file *file = fe->vma->vm_file;
+       struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        loff_t size;
@@ -2274,11 +2367,11 @@ void filemap_map_pages(struct fault_env *fe,
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
 
-               fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
-               if (fe->pte)
-                       fe->pte += iter.index - last_pgoff;
+               vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+               if (vmf->pte)
+                       vmf->pte += iter.index - last_pgoff;
                last_pgoff = iter.index;
-               if (alloc_set_pte(fe, NULL, page))
+               if (alloc_set_pte(vmf, NULL, page))
                        goto unlock;
                unlock_page(page);
                goto next;
@@ -2288,7 +2381,7 @@ void filemap_map_pages(struct fault_env *fe,
                put_page(page);
 next:
                /* Huge page is mapped? No need to proceed. */
-               if (pmd_trans_huge(*fe->pmd))
+               if (pmd_trans_huge(*vmf->pmd))
                        break;
                if (iter.index == end_pgoff)
                        break;