diff --git a/fs/dax.c b/fs/dax.c
index e83aa4077df4fc63e479c377be469b404f77edc5..a8732fbed381a45bbce44fcdf0731ccfdc1a09ba 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include "internal.h"
 
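The new mmu_notifier.h include is for the mmu_notifier_invalidate_page() call made after write-protecting each user mapping in dax_mapping_entry_mkclean(), added in the hunk below. That helper locates mappings via the linear pgoff-to-address translation in pgoff_address(); here is a minimal userspace sketch of the same arithmetic, with hypothetical vm_start/vm_pgoff values and a PAGE_SHIFT of 12 assuming 4 KiB pages:

/*
 * Userspace sketch of the pgoff -> user address translation performed by
 * the pgoff_address() helper added below. The sample values are
 * hypothetical.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long pgoff_address(unsigned long pgoff,
				   unsigned long vm_start,
				   unsigned long vm_pgoff)
{
	/* Offset of the page within the VMA, scaled to bytes. */
	return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
	/* A VMA starting at 0x7f0000000000 that maps file pages 16 and up. */
	unsigned long addr = pgoff_address(20, 0x7f0000000000UL, 16);

	printf("file page 20 maps at 0x%lx\n", addr);	/* 0x7f0000004000 */
	return 0;
}
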
@@ -614,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
        return new_entry;
 }
 
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+       unsigned long address;
+
+       address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+       return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+                                     pgoff_t index, unsigned long pfn)
+{
+       struct vm_area_struct *vma;
+       pte_t *ptep;
+       pte_t pte;
+       spinlock_t *ptl;
+       bool changed;
+
+       i_mmap_lock_read(mapping);
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+               unsigned long address;
+
+               cond_resched();
+
+               if (!(vma->vm_flags & VM_SHARED))
+                       continue;
+
+               address = pgoff_address(index, vma);
+               changed = false;
+               if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+                       continue;
+               if (pfn != pte_pfn(*ptep))
+                       goto unlock;
+               if (!pte_dirty(*ptep) && !pte_write(*ptep))
+                       goto unlock;
+
+               flush_cache_page(vma, address, pfn);
+               pte = ptep_clear_flush(vma, address, ptep);
+               pte = pte_wrprotect(pte);
+               pte = pte_mkclean(pte);
+               set_pte_at(vma->vm_mm, address, ptep, pte);
+               changed = true;
+unlock:
+               pte_unmap_unlock(ptep, ptl);
+
+               if (changed)
+                       mmu_notifier_invalidate_page(vma->vm_mm, address);
+       }
+       i_mmap_unlock_read(mapping);
+}
+
 static int dax_writeback_one(struct block_device *bdev,
                struct address_space *mapping, pgoff_t index, void *entry)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       struct radix_tree_node *node;
        struct blk_dax_ctl dax;
-       void **slot;
+       void *entry2, **slot;
        int ret = 0;
 
-       spin_lock_irq(&mapping->tree_lock);
        /*
-        * Regular page slots are stabilized by the page lock even
-        * without the tree itself locked.  These unlocked entries
-        * need verification under the tree lock.
+        * A page got tagged dirty in DAX mapping? Something is seriously
+        * wrong.
         */
-       if (!__radix_tree_lookup(page_tree, index, &node, &slot))
-               goto unlock;
-       if (*slot != entry)
-               goto unlock;
-
-       /* another fsync thread may have already written back this entry */
-       if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
-               goto unlock;
+       if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+               return -EIO;
 
+       spin_lock_irq(&mapping->tree_lock);
+       entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* Entry got punched out / reallocated? */
+       if (!entry2 || !radix_tree_exceptional_entry(entry2))
+               goto put_unlocked;
+       /*
+        * Entry got reallocated elsewhere? No need to write it back. We
+        * have to compare sectors as we must not bail out due to a
+        * difference in the lock bit or entry type.
+        */
+       if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+               goto put_unlocked;
        if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
                                dax_is_zero_entry(entry))) {
                ret = -EIO;
-               goto unlock;
+               goto put_unlocked;
        }
 
+       /* Another fsync thread may have already written back this entry */
+       if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+               goto put_unlocked;
+       /* Lock the entry to serialize with page faults */
+       entry = lock_slot(mapping, slot);
+       /*
+        * We can clear the tag now, but we have to be careful that
+        * concurrent dax_writeback_one() calls for the same index do not
+        * finish before we actually flush the caches. This works because
+        * such calls inspect the entry only under tree_lock, where they
+        * will see the entry locked and wait for it to unlock.
+        */
+       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+       spin_unlock_irq(&mapping->tree_lock);
+
        /*
         * Even if dax_writeback_mapping_range() was given a wbc->range_start
         * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -653,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
         */
        dax.sector = dax_radix_sector(entry);
        dax.size = PAGE_SIZE << dax_radix_order(entry);
-       spin_unlock_irq(&mapping->tree_lock);
 
        /*
         * We cannot hold tree_lock while calling dax_map_atomic() because it
         * eventually calls cond_resched().
         */
        ret = dax_map_atomic(bdev, &dax);
-       if (ret < 0)
+       if (ret < 0) {
+               put_locked_mapping_entry(mapping, index, entry);
                return ret;
+       }
 
        if (WARN_ON_ONCE(ret < dax.size)) {
                ret = -EIO;
                goto unmap;
        }
 
+       dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
        wb_cache_pmem(dax.addr, dax.size);
-
+       /*
+        * After we have flushed the cache, we can clear the dirty tag. There
+        * cannot be new dirty data in the pfn after the flush has completed as
+        * the pfn mappings are writeprotected and fault waits for mapping
+        * entry lock.
+        */
        spin_lock_irq(&mapping->tree_lock);
-       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
        spin_unlock_irq(&mapping->tree_lock);
  unmap:
        dax_unmap_atomic(bdev, &dax);
+       put_locked_mapping_entry(mapping, index, entry);
        return ret;
 
- unlock:
+ put_unlocked:
+       put_unlocked_mapping_entry(mapping, index, entry2);
        spin_unlock_irq(&mapping->tree_lock);
        return ret;
 }
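
The rewritten dax_writeback_one() above enforces a strict ordering: validate the entry under tree_lock, lock it to serialize with page faults, clear TOWRITE before the flush (safe, because racing callers inspect the entry only under tree_lock and will wait on the lock bit), and clear DIRTY only after user mappings have been write-protected and CPU caches flushed. Below is a toy, single-threaded model of that tag discipline; all names are illustrative, not kernel API:

/*
 * Toy, single-threaded model of the tag discipline in the rewritten
 * dax_writeback_one(). Illustrative names only, not kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_entry {
	bool locked;		/* stands in for the entry's lock bit */
	bool towrite;		/* PAGECACHE_TAG_TOWRITE */
	bool dirty;		/* PAGECACHE_TAG_DIRTY */
};

static void toy_writeback_one(struct toy_entry *e)
{
	if (!e->towrite)
		return;		/* another thread already wrote it back */

	e->locked = true;	/* serialize with page faults */
	e->towrite = false;	/* safe: racers see the lock bit and wait */

	puts("write-protect user mappings, flush CPU caches");

	e->dirty = false;	/* no new dirty data can appear: mappings
				   are write-protected and faults block on
				   the entry lock */
	e->locked = false;
}

int main(void)
{
	struct toy_entry e = { .towrite = true, .dirty = true };

	toy_writeback_one(&e);
	printf("towrite=%d dirty=%d\n", e.towrite, e.dirty);
	return 0;
}
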
@@ -766,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
-       void *entry;
+       void *entry, **slot;
        pgoff_t index = vmf->pgoff;
 
        spin_lock_irq(&mapping->tree_lock);
-       entry = get_unlocked_mapping_entry(mapping, index, NULL);
-       if (!entry || !radix_tree_exceptional_entry(entry))
-               goto out;
+       entry = get_unlocked_mapping_entry(mapping, index, &slot);
+       if (!entry || !radix_tree_exceptional_entry(entry)) {
+               if (entry)
+                       put_unlocked_mapping_entry(mapping, index, entry);
+               spin_unlock_irq(&mapping->tree_lock);
+               return VM_FAULT_NOPAGE;
+       }
        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-       put_unlocked_mapping_entry(mapping, index, entry);
-out:
+       entry = lock_slot(mapping, slot);
        spin_unlock_irq(&mapping->tree_lock);
+       /*
+        * If we race with somebody updating the PTE and finish_mkwrite_fault()
+        * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+        * the fault in either case.
+        */
+       finish_mkwrite_fault(vmf);
+       put_locked_mapping_entry(mapping, index, entry);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
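
On the fault side, dax_pfn_mkwrite() now holds the entry lock across finish_mkwrite_fault(), so a concurrent dax_writeback_one() cannot write-protect and clean the pfn between the DIRTY tag being set and the PTE being made writable. A toy model of that fault-side ordering, again with illustrative names only:

/*
 * Toy model of the fault-side protocol: tag dirty, then hold the entry
 * lock across the PTE update. Illustrative only; in the kernel the PTE
 * update is finish_mkwrite_fault() and the lock is the radix tree
 * entry's lock bit.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_entry {
	bool locked;
	bool dirty;		/* PAGECACHE_TAG_DIRTY */
};

static void toy_pfn_mkwrite(struct toy_entry *e)
{
	e->dirty = true;	/* tag set under tree_lock in the kernel */
	e->locked = true;	/* keep writeback's mkclean out ... */

	puts("make the PTE writable (finish_mkwrite_fault)");

	e->locked = false;	/* ... until the PTE is writable */
}

int main(void)
{
	struct toy_entry e = { 0 };

	toy_pfn_mkwrite(&e);
	printf("dirty=%d locked=%d\n", e.dirty, e.locked);
	return 0;
}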