btrfs: wait on ordered extents on abort cleanup

[linux.git] / fs / btrfs / disk-io.c
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 6d776717d8b39b566e6ec14f479648ad6f788905..18eefc5b25327c14c937455ca41c3a07686eb5d2 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
  
         len = buf->len - offset;
         while (len > 0) {
+               /*
+                * Note: we don't need to check for the err == 1 case here, as
+                * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
+                * and 'min_len = 32' and the currently implemented mapping
+                * algorithm we cannot cross a page boundary.
+                */
                 err = map_private_extent_buffer(buf, offset, 32,
                                         &kaddr, &map_start, &map_len);
                 if (err)
@@ -542,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
         if (WARN_ON(!PageUptodate(page)))
                 return -EUCLEAN;
  
-       ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+       ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
                         btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
  
         return csum_tree_block(fs_info, eb, 0);
@@ -557,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
  
         read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
         while (fs_devices) {
-               if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+               u8 *metadata_uuid;
+
+               /*
+                * Checking the incompat flag is only valid for the current
+                * fs. For seed devices it's forbidden to have their uuid
+                * changed, so reading ->fsid in this case is fine.
+                */
+               if (fs_devices == fs_info->fs_devices &&
+                   btrfs_fs_incompat(fs_info, METADATA_UUID))
+                       metadata_uuid = fs_devices->metadata_uuid;
+               else
+                       metadata_uuid = fs_devices->fsid;
+
+               if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
                         ret = 0;
                         break;
                 }
@@ -660,19 +679,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         return ret;
  }
  
-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
-       struct extent_buffer *eb;
-
-       eb = (struct extent_buffer *)page->private;
-       set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
-       eb->read_mirror = failed_mirror;
-       atomic_dec(&eb->io_pages);
-       if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-               btree_readahead_hook(eb, -EIO);
-       return -EIO;    /* we fixed nothing */
-}
-
  static void end_workqueue_bio(struct bio *bio)
  {
         struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
@@ -751,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work)
                 async->status = ret;
  }
  
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
  static void run_one_async_done(struct btrfs_work *work)
  {
         struct async_submit_bio *async;
+       struct inode *inode;
+       blk_status_t ret;
  
         async = container_of(work, struct  async_submit_bio, work);
+       inode = async->private_data;
  
         /* If an error occurred we just want to clean up the bio and move on */
         if (async->status) {
@@ -764,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work)
                 return;
         }
  
-       btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
+       ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
+                       async->mirror_num, 1);
+       if (ret) {
+               async->bio->bi_status = ret;
+               bio_endio(async->bio);
+       }
  }
  
  static void run_one_async_free(struct btrfs_work *work)
@@ -1178,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
         refcount_set(&root->refs, 1);
         atomic_set(&root->will_be_snapshotted, 0);
         atomic_set(&root->snapshot_force_cow, 0);
+       atomic_set(&root->nr_swapfiles, 0);
         root->log_transid = 0;
         root->log_transid_committed = -1;
         root->last_log_commit = 0;
@@ -2118,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
  {
         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
-       rwlock_init(&fs_info->dev_replace.lock);
-       atomic_set(&fs_info->dev_replace.blocking_readers, 0);
+       init_rwsem(&fs_info->dev_replace.rwsem);
         init_waitqueue_head(&fs_info->dev_replace.replace_wait);
-       init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
  }
  
  static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2442,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info,
                 ret = -EINVAL;
         }
  
-       if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+       if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+                  BTRFS_FSID_SIZE) != 0) {
                 btrfs_err(fs_info,
-                          "dev_item UUID does not match fsid: %pU != %pU",
-                          fs_info->fsid, sb->dev_item.fsid);
+                       "dev_item UUID does not match metadata fsid: %pU != %pU",
+                       fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
                 ret = -EINVAL;
         }
  
@@ -2656,6 +2678,9 @@ int open_ctree(struct super_block *sb,
         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
                              BTRFS_BLOCK_RSV_DELOPS);
+       btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+                            BTRFS_BLOCK_RSV_DELREFS);
+
         atomic_set(&fs_info->async_delalloc_pages, 0);
         atomic_set(&fs_info->defrag_running, 0);
         atomic_set(&fs_info->qgroup_op_seq, 0);
@@ -2745,6 +2770,9 @@ int open_ctree(struct super_block *sb,
         fs_info->sectorsize = 4096;
         fs_info->stripesize = 4096;
  
+       spin_lock_init(&fs_info->swapfile_pins_lock);
+       fs_info->swapfile_pins = RB_ROOT;
+
         ret = btrfs_alloc_stripe_hash_table(fs_info);
         if (ret) {
                 err = ret;
@@ -2781,11 +2809,29 @@ int open_ctree(struct super_block *sb,
          * the whole block of INFO_SIZE
          */
         memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
-       memcpy(fs_info->super_for_commit, fs_info->super_copy,
-              sizeof(*fs_info->super_for_commit));
         brelse(bh);
  
-       memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+       disk_super = fs_info->super_copy;
+
+       ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+                      BTRFS_FSID_SIZE));
+
+       if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
+               ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
+                               fs_info->super_copy->metadata_uuid,
+                               BTRFS_FSID_SIZE));
+       }
+
+       features = btrfs_super_flags(disk_super);
+       if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+               features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
+               btrfs_set_super_flags(disk_super, features);
+               btrfs_info(fs_info,
+                       "found metadata UUID change in progress flag, clearing");
+       }
+
+       memcpy(fs_info->super_for_commit, fs_info->super_copy,
+              sizeof(*fs_info->super_for_commit));
  
         ret = btrfs_validate_mount_super(fs_info);
         if (ret) {
@@ -2794,7 +2840,6 @@ int open_ctree(struct super_block *sb,
                 goto fail_alloc;
         }
  
-       disk_super = fs_info->super_copy;
         if (!btrfs_super_root(disk_super))
                 goto fail_alloc;
  
@@ -2906,7 +2951,7 @@ int open_ctree(struct super_block *sb,
  
         sb->s_blocksize = sectorsize;
         sb->s_blocksize_bits = blksize_bits(sectorsize);
-       memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
+       memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
  
         mutex_lock(&fs_info->chunk_mutex);
         ret = btrfs_read_sys_array(fs_info);
@@ -3055,7 +3100,7 @@ int open_ctree(struct super_block *sb,
  
         if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
                 btrfs_warn(fs_info,
-               "writeable mount is not allowed due to too many missing devices");
+               "writable mount is not allowed due to too many missing devices");
                 goto fail_sysfs;
         }
  
@@ -3724,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
-               memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
+               memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+                      BTRFS_FSID_SIZE);
  
                 flags = btrfs_super_flags(sb);
                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
@@ -4031,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
         /*
          * This is a fast path so only do this check if we have sanity tests
-        * enabled.  Normal people shouldn't be using umapped buffers as dirty
+        * enabled.  Normal people shouldn't be using unmapped buffers as dirty
          * outside of the sanity tests.
          */
         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
@@ -4155,6 +4201,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
                 spin_lock(&fs_info->ordered_root_lock);
         }
         spin_unlock(&fs_info->ordered_root_lock);
+
+       /*
+        * We need this here because if we've been flipped read-only we won't
+        * get sync() from the umount, so we need to make sure any ordered
+        * extents whose dirty pages have not started writeout yet actually
+        * get run and error out properly.
+        */
+       btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
  }
  
  static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4219,6 +4273,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                 if (pin_bytes)
                         btrfs_pin_extent(fs_info, head->bytenr,
                                          head->num_bytes, 1);
+               btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
                 btrfs_put_delayed_ref_head(head);
                 cond_resched();
                 spin_lock(&delayed_refs->lock);
@@ -4329,6 +4384,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
         unpin = pinned_extents;
  again:
         while (1) {
+               struct extent_state *cached_state = NULL;
+
                 /*
                  * The btrfs_finish_extent_commit() may get the same range as
                  * ours between find_first_extent_bit and clear_extent_dirty.
@@ -4337,13 +4394,14 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
                  */
                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
                 ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY, NULL);
+                                           EXTENT_DIRTY, &cached_state);
                 if (ret) {
                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                         break;
                 }
  
-               clear_extent_dirty(unpin, start, end);
+               clear_extent_dirty(unpin, start, end, &cached_state);
+               free_extent_state(cached_state);
                 btrfs_error_unpin_extent_range(fs_info, start, end);
                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                 cond_resched();
@@ -4400,6 +4458,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
  
                 spin_unlock(&cur_trans->dirty_bgs_lock);
                 btrfs_put_block_group(cache);
+               btrfs_delayed_refs_rsv_release(fs_info, 1);
                 spin_lock(&cur_trans->dirty_bgs_lock);
         }
         spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4505,7 +4564,4 @@ static const struct extent_io_ops btree_extent_io_ops = {
         /* mandatory callbacks */
         .submit_bio_hook = btree_submit_bio_hook,
         .readpage_end_io_hook = btree_readpage_end_io_hook,
-       .readpage_io_failed_hook = btree_io_failed_hook,
-
-       /* optional callbacks */
  };