1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2008 Red Hat. All rights reserved.
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
15 #include "free-space-cache.h"
16 #include "transaction.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
21 #include "space-info.h"
22 #include "delalloc-space.h"
23 #include "block-group.h"
26 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
27 #define MAX_CACHE_BYTES_PER_GIG SZ_32K
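/*
 * Editor's note, a worked example of the sizes above (an assumption for
 * illustration: 4KiB pages and a 4KiB sectorsize, the common x86-64 setup):
 * BITS_PER_BITMAP is 4096 * 8 = 32768 bits, and since ctl->unit is the
 * sectorsize, one bitmap entry covers 32768 * 4096 bytes = 128MiB of a block
 * group.  MAX_CACHE_BYTES_PER_GIG (32KiB) is the in-memory budget per 1GiB of
 * space that recalculate_thresholds() works against below.
 */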
29 struct btrfs_trim_range {
32 struct list_head list;
35 static int link_free_space(struct btrfs_free_space_ctl *ctl,
36 struct btrfs_free_space *info);
37 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
38 struct btrfs_free_space *info);
39 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
40 struct btrfs_trans_handle *trans,
41 struct btrfs_io_ctl *io_ctl,
42 struct btrfs_path *path);
44 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
45 struct btrfs_path *path,
48 struct btrfs_fs_info *fs_info = root->fs_info;
50 struct btrfs_key location;
51 struct btrfs_disk_key disk_key;
52 struct btrfs_free_space_header *header;
53 struct extent_buffer *leaf;
54 struct inode *inode = NULL;
58 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
62 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
66 btrfs_release_path(path);
67 return ERR_PTR(-ENOENT);
70 leaf = path->nodes[0];
71 header = btrfs_item_ptr(leaf, path->slots[0],
72 struct btrfs_free_space_header);
73 btrfs_free_space_key(leaf, header, &disk_key);
74 btrfs_disk_key_to_cpu(&location, &disk_key);
75 btrfs_release_path(path);
78 * We are often under a trans handle at this point, so we need to make
79 * sure NOFS is set to keep us from deadlocking.
81 nofs_flag = memalloc_nofs_save();
82 inode = btrfs_iget_path(fs_info->sb, &location, root, path);
83 btrfs_release_path(path);
84 memalloc_nofs_restore(nofs_flag);
88 mapping_set_gfp_mask(inode->i_mapping,
89 mapping_gfp_constraint(inode->i_mapping,
90 ~(__GFP_FS | __GFP_HIGHMEM)));
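/*
 * Editor's note (reasoning inferred from how the cache pages are used, not
 * stated in the original): the gfp mask is restricted because these pages are
 * accessed through page_address() in the io_ctl helpers, so they must not be
 * allocated from highmem, and because we are typically under a transaction
 * handle here, reclaim for this mapping must not recurse back into the
 * filesystem (__GFP_FS).
 */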
95 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
96 struct btrfs_path *path)
98 struct btrfs_fs_info *fs_info = block_group->fs_info;
99 struct inode *inode = NULL;
100 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
102 spin_lock(&block_group->lock);
103 if (block_group->inode)
104 inode = igrab(block_group->inode);
105 spin_unlock(&block_group->lock);
109 inode = __lookup_free_space_inode(fs_info->tree_root, path,
114 spin_lock(&block_group->lock);
115 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
116 btrfs_info(fs_info, "Old style space inode found, converting.");
117 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
118 BTRFS_INODE_NODATACOW;
119 block_group->disk_cache_state = BTRFS_DC_CLEAR;
122 if (!block_group->iref) {
123 block_group->inode = igrab(inode);
124 block_group->iref = 1;
126 spin_unlock(&block_group->lock);
131 static int __create_free_space_inode(struct btrfs_root *root,
132 struct btrfs_trans_handle *trans,
133 struct btrfs_path *path,
136 struct btrfs_key key;
137 struct btrfs_disk_key disk_key;
138 struct btrfs_free_space_header *header;
139 struct btrfs_inode_item *inode_item;
140 struct extent_buffer *leaf;
141 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
144 ret = btrfs_insert_empty_inode(trans, root, path, ino);
148 /* We inline crcs for the free disk space cache */
149 if (ino != BTRFS_FREE_INO_OBJECTID)
150 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
152 leaf = path->nodes[0];
153 inode_item = btrfs_item_ptr(leaf, path->slots[0],
154 struct btrfs_inode_item);
155 btrfs_item_key(leaf, &disk_key, path->slots[0]);
156 memzero_extent_buffer(leaf, (unsigned long)inode_item,
157 sizeof(*inode_item));
158 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
159 btrfs_set_inode_size(leaf, inode_item, 0);
160 btrfs_set_inode_nbytes(leaf, inode_item, 0);
161 btrfs_set_inode_uid(leaf, inode_item, 0);
162 btrfs_set_inode_gid(leaf, inode_item, 0);
163 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
164 btrfs_set_inode_flags(leaf, inode_item, flags);
165 btrfs_set_inode_nlink(leaf, inode_item, 1);
166 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
167 btrfs_set_inode_block_group(leaf, inode_item, offset);
168 btrfs_mark_buffer_dirty(leaf);
169 btrfs_release_path(path);
171 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
174 ret = btrfs_insert_empty_item(trans, root, path, &key,
175 sizeof(struct btrfs_free_space_header));
177 btrfs_release_path(path);
181 leaf = path->nodes[0];
182 header = btrfs_item_ptr(leaf, path->slots[0],
183 struct btrfs_free_space_header);
184 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
185 btrfs_set_free_space_key(leaf, header, &disk_key);
186 btrfs_mark_buffer_dirty(leaf);
187 btrfs_release_path(path);
192 int create_free_space_inode(struct btrfs_trans_handle *trans,
193 struct btrfs_block_group *block_group,
194 struct btrfs_path *path)
199 ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
203 return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
204 ino, block_group->start);
207 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
208 struct btrfs_block_rsv *rsv)
213 /* 1 for slack space, 1 for updating the inode */
214 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
215 btrfs_calc_metadata_size(fs_info, 1);
217 spin_lock(&rsv->lock);
218 if (rsv->reserved < needed_bytes)
222 spin_unlock(&rsv->lock);
226 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
227 struct btrfs_block_group *block_group,
230 struct btrfs_root *root = BTRFS_I(inode)->root;
235 struct btrfs_path *path = btrfs_alloc_path();
242 mutex_lock(&trans->transaction->cache_write_mutex);
243 if (!list_empty(&block_group->io_list)) {
244 list_del_init(&block_group->io_list);
246 btrfs_wait_cache_io(trans, block_group, path);
247 btrfs_put_block_group(block_group);
251 * now that we've truncated the cache away, it's no longer
254 spin_lock(&block_group->lock);
255 block_group->disk_cache_state = BTRFS_DC_CLEAR;
256 spin_unlock(&block_group->lock);
257 btrfs_free_path(path);
260 btrfs_i_size_write(BTRFS_I(inode), 0);
261 truncate_pagecache(inode, 0);
264 * We skip the throttling logic for free space cache inodes, so we don't
265 * need to check for -EAGAIN.
267 ret = btrfs_truncate_inode_items(trans, root, inode,
268 0, BTRFS_EXTENT_DATA_KEY);
272 ret = btrfs_update_inode(trans, root, inode);
276 mutex_unlock(&trans->transaction->cache_write_mutex);
278 btrfs_abort_transaction(trans, ret);
283 static void readahead_cache(struct inode *inode)
285 struct file_ra_state *ra;
286 unsigned long last_index;
288 ra = kzalloc(sizeof(*ra), GFP_NOFS);
292 file_ra_state_init(ra, inode->i_mapping);
293 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
295 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
300 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
306 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
308 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
311 /* Make sure we can fit our crcs and generation into the first page */
312 if (write && check_crcs &&
313 (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
316 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
318 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
322 io_ctl->num_pages = num_pages;
323 io_ctl->fs_info = btrfs_sb(inode->i_sb);
324 io_ctl->check_crcs = check_crcs;
325 io_ctl->inode = inode;
329 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
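/*
 * Editor's note, a worked check of the "fit in the first page" test above,
 * assuming 4KiB pages: each page needs a u32 crc and the header also stores a
 * u64 generation, so num_pages * 4 + 8 must not exceed 4096.  That allows at
 * most 1022 pages, i.e. a cache file of roughly 4MiB, when writing with
 * checksums enabled.
 */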
331 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
333 kfree(io_ctl->pages);
334 io_ctl->pages = NULL;
337 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
345 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
347 ASSERT(io_ctl->index < io_ctl->num_pages);
348 io_ctl->page = io_ctl->pages[io_ctl->index++];
349 io_ctl->cur = page_address(io_ctl->page);
350 io_ctl->orig = io_ctl->cur;
351 io_ctl->size = PAGE_SIZE;
353 clear_page(io_ctl->cur);
356 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
360 io_ctl_unmap_page(io_ctl);
362 for (i = 0; i < io_ctl->num_pages; i++) {
363 if (io_ctl->pages[i]) {
364 ClearPageChecked(io_ctl->pages[i]);
365 unlock_page(io_ctl->pages[i]);
366 put_page(io_ctl->pages[i]);
371 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
375 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
378 for (i = 0; i < io_ctl->num_pages; i++) {
379 page = find_or_create_page(inode->i_mapping, i, mask);
381 io_ctl_drop_pages(io_ctl);
384 io_ctl->pages[i] = page;
385 if (uptodate && !PageUptodate(page)) {
386 btrfs_readpage(NULL, page);
388 if (page->mapping != inode->i_mapping) {
389 btrfs_err(BTRFS_I(inode)->root->fs_info,
390 "free space cache page truncated");
391 io_ctl_drop_pages(io_ctl);
394 if (!PageUptodate(page)) {
395 btrfs_err(BTRFS_I(inode)->root->fs_info,
396 "error reading free space cache");
397 io_ctl_drop_pages(io_ctl);
403 for (i = 0; i < io_ctl->num_pages; i++) {
404 clear_page_dirty_for_io(io_ctl->pages[i]);
405 set_page_extent_mapped(io_ctl->pages[i]);
411 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
415 io_ctl_map_page(io_ctl, 1);
418 * Skip the csum areas. If we don't check crcs then we just have a
419 * 64bit chunk at the front of the first page.
421 if (io_ctl->check_crcs) {
422 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
423 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
425 io_ctl->cur += sizeof(u64);
426 io_ctl->size -= sizeof(u64) * 2;
430 *val = cpu_to_le64(generation);
431 io_ctl->cur += sizeof(u64);
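/*
 * Editor's note on the layout implied above, for the usual check_crcs case:
 * page 0 of the cache file begins with an array of num_pages u32 checksums,
 * followed by the u64 generation written here, and only then the free space
 * entries.  Without check_crcs there is just the u64 generation at the front.
 */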
434 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
439 * Skip the crc area. If we don't check crcs then we just have a 64bit
440 * chunk at the front of the first page.
442 if (io_ctl->check_crcs) {
443 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
444 io_ctl->size -= sizeof(u64) +
445 (sizeof(u32) * io_ctl->num_pages);
447 io_ctl->cur += sizeof(u64);
448 io_ctl->size -= sizeof(u64) * 2;
452 if (le64_to_cpu(*gen) != generation) {
453 btrfs_err_rl(io_ctl->fs_info,
454 "space cache generation (%llu) does not match inode (%llu)",
456 io_ctl_unmap_page(io_ctl);
459 io_ctl->cur += sizeof(u64);
463 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
469 if (!io_ctl->check_crcs) {
470 io_ctl_unmap_page(io_ctl);
475 offset = sizeof(u32) * io_ctl->num_pages;
477 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
478 btrfs_crc32c_final(crc, (u8 *)&crc);
479 io_ctl_unmap_page(io_ctl);
480 tmp = page_address(io_ctl->pages[0]);
485 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
491 if (!io_ctl->check_crcs) {
492 io_ctl_map_page(io_ctl, 0);
497 offset = sizeof(u32) * io_ctl->num_pages;
499 tmp = page_address(io_ctl->pages[0]);
503 io_ctl_map_page(io_ctl, 0);
504 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
505 btrfs_crc32c_final(crc, (u8 *)&crc);
507 btrfs_err_rl(io_ctl->fs_info,
508 "csum mismatch on free space cache");
509 io_ctl_unmap_page(io_ctl);
516 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
519 struct btrfs_free_space_entry *entry;
525 entry->offset = cpu_to_le64(offset);
526 entry->bytes = cpu_to_le64(bytes);
527 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
528 BTRFS_FREE_SPACE_EXTENT;
529 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
530 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
532 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
535 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
537 /* No more pages to map */
538 if (io_ctl->index >= io_ctl->num_pages)
541 /* map the next page */
542 io_ctl_map_page(io_ctl, 1);
546 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
552 * If we aren't at the start of the current page, unmap this one and
553 * map the next one if there are any left.
555 if (io_ctl->cur != io_ctl->orig) {
556 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
557 if (io_ctl->index >= io_ctl->num_pages)
559 io_ctl_map_page(io_ctl, 0);
562 copy_page(io_ctl->cur, bitmap);
563 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
564 if (io_ctl->index < io_ctl->num_pages)
565 io_ctl_map_page(io_ctl, 0);
569 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
572 * If we're not on the boundary we know we've modified the page and we
573 * need to crc the page.
575 if (io_ctl->cur != io_ctl->orig)
576 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
578 io_ctl_unmap_page(io_ctl);
580 while (io_ctl->index < io_ctl->num_pages) {
581 io_ctl_map_page(io_ctl, 1);
582 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
586 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
587 struct btrfs_free_space *entry, u8 *type)
589 struct btrfs_free_space_entry *e;
593 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
599 entry->offset = le64_to_cpu(e->offset);
600 entry->bytes = le64_to_cpu(e->bytes);
602 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
603 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
605 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
608 io_ctl_unmap_page(io_ctl);
613 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
614 struct btrfs_free_space *entry)
618 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
622 copy_page(entry->bitmap, io_ctl->cur);
623 io_ctl_unmap_page(io_ctl);
629 * Since we attach pinned extents after the fact we can have contiguous sections
630 * of free space that are split up in entries. This poses a problem with the
631 * tree logging stuff since it could have allocated across what appears to be 2
632 * entries since we would have merged the entries when adding the pinned extents
633 * back to the free space cache. So run through the space cache that we just
634 * loaded and merge contiguous entries. This will make the log replay stuff not
635 * blow up and it will make for nicer allocator behavior.
637 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
639 struct btrfs_free_space *e, *prev = NULL;
643 spin_lock(&ctl->tree_lock);
644 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
645 e = rb_entry(n, struct btrfs_free_space, offset_index);
648 if (e->bitmap || prev->bitmap)
650 if (prev->offset + prev->bytes == e->offset) {
651 unlink_free_space(ctl, prev);
652 unlink_free_space(ctl, e);
653 prev->bytes += e->bytes;
654 kmem_cache_free(btrfs_free_space_cachep, e);
655 link_free_space(ctl, prev);
657 spin_unlock(&ctl->tree_lock);
663 spin_unlock(&ctl->tree_lock);
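/*
 * Editor's note, a small example of what merge_space_tree() repairs: if the
 * cache was loaded with two extent entries [0, 4KiB) and [4KiB, 8KiB), then
 * prev->offset + prev->bytes == e->offset, so they are collapsed into a
 * single [0, 8KiB) entry and log replay never sees an allocation straddling
 * two entries.
 */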
666 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
667 struct btrfs_free_space_ctl *ctl,
668 struct btrfs_path *path, u64 offset)
670 struct btrfs_fs_info *fs_info = root->fs_info;
671 struct btrfs_free_space_header *header;
672 struct extent_buffer *leaf;
673 struct btrfs_io_ctl io_ctl;
674 struct btrfs_key key;
675 struct btrfs_free_space *e, *n;
683 /* Nothing in the space cache, goodbye */
684 if (!i_size_read(inode))
687 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
695 btrfs_release_path(path);
701 leaf = path->nodes[0];
702 header = btrfs_item_ptr(leaf, path->slots[0],
703 struct btrfs_free_space_header);
704 num_entries = btrfs_free_space_entries(leaf, header);
705 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
706 generation = btrfs_free_space_generation(leaf, header);
707 btrfs_release_path(path);
709 if (!BTRFS_I(inode)->generation) {
711 "the free space cache file (%llu) is invalid, skip it",
716 if (BTRFS_I(inode)->generation != generation) {
718 "free space inode generation (%llu) did not match free space cache generation (%llu)",
719 BTRFS_I(inode)->generation, generation);
726 ret = io_ctl_init(&io_ctl, inode, 0);
730 readahead_cache(inode);
732 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
736 ret = io_ctl_check_crc(&io_ctl, 0);
740 ret = io_ctl_check_generation(&io_ctl, generation);
744 while (num_entries) {
745 e = kmem_cache_zalloc(btrfs_free_space_cachep,
750 ret = io_ctl_read_entry(&io_ctl, e, &type);
752 kmem_cache_free(btrfs_free_space_cachep, e);
757 * Sync discard ensures that the free space cache is always
758 * trimmed. So when reading this in, the state should reflect
759 * that. We also do this for async as a stop gap for lack of
762 if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
763 btrfs_test_opt(fs_info, DISCARD_ASYNC))
764 e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
767 kmem_cache_free(btrfs_free_space_cachep, e);
771 if (type == BTRFS_FREE_SPACE_EXTENT) {
772 spin_lock(&ctl->tree_lock);
773 ret = link_free_space(ctl, e);
774 spin_unlock(&ctl->tree_lock);
777 "Duplicate entries in free space cache, dumping");
778 kmem_cache_free(btrfs_free_space_cachep, e);
784 e->bitmap = kmem_cache_zalloc(
785 btrfs_free_space_bitmap_cachep, GFP_NOFS);
788 btrfs_free_space_cachep, e);
791 spin_lock(&ctl->tree_lock);
792 ret = link_free_space(ctl, e);
793 ctl->total_bitmaps++;
794 ctl->op->recalc_thresholds(ctl);
795 spin_unlock(&ctl->tree_lock);
798 "Duplicate entries in free space cache, dumping");
799 kmem_cache_free(btrfs_free_space_cachep, e);
802 list_add_tail(&e->list, &bitmaps);
808 io_ctl_unmap_page(&io_ctl);
811 * The on-disk bitmap data follows all of the entries, in the order the
812 * bitmap entries were added to the cache, so read it in now.
814 list_for_each_entry_safe(e, n, &bitmaps, list) {
815 list_del_init(&e->list);
816 ret = io_ctl_read_bitmap(&io_ctl, e);
821 io_ctl_drop_pages(&io_ctl);
822 merge_space_tree(ctl);
825 io_ctl_free(&io_ctl);
828 io_ctl_drop_pages(&io_ctl);
829 __btrfs_remove_free_space_cache(ctl);
833 int load_free_space_cache(struct btrfs_block_group *block_group)
835 struct btrfs_fs_info *fs_info = block_group->fs_info;
836 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
838 struct btrfs_path *path;
841 u64 used = block_group->used;
844 * If this block group has been marked to be cleared for one reason or
845 * another then we can't trust the on disk cache, so just return.
847 spin_lock(&block_group->lock);
848 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
849 spin_unlock(&block_group->lock);
852 spin_unlock(&block_group->lock);
854 path = btrfs_alloc_path();
857 path->search_commit_root = 1;
858 path->skip_locking = 1;
861 * We must pass a path with search_commit_root set to btrfs_iget in
862 * order to avoid a deadlock when allocating extents for the tree root.
864 * When we are COWing an extent buffer from the tree root, when looking
865 * for a free extent, at extent-tree.c:find_free_extent(), we can find a
866 * block group without its free space cache loaded. When we find one
867 * we must load its space cache which requires reading its free space
868 * cache's inode item from the root tree. If this inode item is located
869 * in the same leaf that we started COWing before, then we end up in
870 * deadlock on the extent buffer (trying to read lock it when we
871 * previously write locked it).
873 * It's safe to read the inode item using the commit root because
874 * block groups, once loaded, stay in memory forever (until they are
875 * removed) as well as their space caches once loaded. New block groups
876 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
877 * we will never try to read their inode item while the fs is mounted.
879 inode = lookup_free_space_inode(block_group, path);
881 btrfs_free_path(path);
885 /* We may have converted the inode and made the cache invalid. */
886 spin_lock(&block_group->lock);
887 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
888 spin_unlock(&block_group->lock);
889 btrfs_free_path(path);
892 spin_unlock(&block_group->lock);
894 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
895 path, block_group->start);
896 btrfs_free_path(path);
900 spin_lock(&ctl->tree_lock);
901 matched = (ctl->free_space == (block_group->length - used -
902 block_group->bytes_super));
903 spin_unlock(&ctl->tree_lock);
906 __btrfs_remove_free_space_cache(ctl);
908 "block group %llu has wrong amount of free space",
914 /* This cache is bogus, make sure it gets cleared */
915 spin_lock(&block_group->lock);
916 block_group->disk_cache_state = BTRFS_DC_CLEAR;
917 spin_unlock(&block_group->lock);
921 "failed to load free space cache for block group %llu, rebuilding it now",
929 static noinline_for_stack
930 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
931 struct btrfs_free_space_ctl *ctl,
932 struct btrfs_block_group *block_group,
933 int *entries, int *bitmaps,
934 struct list_head *bitmap_list)
937 struct btrfs_free_cluster *cluster = NULL;
938 struct btrfs_free_cluster *cluster_locked = NULL;
939 struct rb_node *node = rb_first(&ctl->free_space_offset);
940 struct btrfs_trim_range *trim_entry;
942 /* Get the cluster for this block_group if it exists */
943 if (block_group && !list_empty(&block_group->cluster_list)) {
944 cluster = list_entry(block_group->cluster_list.next,
945 struct btrfs_free_cluster,
949 if (!node && cluster) {
950 cluster_locked = cluster;
951 spin_lock(&cluster_locked->lock);
952 node = rb_first(&cluster->root);
956 /* Write out the extent entries */
958 struct btrfs_free_space *e;
960 e = rb_entry(node, struct btrfs_free_space, offset_index);
963 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
969 list_add_tail(&e->list, bitmap_list);
972 node = rb_next(node);
973 if (!node && cluster) {
974 node = rb_first(&cluster->root);
975 cluster_locked = cluster;
976 spin_lock(&cluster_locked->lock);
980 if (cluster_locked) {
981 spin_unlock(&cluster_locked->lock);
982 cluster_locked = NULL;
986 * Make sure we don't miss any range that was removed from our rbtree
987 * because trimming is running. Otherwise after a umount+mount (or crash
988 * after committing the transaction) we would leak free space and get
989 * an inconsistent free space cache report from fsck.
991 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
992 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
993 trim_entry->bytes, NULL);
1002 spin_unlock(&cluster_locked->lock);
1006 static noinline_for_stack int
1007 update_cache_item(struct btrfs_trans_handle *trans,
1008 struct btrfs_root *root,
1009 struct inode *inode,
1010 struct btrfs_path *path, u64 offset,
1011 int entries, int bitmaps)
1013 struct btrfs_key key;
1014 struct btrfs_free_space_header *header;
1015 struct extent_buffer *leaf;
1018 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1019 key.offset = offset;
1022 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1024 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1025 EXTENT_DELALLOC, 0, 0, NULL);
1028 leaf = path->nodes[0];
1030 struct btrfs_key found_key;
1031 ASSERT(path->slots[0]);
1033 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1034 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1035 found_key.offset != offset) {
1036 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1037 inode->i_size - 1, EXTENT_DELALLOC, 0,
1039 btrfs_release_path(path);
1044 BTRFS_I(inode)->generation = trans->transid;
1045 header = btrfs_item_ptr(leaf, path->slots[0],
1046 struct btrfs_free_space_header);
1047 btrfs_set_free_space_entries(leaf, header, entries);
1048 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1049 btrfs_set_free_space_generation(leaf, header, trans->transid);
1050 btrfs_mark_buffer_dirty(leaf);
1051 btrfs_release_path(path);
1059 static noinline_for_stack int write_pinned_extent_entries(
1060 struct btrfs_block_group *block_group,
1061 struct btrfs_io_ctl *io_ctl,
1064 u64 start, extent_start, extent_end, len;
1065 struct extent_io_tree *unpin = NULL;
1072 * We want to add any pinned extents to our free space cache
1073 * so we don't leak the space
1075 * We shouldn't have switched the pinned extents yet so this is the
1078 unpin = block_group->fs_info->pinned_extents;
1080 start = block_group->start;
1082 while (start < block_group->start + block_group->length) {
1083 ret = find_first_extent_bit(unpin, start,
1084 &extent_start, &extent_end,
1085 EXTENT_DIRTY, NULL);
1089 /* This pinned extent is out of our range */
1090 if (extent_start >= block_group->start + block_group->length)
1093 extent_start = max(extent_start, start);
1094 extent_end = min(block_group->start + block_group->length,
1096 len = extent_end - extent_start;
1099 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1109 static noinline_for_stack int
1110 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1112 struct btrfs_free_space *entry, *next;
1115 /* Write out the bitmaps */
1116 list_for_each_entry_safe(entry, next, bitmap_list, list) {
1117 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1120 list_del_init(&entry->list);
1126 static int flush_dirty_cache(struct inode *inode)
1130 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1132 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1133 EXTENT_DELALLOC, 0, 0, NULL);
1138 static void noinline_for_stack
1139 cleanup_bitmap_list(struct list_head *bitmap_list)
1141 struct btrfs_free_space *entry, *next;
1143 list_for_each_entry_safe(entry, next, bitmap_list, list)
1144 list_del_init(&entry->list);
1147 static void noinline_for_stack
1148 cleanup_write_cache_enospc(struct inode *inode,
1149 struct btrfs_io_ctl *io_ctl,
1150 struct extent_state **cached_state)
1152 io_ctl_drop_pages(io_ctl);
1153 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1154 i_size_read(inode) - 1, cached_state);
1157 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1158 struct btrfs_trans_handle *trans,
1159 struct btrfs_block_group *block_group,
1160 struct btrfs_io_ctl *io_ctl,
1161 struct btrfs_path *path, u64 offset)
1164 struct inode *inode = io_ctl->inode;
1169 /* Flush the dirty pages in the cache file. */
1170 ret = flush_dirty_cache(inode);
1174 /* Update the cache item to tell everyone this cache file is valid. */
1175 ret = update_cache_item(trans, root, inode, path, offset,
1176 io_ctl->entries, io_ctl->bitmaps);
1178 io_ctl_free(io_ctl);
1180 invalidate_inode_pages2(inode->i_mapping);
1181 BTRFS_I(inode)->generation = 0;
1184 btrfs_err(root->fs_info,
1185 "failed to write free space cache for block group %llu",
1186 block_group->start);
1190 btrfs_update_inode(trans, root, inode);
1193 /* the dirty list is protected by the dirty_bgs_lock */
1194 spin_lock(&trans->transaction->dirty_bgs_lock);
1196 /* the disk_cache_state is protected by the block group lock */
1197 spin_lock(&block_group->lock);
1200 * only mark this as written if we didn't get put back on
1201 * the dirty list while waiting for IO. Otherwise our
1202 * cache state won't be right, and we won't get written again
1204 if (!ret && list_empty(&block_group->dirty_list))
1205 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1207 block_group->disk_cache_state = BTRFS_DC_ERROR;
1209 spin_unlock(&block_group->lock);
1210 spin_unlock(&trans->transaction->dirty_bgs_lock);
1211 io_ctl->inode = NULL;
1219 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1220 struct btrfs_trans_handle *trans,
1221 struct btrfs_io_ctl *io_ctl,
1222 struct btrfs_path *path)
1224 return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1227 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1228 struct btrfs_block_group *block_group,
1229 struct btrfs_path *path)
1231 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1232 block_group, &block_group->io_ctl,
1233 path, block_group->start);
1237 * __btrfs_write_out_cache - write out cached info to an inode
1238 * @root: the root the inode belongs to
1239 * @ctl: the free space cache we are going to write out
1240 * @block_group: the block_group for this cache if it belongs to a block_group
1241 * @trans: the trans handle
1243 * This function writes out a free space cache struct to disk for quick recovery
1244 * on mount. This will return 0 if it was successful in writing the cache out,
1245 * or an errno if it was not.
1247 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1248 struct btrfs_free_space_ctl *ctl,
1249 struct btrfs_block_group *block_group,
1250 struct btrfs_io_ctl *io_ctl,
1251 struct btrfs_trans_handle *trans)
1253 struct extent_state *cached_state = NULL;
1254 LIST_HEAD(bitmap_list);
1260 if (!i_size_read(inode))
1263 WARN_ON(io_ctl->pages);
1264 ret = io_ctl_init(io_ctl, inode, 1);
1268 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1269 down_write(&block_group->data_rwsem);
1270 spin_lock(&block_group->lock);
1271 if (block_group->delalloc_bytes) {
1272 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1273 spin_unlock(&block_group->lock);
1274 up_write(&block_group->data_rwsem);
1275 BTRFS_I(inode)->generation = 0;
1280 spin_unlock(&block_group->lock);
1283 /* Lock all pages first so we can lock the extent safely. */
1284 ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1288 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1291 io_ctl_set_generation(io_ctl, trans->transid);
1293 mutex_lock(&ctl->cache_writeout_mutex);
1294 /* Write out the extent entries in the free space cache */
1295 spin_lock(&ctl->tree_lock);
1296 ret = write_cache_extent_entries(io_ctl, ctl,
1297 block_group, &entries, &bitmaps,
1300 goto out_nospc_locked;
1303 * Some space that is freed in the current transaction is pinned; it will
1304 * be added back into the free space cache after the transaction is
1305 * committed, so we shouldn't lose it.
1307 * If this changes while we are working we'll get added back to
1308 * the dirty list and redo it. No locking needed
1310 ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
1312 goto out_nospc_locked;
1315 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1316 * locked while doing it because a concurrent trim can be manipulating
1317 * or freeing the bitmap.
1319 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1320 spin_unlock(&ctl->tree_lock);
1321 mutex_unlock(&ctl->cache_writeout_mutex);
1325 /* Zero out the rest of the pages just to make sure */
1326 io_ctl_zero_remaining_pages(io_ctl);
1328 /* Everything is written out, now we dirty the pages in the file. */
1329 ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1330 i_size_read(inode), &cached_state);
1334 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1335 up_write(&block_group->data_rwsem);
1337 * Release the pages and unlock the extent, we will flush
1340 io_ctl_drop_pages(io_ctl);
1342 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1343 i_size_read(inode) - 1, &cached_state);
1346 * At this point the pages are under IO and we're happy.
1347 * The caller is responsible for waiting on them and updating
1348 * the cache and the inode.
1350 io_ctl->entries = entries;
1351 io_ctl->bitmaps = bitmaps;
1353 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1360 io_ctl->inode = NULL;
1361 io_ctl_free(io_ctl);
1363 invalidate_inode_pages2(inode->i_mapping);
1364 BTRFS_I(inode)->generation = 0;
1366 btrfs_update_inode(trans, root, inode);
1372 cleanup_bitmap_list(&bitmap_list);
1373 spin_unlock(&ctl->tree_lock);
1374 mutex_unlock(&ctl->cache_writeout_mutex);
1377 cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1380 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1381 up_write(&block_group->data_rwsem);
1386 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1387 struct btrfs_block_group *block_group,
1388 struct btrfs_path *path)
1390 struct btrfs_fs_info *fs_info = trans->fs_info;
1391 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1392 struct inode *inode;
1395 spin_lock(&block_group->lock);
1396 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1397 spin_unlock(&block_group->lock);
1400 spin_unlock(&block_group->lock);
1402 inode = lookup_free_space_inode(block_group, path);
1406 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1407 block_group, &block_group->io_ctl, trans);
1411 "failed to write free space cache for block group %llu",
1412 block_group->start);
1414 spin_lock(&block_group->lock);
1415 block_group->disk_cache_state = BTRFS_DC_ERROR;
1416 spin_unlock(&block_group->lock);
1418 block_group->io_ctl.inode = NULL;
1423 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1424 * to wait for IO and put the inode
1430 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1433 ASSERT(offset >= bitmap_start);
1434 offset -= bitmap_start;
1435 return (unsigned long)(div_u64(offset, unit));
1438 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1440 return (unsigned long)(div_u64(bytes, unit));
1443 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1447 u64 bytes_per_bitmap;
1449 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1450 bitmap_start = offset - ctl->start;
1451 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1452 bitmap_start *= bytes_per_bitmap;
1453 bitmap_start += ctl->start;
1455 return bitmap_start;
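/*
 * Editor's note, a worked example for offset_to_bitmap(), assuming a 4KiB
 * unit and ctl->start at 1GiB: bytes_per_bitmap is 32768 * 4096 = 128MiB, so
 * an offset of 1GiB + 200MiB rounds down to the bitmap window starting at
 * 1GiB + 128MiB.  offset_to_bit() and bytes_to_bits() then work in 4KiB
 * units inside that window.
 */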
1458 static int tree_insert_offset(struct rb_root *root, u64 offset,
1459 struct rb_node *node, int bitmap)
1461 struct rb_node **p = &root->rb_node;
1462 struct rb_node *parent = NULL;
1463 struct btrfs_free_space *info;
1467 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1469 if (offset < info->offset) {
1471 } else if (offset > info->offset) {
1472 p = &(*p)->rb_right;
1475 * we could have a bitmap entry and an extent entry
1476 * share the same offset. If this is the case, we want
1477 * the extent entry to always be found first if we do a
1478 * linear search through the tree, since we want to have
1479 * the quickest allocation time, and allocating from an
1480 * extent is faster than allocating from a bitmap. So
1481 * if we're inserting a bitmap and we find an entry at
1482 * this offset, we want to go right, or after this entry
1483 * logically. If we are inserting an extent and we've
1484 * found a bitmap, we want to go left, or before
1492 p = &(*p)->rb_right;
1494 if (!info->bitmap) {
1503 rb_link_node(node, parent, p);
1504 rb_insert_color(node, root);
1510 * searches the tree for the given offset.
1512 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1513 * want a section of at least 'bytes' in size that comes at or after the given
1516 static struct btrfs_free_space *
1517 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1518 u64 offset, int bitmap_only, int fuzzy)
1520 struct rb_node *n = ctl->free_space_offset.rb_node;
1521 struct btrfs_free_space *entry, *prev = NULL;
1523 /* find entry that is closest to the 'offset' */
1530 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1533 if (offset < entry->offset)
1535 else if (offset > entry->offset)
1548 * A bitmap entry and an extent entry may share the same offset; in that
1549 * case, the bitmap entry comes after the extent entry.
1554 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1555 if (entry->offset != offset)
1558 WARN_ON(!entry->bitmap);
1561 if (entry->bitmap) {
1563 * if previous extent entry covers the offset,
1564 * we should return it instead of the bitmap entry
1566 n = rb_prev(&entry->offset_index);
1568 prev = rb_entry(n, struct btrfs_free_space,
1570 if (!prev->bitmap &&
1571 prev->offset + prev->bytes > offset)
1581 /* find last entry before the 'offset' */
1583 if (entry->offset > offset) {
1584 n = rb_prev(&entry->offset_index);
1586 entry = rb_entry(n, struct btrfs_free_space,
1588 ASSERT(entry->offset <= offset);
1597 if (entry->bitmap) {
1598 n = rb_prev(&entry->offset_index);
1600 prev = rb_entry(n, struct btrfs_free_space,
1602 if (!prev->bitmap &&
1603 prev->offset + prev->bytes > offset)
1606 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1608 } else if (entry->offset + entry->bytes > offset)
1615 if (entry->bitmap) {
1616 if (entry->offset + BITS_PER_BITMAP *
1620 if (entry->offset + entry->bytes > offset)
1624 n = rb_next(&entry->offset_index);
1627 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1633 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1634 struct btrfs_free_space *info)
1636 rb_erase(&info->offset_index, &ctl->free_space_offset);
1637 ctl->free_extents--;
1640 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1641 struct btrfs_free_space *info)
1643 __unlink_free_space(ctl, info);
1644 ctl->free_space -= info->bytes;
1647 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1648 struct btrfs_free_space *info)
1652 ASSERT(info->bytes || info->bitmap);
1653 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1654 &info->offset_index, (info->bitmap != NULL));
1658 ctl->free_space += info->bytes;
1659 ctl->free_extents++;
1663 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1665 struct btrfs_block_group *block_group = ctl->private;
1669 u64 size = block_group->length;
1670 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1671 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1673 max_bitmaps = max_t(u64, max_bitmaps, 1);
1675 ASSERT(ctl->total_bitmaps <= max_bitmaps);
1678 * The goal is to keep the total amount of memory used per 1gb of space
1679 * at or below 32k, so we need to adjust how much memory we allow to be
1680 * used by extent based free space tracking
1683 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1685 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1688 * we want to account for 1 more bitmap than what we have so we can make
1689 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1690 * we add more bitmaps.
1692 bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1694 if (bitmap_bytes >= max_bytes) {
1695 ctl->extents_thresh = 0;
1700 * we want the extent entry threshold to use at most 1/2 the max bytes we
1701 * can have, or whatever is left over after the bitmaps, whichever is smaller.
1703 extent_bytes = max_bytes - bitmap_bytes;
1704 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1706 ctl->extents_thresh =
1707 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
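/*
 * Editor's note, a worked example for the thresholds above, assuming a 1GiB
 * block group, a 4KiB unit and one existing bitmap: max_bytes is 32KiB,
 * bitmap_bytes is (1 + 1) * 4096 = 8KiB, extent_bytes is
 * min(32KiB - 8KiB, 32KiB / 2) = 16KiB, and extents_thresh becomes
 * 16KiB / sizeof(struct btrfs_free_space) extent entries.
 */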
1710 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1711 struct btrfs_free_space *info,
1712 u64 offset, u64 bytes)
1714 unsigned long start, count;
1716 start = offset_to_bit(info->offset, ctl->unit, offset);
1717 count = bytes_to_bits(bytes, ctl->unit);
1718 ASSERT(start + count <= BITS_PER_BITMAP);
1720 bitmap_clear(info->bitmap, start, count);
1722 info->bytes -= bytes;
1723 if (info->max_extent_size > ctl->unit)
1724 info->max_extent_size = 0;
1727 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1728 struct btrfs_free_space *info, u64 offset,
1731 __bitmap_clear_bits(ctl, info, offset, bytes);
1732 ctl->free_space -= bytes;
1735 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1736 struct btrfs_free_space *info, u64 offset,
1739 unsigned long start, count;
1741 start = offset_to_bit(info->offset, ctl->unit, offset);
1742 count = bytes_to_bits(bytes, ctl->unit);
1743 ASSERT(start + count <= BITS_PER_BITMAP);
1745 bitmap_set(info->bitmap, start, count);
1747 info->bytes += bytes;
1748 ctl->free_space += bytes;
1752 * If we cannot find a suitable extent, use *bytes to record the size of
1753 * the largest free extent we did find.
1755 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1756 struct btrfs_free_space *bitmap_info, u64 *offset,
1757 u64 *bytes, bool for_alloc)
1759 unsigned long found_bits = 0;
1760 unsigned long max_bits = 0;
1761 unsigned long bits, i;
1762 unsigned long next_zero;
1763 unsigned long extent_bits;
1766 * Skip searching the bitmap if we don't have a contiguous section that
1767 * is large enough for this allocation.
1770 bitmap_info->max_extent_size &&
1771 bitmap_info->max_extent_size < *bytes) {
1772 *bytes = bitmap_info->max_extent_size;
1776 i = offset_to_bit(bitmap_info->offset, ctl->unit,
1777 max_t(u64, *offset, bitmap_info->offset));
1778 bits = bytes_to_bits(*bytes, ctl->unit);
1780 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1781 if (for_alloc && bits == 1) {
1785 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1786 BITS_PER_BITMAP, i);
1787 extent_bits = next_zero - i;
1788 if (extent_bits >= bits) {
1789 found_bits = extent_bits;
1791 } else if (extent_bits > max_bits) {
1792 max_bits = extent_bits;
1798 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1799 *bytes = (u64)(found_bits) * ctl->unit;
1803 *bytes = (u64)(max_bits) * ctl->unit;
1804 bitmap_info->max_extent_size = *bytes;
1808 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1811 return entry->max_extent_size;
1812 return entry->bytes;
1815 /* Cache the size of the max extent in bytes */
1816 static struct btrfs_free_space *
1817 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1818 unsigned long align, u64 *max_extent_size)
1820 struct btrfs_free_space *entry;
1821 struct rb_node *node;
1826 if (!ctl->free_space_offset.rb_node)
1829 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1833 for (node = &entry->offset_index; node; node = rb_next(node)) {
1834 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1835 if (entry->bytes < *bytes) {
1836 *max_extent_size = max(get_max_extent_size(entry),
1841 /* make sure the space returned is big enough
1842 * to match our requested alignment
1844 if (*bytes >= align) {
1845 tmp = entry->offset - ctl->start + align - 1;
1846 tmp = div64_u64(tmp, align);
1847 tmp = tmp * align + ctl->start;
1848 align_off = tmp - entry->offset;
1851 tmp = entry->offset;
1854 if (entry->bytes < *bytes + align_off) {
1855 *max_extent_size = max(get_max_extent_size(entry),
1860 if (entry->bitmap) {
1863 ret = search_bitmap(ctl, entry, &tmp, &size, true);
1870 max(get_max_extent_size(entry),
1877 *bytes = entry->bytes - align_off;
1884 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1885 struct btrfs_free_space *info, u64 offset)
1887 info->offset = offset_to_bitmap(ctl, offset);
1889 INIT_LIST_HEAD(&info->list);
1890 link_free_space(ctl, info);
1891 ctl->total_bitmaps++;
1893 ctl->op->recalc_thresholds(ctl);
1896 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1897 struct btrfs_free_space *bitmap_info)
1899 unlink_free_space(ctl, bitmap_info);
1900 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1901 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1902 ctl->total_bitmaps--;
1903 ctl->op->recalc_thresholds(ctl);
1906 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1907 struct btrfs_free_space *bitmap_info,
1908 u64 *offset, u64 *bytes)
1911 u64 search_start, search_bytes;
1915 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1918 * We need to search for bits in this bitmap. This bitmap may only cover
1919 * part of the extent, thanks to how we add space, so we need to search
1920 * for as much of it as we can and clear that amount, and then go
1921 * searching for the next bit.
1923 search_start = *offset;
1924 search_bytes = ctl->unit;
1925 search_bytes = min(search_bytes, end - search_start + 1);
1926 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1928 if (ret < 0 || search_start != *offset)
1931 /* We may have found more bits than what we need */
1932 search_bytes = min(search_bytes, *bytes);
1934 /* Cannot clear past the end of the bitmap */
1935 search_bytes = min(search_bytes, end - search_start + 1);
1937 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1938 *offset += search_bytes;
1939 *bytes -= search_bytes;
1942 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1943 if (!bitmap_info->bytes)
1944 free_bitmap(ctl, bitmap_info);
1947 * no entry after this bitmap, but we still have bytes to
1948 * remove, so something has gone wrong.
1953 bitmap_info = rb_entry(next, struct btrfs_free_space,
1957 * if the next entry isn't a bitmap we need to return to let the
1958 * extent stuff do its work.
1960 if (!bitmap_info->bitmap)
1964 * Ok the next item is a bitmap, but it may not actually hold
1965 * the information for the rest of this free space stuff, so
1966 * look for it, and if we don't find it return so we can try
1967 * everything over again.
1969 search_start = *offset;
1970 search_bytes = ctl->unit;
1971 ret = search_bitmap(ctl, bitmap_info, &search_start,
1972 &search_bytes, false);
1973 if (ret < 0 || search_start != *offset)
1977 } else if (!bitmap_info->bytes)
1978 free_bitmap(ctl, bitmap_info);
1983 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1984 struct btrfs_free_space *info, u64 offset,
1985 u64 bytes, enum btrfs_trim_state trim_state)
1987 u64 bytes_to_set = 0;
1991 * This is a tradeoff to make bitmap trim state minimal. We mark the
1992 * whole bitmap untrimmed if at any point we add untrimmed regions.
1994 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED)
1995 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
1997 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1999 bytes_to_set = min(end - offset, bytes);
2001 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2004 * We set some bytes, we have no idea what the max extent size is
2007 info->max_extent_size = 0;
2009 return bytes_to_set;
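/*
 * Editor's note, an example of the clamping above, assuming a 4KiB unit and a
 * bitmap whose window starts at 128MiB (so it ends at 256MiB): adding
 * [250MiB, 260MiB) sets bits only for [250MiB, 256MiB) and returns 6MiB; the
 * caller in insert_into_bitmap() advances offset by that amount and continues
 * with the remaining 4MiB in the next bitmap.
 */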
2013 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2014 struct btrfs_free_space *info)
2016 struct btrfs_block_group *block_group = ctl->private;
2017 struct btrfs_fs_info *fs_info = block_group->fs_info;
2018 bool forced = false;
2020 #ifdef CONFIG_BTRFS_DEBUG
2021 if (btrfs_should_fragment_free_space(block_group))
2026 * If we are below the extents threshold then we can add this as an
2027 * extent, and don't have to deal with the bitmap
2029 if (!forced && ctl->free_extents < ctl->extents_thresh) {
2031 * If this block group has some small extents we don't want to
2032 * use up all of our free slots in the cache with them, we want
2033 * to reserve them for larger extents. However, if we have plenty
2034 * of cache left then go ahead and add them, no sense in adding
2035 * the overhead of a bitmap if we don't have to.
2037 if (info->bytes <= fs_info->sectorsize * 4) {
2038 if (ctl->free_extents * 2 <= ctl->extents_thresh)
2046 * The original block groups from mkfs can be really small, like 8
2047 * megabytes, so don't bother with a bitmap for those entries. However
2048 * some block groups can be smaller than what a bitmap would cover but
2049 * are still large enough that they could overflow the 32k memory limit,
2050 * so still allow those block groups to have a bitmap
2053 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
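/*
 * Editor's note, the numbers behind the checks above with a 4KiB sectorsize:
 * "small" extents are those of 16KiB (sectorsize * 4) or less, and they stay
 * as extent entries while no more than half of the extent slots are in use;
 * and a block group smaller than half of one bitmap's coverage (128MiB / 2 =
 * 64MiB) never uses bitmaps at all.
 */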
2059 static const struct btrfs_free_space_op free_space_op = {
2060 .recalc_thresholds = recalculate_thresholds,
2061 .use_bitmap = use_bitmap,
2064 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2065 struct btrfs_free_space *info)
2067 struct btrfs_free_space *bitmap_info;
2068 struct btrfs_block_group *block_group = NULL;
2070 u64 bytes, offset, bytes_added;
2071 enum btrfs_trim_state trim_state;
2074 bytes = info->bytes;
2075 offset = info->offset;
2076 trim_state = info->trim_state;
2078 if (!ctl->op->use_bitmap(ctl, info))
2081 if (ctl->op == &free_space_op)
2082 block_group = ctl->private;
2085 * Since we link bitmaps right into the cluster we need to see if we
2086 * have a cluster here, and if so and it has our bitmap we need to add
2087 * the free space to that bitmap.
2089 if (block_group && !list_empty(&block_group->cluster_list)) {
2090 struct btrfs_free_cluster *cluster;
2091 struct rb_node *node;
2092 struct btrfs_free_space *entry;
2094 cluster = list_entry(block_group->cluster_list.next,
2095 struct btrfs_free_cluster,
2097 spin_lock(&cluster->lock);
2098 node = rb_first(&cluster->root);
2100 spin_unlock(&cluster->lock);
2101 goto no_cluster_bitmap;
2104 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2105 if (!entry->bitmap) {
2106 spin_unlock(&cluster->lock);
2107 goto no_cluster_bitmap;
2110 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2111 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2113 bytes -= bytes_added;
2114 offset += bytes_added;
2116 spin_unlock(&cluster->lock);
2124 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2131 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2133 bytes -= bytes_added;
2134 offset += bytes_added;
2144 if (info && info->bitmap) {
2145 add_new_bitmap(ctl, info, offset);
2150 spin_unlock(&ctl->tree_lock);
2152 /* no pre-allocated info, allocate a new one */
2154 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2157 spin_lock(&ctl->tree_lock);
2163 /* allocate the bitmap */
2164 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2166 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2167 spin_lock(&ctl->tree_lock);
2168 if (!info->bitmap) {
2178 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2180 kmem_cache_free(btrfs_free_space_cachep, info);
2187 * Free space merging rules:
2188 * 1) Merge trimmed areas together
2189 * 2) Let untrimmed areas coalesce with trimmed areas
2190 * 3) Always pull neighboring regions from bitmaps
2192 * The above rules are for when we merge free space based on btrfs_trim_state.
2193 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2194 * same reason: to promote larger extent regions which makes life easier for
2195 * find_free_extent(). Rule 2 enables coalescing because the common path is
2196 * returning free space from btrfs_finish_extent_commit(). So when the newly
2197 * freed space is already trimmed, rule 2 prevents it from being aggregated
2198 * with untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
2199 * and provide find_free_extent() with the largest extents possible hoping for
2202 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2203 struct btrfs_free_space *info, bool update_stat)
2205 struct btrfs_free_space *left_info;
2206 struct btrfs_free_space *right_info;
2207 bool merged = false;
2208 u64 offset = info->offset;
2209 u64 bytes = info->bytes;
2210 const bool is_trimmed = btrfs_free_space_trimmed(info);
2213 * first we want to see if there is free space adjacent to the range we
2214 * are adding; if there is, remove that struct and add a new one to
2215 * cover the entire range
2217 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2218 if (right_info && rb_prev(&right_info->offset_index))
2219 left_info = rb_entry(rb_prev(&right_info->offset_index),
2220 struct btrfs_free_space, offset_index);
2222 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2224 /* See try_merge_free_space() comment. */
2225 if (right_info && !right_info->bitmap &&
2226 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2228 unlink_free_space(ctl, right_info);
2230 __unlink_free_space(ctl, right_info);
2231 info->bytes += right_info->bytes;
2232 kmem_cache_free(btrfs_free_space_cachep, right_info);
2236 /* See try_merge_free_space() comment. */
2237 if (left_info && !left_info->bitmap &&
2238 left_info->offset + left_info->bytes == offset &&
2239 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2241 unlink_free_space(ctl, left_info);
2243 __unlink_free_space(ctl, left_info);
2244 info->offset = left_info->offset;
2245 info->bytes += left_info->bytes;
2246 kmem_cache_free(btrfs_free_space_cachep, left_info);
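/*
 * Editor's note, an example of rule 2 from the merging comment above: if an
 * untrimmed 4KiB region is freed immediately after a trimmed extent ending at
 * the same offset, the two are merged and the surviving entry is untrimmed;
 * if instead the newly freed region is trimmed and its neighbour is
 * untrimmed, they are kept separate so the trimmed region keeps its state.
 */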
2253 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2254 struct btrfs_free_space *info,
2257 struct btrfs_free_space *bitmap;
2260 const u64 end = info->offset + info->bytes;
2261 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2264 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2268 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2269 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2272 bytes = (j - i) * ctl->unit;
2273 info->bytes += bytes;
2275 /* See try_merge_free_space() comment. */
2276 if (!btrfs_free_space_trimmed(bitmap))
2277 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2280 bitmap_clear_bits(ctl, bitmap, end, bytes);
2282 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2285 free_bitmap(ctl, bitmap);
2290 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2291 struct btrfs_free_space *info,
2294 struct btrfs_free_space *bitmap;
2298 unsigned long prev_j;
2301 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2302 /* If we're on a boundary, try the previous logical bitmap. */
2303 if (bitmap_offset == info->offset) {
2304 if (info->offset == 0)
2306 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2309 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2313 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2315 prev_j = (unsigned long)-1;
2316 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2324 if (prev_j == (unsigned long)-1)
2325 bytes = (i + 1) * ctl->unit;
2327 bytes = (i - prev_j) * ctl->unit;
2329 info->offset -= bytes;
2330 info->bytes += bytes;
2332 /* See try_merge_free_space() comment. */
2333 if (!btrfs_free_space_trimmed(bitmap))
2334 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2337 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2339 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2342 free_bitmap(ctl, bitmap);
2348 * We prefer always to allocate from extent entries, both for clustered and
2349 * non-clustered allocation requests. So when attempting to add a new extent
2350 * entry, try to see if there's adjacent free space in bitmap entries, and if
2351 * there is, migrate that space from the bitmaps to the extent.
2352 * Like this we get better chances of satisfying space allocation requests
2353 * because we attempt to satisfy them based on a single cache entry, and never
2354 * on 2 or more entries - even if the entries represent a contiguous free space
2355 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2358 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2359 struct btrfs_free_space *info,
2363 * Only work with disconnected entries, as we can change their offset,
2364 * and they must be extent entries.
2366 ASSERT(!info->bitmap);
2367 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2369 if (ctl->total_bitmaps > 0) {
2371 bool stole_front = false;
2373 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2374 if (ctl->total_bitmaps > 0)
2375 stole_front = steal_from_bitmap_to_front(ctl, info,
2378 if (stole_end || stole_front)
2379 try_merge_free_space(ctl, info, update_stat);
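/*
 * Editor's note, a hypothetical example of the stealing above: when adding an
 * extent entry [68MiB, 68MiB + 4KiB) and a bitmap covering that area has its
 * bits set for [68MiB + 4KiB, 68MiB + 16KiB), steal_from_bitmap_to_end()
 * clears those bits and grows the entry to [68MiB, 68MiB + 16KiB), so
 * find_free_extent() can later satisfy a 16KiB allocation from a single
 * entry.
 */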
2383 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2384 struct btrfs_free_space_ctl *ctl,
2385 u64 offset, u64 bytes,
2386 enum btrfs_trim_state trim_state)
2388 struct btrfs_block_group *block_group = ctl->private;
2389 struct btrfs_free_space *info;
2392 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2396 info->offset = offset;
2397 info->bytes = bytes;
2398 info->trim_state = trim_state;
2399 RB_CLEAR_NODE(&info->offset_index);
2401 spin_lock(&ctl->tree_lock);
2403 if (try_merge_free_space(ctl, info, true))
2407 * There was no extent directly to the left or right of this new
2408 * extent, so we know we're going to have to allocate a new extent entry;
2409 * before we do that, see if we need to drop this into a bitmap
2411 ret = insert_into_bitmap(ctl, info);
2420 * Only steal free space from adjacent bitmaps if we're sure we're not
2421 * going to add the new free space to existing bitmap entries - because
2422 * that would mean unnecessary work that would be reverted. Therefore
2423 * attempt to steal space from bitmaps if we're adding an extent entry.
2425 steal_from_bitmap(ctl, info, true);
2427 ret = link_free_space(ctl, info);
2429 kmem_cache_free(btrfs_free_space_cachep, info);
2431 spin_unlock(&ctl->tree_lock);
2434 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2435 ASSERT(ret != -EEXIST);
2438 if (trim_state != BTRFS_TRIM_STATE_TRIMMED)
2439 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2444 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2445 u64 bytenr, u64 size)
2447 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2449 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2450 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2452 return __btrfs_add_free_space(block_group->fs_info,
2453 block_group->free_space_ctl,
2454 bytenr, size, trim_state);
2458 * This is a subtle distinction because when adding free space back in general,
2459 * we want it to be added as untrimmed for async. But in the case where we add
2460 * it on loading of a block group, we want to consider it trimmed.
2462 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2463 u64 bytenr, u64 size)
2465 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2467 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2468 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2469 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2471 return __btrfs_add_free_space(block_group->fs_info,
2472 block_group->free_space_ctl,
2473 bytenr, size, trim_state);
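/*
 * Minimal sketch of the trim-state decision made by the two helpers above.
 * This is an illustrative stand-in, not the kernel API: the enum and the
 * boolean flags replace the real DISCARD_SYNC / DISCARD_ASYNC mount-option
 * checks.
 */
enum toy_trim_state { TOY_UNTRIMMED, TOY_TRIMMED };

static enum toy_trim_state toy_initial_trim_state(int discard_sync,
						  int discard_async,
						  int loading_block_group)
{
	/* Normal frees: only synchronous discard lets us call it trimmed. */
	if (!loading_block_group)
		return discard_sync ? TOY_TRIMMED : TOY_UNTRIMMED;

	/*
	 * While loading a block group, space that async discard will manage
	 * is also treated as trimmed, so it is not queued for discard again.
	 */
	return (discard_sync || discard_async) ? TOY_TRIMMED : TOY_UNTRIMMED;
}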
2476 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2477 u64 offset, u64 bytes)
2479 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2480 struct btrfs_free_space *info;
2482 bool re_search = false;
2484 spin_lock(&ctl->tree_lock);
2491 info = tree_search_offset(ctl, offset, 0, 0);
2494 * We didn't find an extent that matched the space we wanted
2495 * to remove, so look for a bitmap entry instead.
2497 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2501 * If we found a partial bit of our free space in a
2502 * bitmap but then couldn't find the other part this may
2503 * be a problem, so WARN about it.
2511 if (!info->bitmap) {
2512 unlink_free_space(ctl, info);
2513 if (offset == info->offset) {
2514 u64 to_free = min(bytes, info->bytes);
2516 info->bytes -= to_free;
2517 info->offset += to_free;
2519 ret = link_free_space(ctl, info);
2522 kmem_cache_free(btrfs_free_space_cachep, info);
2529 u64 old_end = info->bytes + info->offset;
2531 info->bytes = offset - info->offset;
2532 ret = link_free_space(ctl, info);
2537 /* Not enough bytes in this entry to satisfy us */
2538 if (old_end < offset + bytes) {
2539 bytes -= old_end - offset;
2542 } else if (old_end == offset + bytes) {
2546 spin_unlock(&ctl->tree_lock);
2548 ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2550 old_end - (offset + bytes),
2557 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2558 if (ret == -EAGAIN) {
2563 spin_unlock(&ctl->tree_lock);
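/*
 * Standalone sketch of the interval arithmetic used above when a removal
 * range falls inside an extent entry: keep the piece in front of the removed
 * range and hand the piece behind it back as new free space.  Names and
 * types are illustrative, not the kernel structures.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_piece { uint64_t offset, bytes; };

/* Split [entry_off, entry_off + entry_bytes) around removed [off, off + bytes). */
static void toy_split_around_removal(uint64_t entry_off, uint64_t entry_bytes,
				     uint64_t off, uint64_t bytes,
				     struct toy_piece *front, struct toy_piece *tail)
{
	uint64_t old_end = entry_off + entry_bytes;

	assert(off >= entry_off && off + bytes <= old_end);

	front->offset = entry_off;
	front->bytes  = off - entry_off;          /* may be 0 */
	tail->offset  = off + bytes;
	tail->bytes   = old_end - (off + bytes);  /* may be 0 */
}

int main(void)
{
	struct toy_piece front, tail;

	/* remove 64KiB out of the middle of a 1MiB extent at 16MiB */
	toy_split_around_removal(16 << 20, 1 << 20, (16 << 20) + 4096, 64 << 10,
				 &front, &tail);
	printf("front [%llu,+%llu) tail [%llu,+%llu)\n",
	       (unsigned long long)front.offset, (unsigned long long)front.bytes,
	       (unsigned long long)tail.offset, (unsigned long long)tail.bytes);
	return 0;
}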
2568 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2571 struct btrfs_fs_info *fs_info = block_group->fs_info;
2572 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2573 struct btrfs_free_space *info;
2577 spin_lock(&ctl->tree_lock);
2578 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2579 info = rb_entry(n, struct btrfs_free_space, offset_index);
2580 if (info->bytes >= bytes && !block_group->ro)
2582 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2583 info->offset, info->bytes,
2584 (info->bitmap) ? "yes" : "no");
2586 spin_unlock(&ctl->tree_lock);
2587 btrfs_info(fs_info, "block group has cluster?: %s",
2588 list_empty(&block_group->cluster_list) ? "no" : "yes");
2590 "%d blocks of free space at or larger than the requested size", count);
2593 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
2595 struct btrfs_fs_info *fs_info = block_group->fs_info;
2596 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2598 spin_lock_init(&ctl->tree_lock);
2599 ctl->unit = fs_info->sectorsize;
2600 ctl->start = block_group->start;
2601 ctl->private = block_group;
2602 ctl->op = &free_space_op;
2603 INIT_LIST_HEAD(&ctl->trimming_ranges);
2604 mutex_init(&ctl->cache_writeout_mutex);
2607 * we only want to have 32k of ram per block group for keeping
2608 * track of free space, and if we pass 1/2 of that we want to
2609 * start converting things over to using bitmaps
2611 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
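/*
 * Worked example of the threshold computed above, as a standalone sketch.
 * The struct below is a stand-in whose size only roughly approximates a
 * per-entry free space record on a 64-bit build (an assumption for
 * illustration); the point is the shape of the formula: half of 32KiB
 * divided by the per-entry size.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_free_space {		/* rough stand-in, not the kernel layout */
	uint64_t offset;
	uint64_t bytes;
	uint64_t max_extent_size;
	void *bitmap;
	void *rb_links[3];
	int flags;
};

int main(void)
{
	unsigned long threshold = (32768 / 2) / sizeof(struct toy_free_space);

	/* e.g. with a ~64-byte entry this allows roughly 256 extent entries */
	printf("extents_thresh ~= %lu entries\n", threshold);
	return 0;
}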
2615 * for a given cluster, put all of its extents back into the free
2616 * space cache. If the block group passed doesn't match the block group
2617 * pointed to by the cluster, someone else raced in and freed the
2618 * cluster already. In that case, we just return without changing anything
2621 __btrfs_return_cluster_to_free_space(
2622 struct btrfs_block_group *block_group,
2623 struct btrfs_free_cluster *cluster)
2625 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2626 struct btrfs_free_space *entry;
2627 struct rb_node *node;
2629 spin_lock(&cluster->lock);
2630 if (cluster->block_group != block_group)
2633 cluster->block_group = NULL;
2634 cluster->window_start = 0;
2635 list_del_init(&cluster->block_group_list);
2637 node = rb_first(&cluster->root);
2641 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2642 node = rb_next(&entry->offset_index);
2643 rb_erase(&entry->offset_index, &cluster->root);
2644 RB_CLEAR_NODE(&entry->offset_index);
2646 bitmap = (entry->bitmap != NULL);
2648 try_merge_free_space(ctl, entry, false);
2649 steal_from_bitmap(ctl, entry, false);
2651 tree_insert_offset(&ctl->free_space_offset,
2652 entry->offset, &entry->offset_index, bitmap);
2654 cluster->root = RB_ROOT;
2657 spin_unlock(&cluster->lock);
2658 btrfs_put_block_group(block_group);
2662 static void __btrfs_remove_free_space_cache_locked(
2663 struct btrfs_free_space_ctl *ctl)
2665 struct btrfs_free_space *info;
2666 struct rb_node *node;
2668 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2669 info = rb_entry(node, struct btrfs_free_space, offset_index);
2670 if (!info->bitmap) {
2671 unlink_free_space(ctl, info);
2672 kmem_cache_free(btrfs_free_space_cachep, info);
2674 free_bitmap(ctl, info);
2677 cond_resched_lock(&ctl->tree_lock);
2681 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2683 spin_lock(&ctl->tree_lock);
2684 __btrfs_remove_free_space_cache_locked(ctl);
2685 spin_unlock(&ctl->tree_lock);
2688 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2690 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2691 struct btrfs_free_cluster *cluster;
2692 struct list_head *head;
2694 spin_lock(&ctl->tree_lock);
2695 while ((head = block_group->cluster_list.next) !=
2696 &block_group->cluster_list) {
2697 cluster = list_entry(head, struct btrfs_free_cluster,
2700 WARN_ON(cluster->block_group != block_group);
2701 __btrfs_return_cluster_to_free_space(block_group, cluster);
2703 cond_resched_lock(&ctl->tree_lock);
2705 __btrfs_remove_free_space_cache_locked(ctl);
2706 spin_unlock(&ctl->tree_lock);
2711 * btrfs_is_free_space_trimmed - see if everything is trimmed
2712 * @block_group: block_group of interest
2714 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2716 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2718 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2719 struct btrfs_free_space *info;
2720 struct rb_node *node;
2723 spin_lock(&ctl->tree_lock);
2724 node = rb_first(&ctl->free_space_offset);
2727 info = rb_entry(node, struct btrfs_free_space, offset_index);
2729 if (!btrfs_free_space_trimmed(info)) {
2734 node = rb_next(node);
2737 spin_unlock(&ctl->tree_lock);
2741 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2742 u64 offset, u64 bytes, u64 empty_size,
2743 u64 *max_extent_size)
2745 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2746 struct btrfs_free_space *entry = NULL;
2747 u64 bytes_search = bytes + empty_size;
2750 u64 align_gap_len = 0;
2751 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2753 spin_lock(&ctl->tree_lock);
2754 entry = find_free_space(ctl, &offset, &bytes_search,
2755 block_group->full_stripe_len, max_extent_size);
2760 if (entry->bitmap) {
2761 bitmap_clear_bits(ctl, entry, offset, bytes);
2763 free_bitmap(ctl, entry);
2765 unlink_free_space(ctl, entry);
2766 align_gap_len = offset - entry->offset;
2767 align_gap = entry->offset;
2768 align_gap_trim_state = entry->trim_state;
2770 entry->offset = offset + bytes;
2771 WARN_ON(entry->bytes < bytes + align_gap_len);
2773 entry->bytes -= bytes + align_gap_len;
2775 kmem_cache_free(btrfs_free_space_cachep, entry);
2777 link_free_space(ctl, entry);
2780 spin_unlock(&ctl->tree_lock);
2783 __btrfs_add_free_space(block_group->fs_info, ctl,
2784 align_gap, align_gap_len,
2785 align_gap_trim_state);
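/*
 * Sketch of the alignment bookkeeping above: when the returned offset is
 * aligned up inside a free extent, the skipped gap at the front of the
 * entry is remembered and re-added as free space afterwards.  The alignment
 * value and names here are illustrative assumptions, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	uint64_t entry_offset  = 12288;		/* start of the free extent     */
	uint64_t align         = 65536;		/* e.g. a full-stripe alignment */
	uint64_t offset        = toy_align_up(entry_offset, align);
	uint64_t align_gap     = entry_offset;
	uint64_t align_gap_len = offset - entry_offset;

	/* the allocation starts at 'offset'; [align_gap, +align_gap_len) goes back */
	printf("alloc at %llu, gap [%llu,+%llu) returned to free space\n",
	       (unsigned long long)offset, (unsigned long long)align_gap,
	       (unsigned long long)align_gap_len);
	return 0;
}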
2790 * given a cluster, put all of its extents back into the free space
2791 * cache. If a block group is passed, this function will only free
2792 * a cluster that belongs to the passed block group.
2794 * Otherwise, it'll get a reference on the block group pointed to by the
2795 * cluster and remove the cluster from it.
2797 int btrfs_return_cluster_to_free_space(
2798 struct btrfs_block_group *block_group,
2799 struct btrfs_free_cluster *cluster)
2801 struct btrfs_free_space_ctl *ctl;
2804 /* first, get a safe pointer to the block group */
2805 spin_lock(&cluster->lock);
2807 block_group = cluster->block_group;
2809 spin_unlock(&cluster->lock);
2812 } else if (cluster->block_group != block_group) {
2813 /* someone else has already freed it, don't redo their work */
2814 spin_unlock(&cluster->lock);
2817 atomic_inc(&block_group->count);
2818 spin_unlock(&cluster->lock);
2820 ctl = block_group->free_space_ctl;
2822 /* now return any extents the cluster had on it */
2823 spin_lock(&ctl->tree_lock);
2824 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2825 spin_unlock(&ctl->tree_lock);
2827 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
2829 /* finally drop our ref */
2830 btrfs_put_block_group(block_group);
2834 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
2835 struct btrfs_free_cluster *cluster,
2836 struct btrfs_free_space *entry,
2837 u64 bytes, u64 min_start,
2838 u64 *max_extent_size)
2840 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2842 u64 search_start = cluster->window_start;
2843 u64 search_bytes = bytes;
2846 search_start = min_start;
2847 search_bytes = bytes;
2849 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2851 *max_extent_size = max(get_max_extent_size(entry),
2857 __bitmap_clear_bits(ctl, entry, ret, bytes);
2863 * given a cluster, try to allocate 'bytes' from it, returns 0
2864 * if it couldn't find anything suitably large, or a logical disk offset
2865 * if things worked out
2867 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
2868 struct btrfs_free_cluster *cluster, u64 bytes,
2869 u64 min_start, u64 *max_extent_size)
2871 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2872 struct btrfs_free_space *entry = NULL;
2873 struct rb_node *node;
2876 spin_lock(&cluster->lock);
2877 if (bytes > cluster->max_size)
2880 if (cluster->block_group != block_group)
2883 node = rb_first(&cluster->root);
2887 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2889 if (entry->bytes < bytes)
2890 *max_extent_size = max(get_max_extent_size(entry),
2893 if (entry->bytes < bytes ||
2894 (!entry->bitmap && entry->offset < min_start)) {
2895 node = rb_next(&entry->offset_index);
2898 entry = rb_entry(node, struct btrfs_free_space,
2903 if (entry->bitmap) {
2904 ret = btrfs_alloc_from_bitmap(block_group,
2905 cluster, entry, bytes,
2906 cluster->window_start,
2909 node = rb_next(&entry->offset_index);
2912 entry = rb_entry(node, struct btrfs_free_space,
2916 cluster->window_start += bytes;
2918 ret = entry->offset;
2920 entry->offset += bytes;
2921 entry->bytes -= bytes;
2924 if (entry->bytes == 0)
2925 rb_erase(&entry->offset_index, &cluster->root);
2929 spin_unlock(&cluster->lock);
2934 spin_lock(&ctl->tree_lock);
2936 ctl->free_space -= bytes;
2937 if (entry->bytes == 0) {
2938 ctl->free_extents--;
2939 if (entry->bitmap) {
2940 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2942 ctl->total_bitmaps--;
2943 ctl->op->recalc_thresholds(ctl);
2945 kmem_cache_free(btrfs_free_space_cachep, entry);
2948 spin_unlock(&ctl->tree_lock);
2953 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
2954 struct btrfs_free_space *entry,
2955 struct btrfs_free_cluster *cluster,
2956 u64 offset, u64 bytes,
2957 u64 cont1_bytes, u64 min_bytes)
2959 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2960 unsigned long next_zero;
2962 unsigned long want_bits;
2963 unsigned long min_bits;
2964 unsigned long found_bits;
2965 unsigned long max_bits = 0;
2966 unsigned long start = 0;
2967 unsigned long total_found = 0;
2970 i = offset_to_bit(entry->offset, ctl->unit,
2971 max_t(u64, offset, entry->offset));
2972 want_bits = bytes_to_bits(bytes, ctl->unit);
2973 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2976 * Don't bother looking for a cluster in this bitmap if it's heavily
2979 if (entry->max_extent_size &&
2980 entry->max_extent_size < cont1_bytes)
2984 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2985 next_zero = find_next_zero_bit(entry->bitmap,
2986 BITS_PER_BITMAP, i);
2987 if (next_zero - i >= min_bits) {
2988 found_bits = next_zero - i;
2989 if (found_bits > max_bits)
2990 max_bits = found_bits;
2993 if (next_zero - i > max_bits)
2994 max_bits = next_zero - i;
2999 entry->max_extent_size = (u64)max_bits * ctl->unit;
3005 cluster->max_size = 0;
3008 total_found += found_bits;
3010 if (cluster->max_size < found_bits * ctl->unit)
3011 cluster->max_size = found_bits * ctl->unit;
3013 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3018 cluster->window_start = start * ctl->unit + entry->offset;
3019 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3020 ret = tree_insert_offset(&cluster->root, entry->offset,
3021 &entry->offset_index, 1);
3022 ASSERT(!ret); /* -EEXIST; Logic error */
3024 trace_btrfs_setup_cluster(block_group, cluster,
3025 total_found * ctl->unit, 1);
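/*
 * Userspace sketch of the bit scanning done above: walk runs of set bits,
 * remember the largest run, and report the first run at least min_bits
 * long.  A single 64-bit word stands in for the kernel's page-sized bitmap
 * and the helper names are not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_BITS 64

static int toy_find_next_bit(uint64_t map, int from, int want_set)
{
	for (int i = from; i < TOY_BITS; i++)
		if ((((map >> i) & 1) != 0) == want_set)
			return i;
	return TOY_BITS;
}

/* Returns the start of the first run >= min_bits, or -1; *max_run gets the longest run. */
static int toy_find_run(uint64_t map, int min_bits, int *max_run)
{
	int found = -1;

	*max_run = 0;
	for (int i = toy_find_next_bit(map, 0, 1); i < TOY_BITS;
	     i = toy_find_next_bit(map, i, 1)) {
		int next_zero = toy_find_next_bit(map, i, 0);
		int run = next_zero - i;

		if (run > *max_run)
			*max_run = run;
		if (found < 0 && run >= min_bits)
			found = i;
		i = next_zero;
	}
	return found;
}

int main(void)
{
	uint64_t map = 0x0ff00f0ULL;	/* runs of 4 and 8 set bits */
	int max_run;
	int start = toy_find_run(map, 6, &max_run);

	printf("first run >= 6 bits starts at bit %d, longest run %d bits\n",
	       start, max_run);	/* expect bit 16, longest 8 */
	return 0;
}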
3030 * This searches the block group for just extents to fill the cluster with.
3031 * Try to find a cluster with at least bytes total bytes, at least one
3032 * extent of cont1_bytes, and other clusters of at least min_bytes.
3035 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3036 struct btrfs_free_cluster *cluster,
3037 struct list_head *bitmaps, u64 offset, u64 bytes,
3038 u64 cont1_bytes, u64 min_bytes)
3040 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3041 struct btrfs_free_space *first = NULL;
3042 struct btrfs_free_space *entry = NULL;
3043 struct btrfs_free_space *last;
3044 struct rb_node *node;
3049 entry = tree_search_offset(ctl, offset, 0, 1);
3054 * We don't want bitmaps, so just move along until we find a normal
3057 while (entry->bitmap || entry->bytes < min_bytes) {
3058 if (entry->bitmap && list_empty(&entry->list))
3059 list_add_tail(&entry->list, bitmaps);
3060 node = rb_next(&entry->offset_index);
3063 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3066 window_free = entry->bytes;
3067 max_extent = entry->bytes;
3071 for (node = rb_next(&entry->offset_index); node;
3072 node = rb_next(&entry->offset_index)) {
3073 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3075 if (entry->bitmap) {
3076 if (list_empty(&entry->list))
3077 list_add_tail(&entry->list, bitmaps);
3081 if (entry->bytes < min_bytes)
3085 window_free += entry->bytes;
3086 if (entry->bytes > max_extent)
3087 max_extent = entry->bytes;
3090 if (window_free < bytes || max_extent < cont1_bytes)
3093 cluster->window_start = first->offset;
3095 node = &first->offset_index;
3098 * now we've found our entries, pull them out of the free space
3099 * cache and put them into the cluster rbtree
3104 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3105 node = rb_next(&entry->offset_index);
3106 if (entry->bitmap || entry->bytes < min_bytes)
3109 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3110 ret = tree_insert_offset(&cluster->root, entry->offset,
3111 &entry->offset_index, 0);
3112 total_size += entry->bytes;
3113 ASSERT(!ret); /* -EEXIST; Logic error */
3114 } while (node && entry != last);
3116 cluster->max_size = max_extent;
3117 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
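/*
 * Sketch of the extent-only cluster window scan above: walk extents in
 * offset order, skip ones smaller than min_bytes, and accept once the
 * accumulated window holds 'bytes' total with at least one extent of
 * cont1_bytes.  Plain arrays stand in for the rbtree; this is illustration,
 * not the kernel logic verbatim.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_extent { uint64_t offset, bytes; };

static bool toy_cluster_ok(const struct toy_extent *e, int n,
			   uint64_t bytes, uint64_t cont1_bytes, uint64_t min_bytes)
{
	uint64_t window_free = 0, max_extent = 0;

	for (int i = 0; i < n; i++) {
		if (e[i].bytes < min_bytes)
			continue;
		window_free += e[i].bytes;
		if (e[i].bytes > max_extent)
			max_extent = e[i].bytes;
	}
	return window_free >= bytes && max_extent >= cont1_bytes;
}

int main(void)
{
	struct toy_extent e[] = {
		{ 1 << 20, 256 << 10 }, { 2 << 20, 16 << 10 }, { 3 << 20, 512 << 10 },
	};

	printf("cluster possible: %d\n",
	       toy_cluster_ok(e, 3, 640 << 10, 512 << 10, 64 << 10)); /* prints 1 */
	return 0;
}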
3122 * This specifically looks for bitmaps that may work in the cluster, we assume
3123 * that we have already failed to find extents that will work.
3126 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3127 struct btrfs_free_cluster *cluster,
3128 struct list_head *bitmaps, u64 offset, u64 bytes,
3129 u64 cont1_bytes, u64 min_bytes)
3131 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3132 struct btrfs_free_space *entry = NULL;
3134 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3136 if (ctl->total_bitmaps == 0)
3140 * The bitmap that covers offset won't be in the list unless offset
3141 * is just its start offset.
3143 if (!list_empty(bitmaps))
3144 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3146 if (!entry || entry->offset != bitmap_offset) {
3147 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3148 if (entry && list_empty(&entry->list))
3149 list_add(&entry->list, bitmaps);
3152 list_for_each_entry(entry, bitmaps, list) {
3153 if (entry->bytes < bytes)
3155 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3156 bytes, cont1_bytes, min_bytes);
3162 * The bitmaps list has all the bitmaps that record free space
3163 * starting after offset, so no more search is required.
3169 * here we try to find a cluster of blocks in a block group. The goal
3170 * is to find at least bytes+empty_size.
3171 * We might not find them all in one contiguous area.
3173 * returns zero and sets up cluster if things worked out, otherwise
3174 * it returns -ENOSPC
3176 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3177 struct btrfs_free_cluster *cluster,
3178 u64 offset, u64 bytes, u64 empty_size)
3180 struct btrfs_fs_info *fs_info = block_group->fs_info;
3181 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3182 struct btrfs_free_space *entry, *tmp;
3189 * Choose the minimum extent size we'll require for this
3190 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3191 * For metadata, allow allocates with smaller extents. For
3192 * data, keep it dense.
3194 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3195 cont1_bytes = min_bytes = bytes + empty_size;
3196 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3197 cont1_bytes = bytes;
3198 min_bytes = fs_info->sectorsize;
3200 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3201 min_bytes = fs_info->sectorsize;
3204 spin_lock(&ctl->tree_lock);
3207 * If we know we don't have enough space to make a cluster don't even
3208 * bother doing all the work to try and find one.
3210 if (ctl->free_space < bytes) {
3211 spin_unlock(&ctl->tree_lock);
3215 spin_lock(&cluster->lock);
3217 /* someone already found a cluster, hooray */
3218 if (cluster->block_group) {
3223 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3226 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3228 cont1_bytes, min_bytes);
3230 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3231 offset, bytes + empty_size,
3232 cont1_bytes, min_bytes);
3234 /* Clear our temporary list */
3235 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3236 list_del_init(&entry->list);
3239 atomic_inc(&block_group->count);
3240 list_add_tail(&cluster->block_group_list,
3241 &block_group->cluster_list);
3242 cluster->block_group = block_group;
3244 trace_btrfs_failed_cluster_setup(block_group);
3247 spin_unlock(&cluster->lock);
3248 spin_unlock(&ctl->tree_lock);
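/*
 * Worked example of the minimum-extent selection above, as a standalone
 * sketch: the three branches (SSD_SPREAD, metadata, data) pick cont1_bytes
 * and min_bytes differently.  The boolean flags and the 4KiB sector size
 * are assumptions for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_SECTORSIZE 4096ULL

static void toy_cluster_limits(int ssd_spread, int metadata,
			       uint64_t bytes, uint64_t empty_size,
			       uint64_t *cont1, uint64_t *min)
{
	if (ssd_spread) {			/* no fragmentation allowed */
		*cont1 = *min = bytes + empty_size;
	} else if (metadata) {			/* smaller extents are fine */
		*cont1 = bytes;
		*min = TOY_SECTORSIZE;
	} else {				/* data: keep it dense */
		uint64_t quarter = (bytes + empty_size) >> 2;

		*cont1 = bytes > quarter ? bytes : quarter;
		*min = TOY_SECTORSIZE;
	}
}

int main(void)
{
	uint64_t cont1, min;

	/* data allocation of 1MiB with 8MiB of empty_size */
	toy_cluster_limits(0, 0, 1 << 20, 8 << 20, &cont1, &min);
	printf("data: cont1_bytes=%llu min_bytes=%llu\n",
	       (unsigned long long)cont1, (unsigned long long)min);
	return 0;
}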
3254 * simple code to zero out a cluster
3256 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3258 spin_lock_init(&cluster->lock);
3259 spin_lock_init(&cluster->refill_lock);
3260 cluster->root = RB_ROOT;
3261 cluster->max_size = 0;
3262 cluster->fragmented = false;
3263 INIT_LIST_HEAD(&cluster->block_group_list);
3264 cluster->block_group = NULL;
3267 static int do_trimming(struct btrfs_block_group *block_group,
3268 u64 *total_trimmed, u64 start, u64 bytes,
3269 u64 reserved_start, u64 reserved_bytes,
3270 enum btrfs_trim_state reserved_trim_state,
3271 struct btrfs_trim_range *trim_entry)
3273 struct btrfs_space_info *space_info = block_group->space_info;
3274 struct btrfs_fs_info *fs_info = block_group->fs_info;
3275 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3278 const u64 end = start + bytes;
3279 const u64 reserved_end = reserved_start + reserved_bytes;
3280 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3283 spin_lock(&space_info->lock);
3284 spin_lock(&block_group->lock);
3285 if (!block_group->ro) {
3286 block_group->reserved += reserved_bytes;
3287 space_info->bytes_reserved += reserved_bytes;
3290 spin_unlock(&block_group->lock);
3291 spin_unlock(&space_info->lock);
3293 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3295 *total_trimmed += trimmed;
3296 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3299 mutex_lock(&ctl->cache_writeout_mutex);
3300 if (reserved_start < start)
3301 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3302 start - reserved_start,
3303 reserved_trim_state);
3304 if (start + bytes < reserved_start + reserved_bytes)
3305 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3306 reserved_trim_state);
3307 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3308 list_del(&trim_entry->list);
3309 mutex_unlock(&ctl->cache_writeout_mutex);
3312 spin_lock(&space_info->lock);
3313 spin_lock(&block_group->lock);
3314 if (block_group->ro)
3315 space_info->bytes_readonly += reserved_bytes;
3316 block_group->reserved -= reserved_bytes;
3317 space_info->bytes_reserved -= reserved_bytes;
3318 spin_unlock(&block_group->lock);
3319 spin_unlock(&space_info->lock);
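/*
 * Sketch of the re-add bookkeeping in do_trimming() above: the reserved
 * range may be wider than what was actually discarded, so the untouched
 * front and tail go back with their previous trim state while the middle
 * goes back as trimmed.  Pure arithmetic; struct and names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_range { uint64_t start, bytes; };

static void toy_trim_leftovers(uint64_t reserved_start, uint64_t reserved_bytes,
			       uint64_t start, uint64_t bytes,
			       struct toy_range *front, struct toy_range *tail)
{
	uint64_t end = start + bytes;
	uint64_t reserved_end = reserved_start + reserved_bytes;

	front->start = reserved_start;
	front->bytes = start > reserved_start ? start - reserved_start : 0;
	tail->start = end;
	tail->bytes = reserved_end > end ? reserved_end - end : 0;
}

int main(void)
{
	struct toy_range front, tail;

	/* reserved [0, 1MiB), actually trimmed [128KiB, 128KiB + 768KiB) */
	toy_trim_leftovers(0, 1 << 20, 128 << 10, 768 << 10, &front, &tail);
	printf("front [%llu,+%llu) and tail [%llu,+%llu) keep their old trim state\n",
	       (unsigned long long)front.start, (unsigned long long)front.bytes,
	       (unsigned long long)tail.start, (unsigned long long)tail.bytes);
	return 0;
}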
3325 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3326 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3328 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3329 struct btrfs_free_space *entry;
3330 struct rb_node *node;
3334 enum btrfs_trim_state extent_trim_state;
3337 while (start < end) {
3338 struct btrfs_trim_range trim_entry;
3340 mutex_lock(&ctl->cache_writeout_mutex);
3341 spin_lock(&ctl->tree_lock);
3343 if (ctl->free_space < minlen) {
3344 spin_unlock(&ctl->tree_lock);
3345 mutex_unlock(&ctl->cache_writeout_mutex);
3349 entry = tree_search_offset(ctl, start, 0, 1);
3351 spin_unlock(&ctl->tree_lock);
3352 mutex_unlock(&ctl->cache_writeout_mutex);
3357 while (entry->bitmap) {
3358 node = rb_next(&entry->offset_index);
3360 spin_unlock(&ctl->tree_lock);
3361 mutex_unlock(&ctl->cache_writeout_mutex);
3364 entry = rb_entry(node, struct btrfs_free_space,
3368 if (entry->offset >= end) {
3369 spin_unlock(&ctl->tree_lock);
3370 mutex_unlock(&ctl->cache_writeout_mutex);
3374 extent_start = entry->offset;
3375 extent_bytes = entry->bytes;
3376 extent_trim_state = entry->trim_state;
3377 start = max(start, extent_start);
3378 bytes = min(extent_start + extent_bytes, end) - start;
3379 if (bytes < minlen) {
3380 spin_unlock(&ctl->tree_lock);
3381 mutex_unlock(&ctl->cache_writeout_mutex);
3385 unlink_free_space(ctl, entry);
3386 kmem_cache_free(btrfs_free_space_cachep, entry);
3388 spin_unlock(&ctl->tree_lock);
3389 trim_entry.start = extent_start;
3390 trim_entry.bytes = extent_bytes;
3391 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3392 mutex_unlock(&ctl->cache_writeout_mutex);
3394 ret = do_trimming(block_group, total_trimmed, start, bytes,
3395 extent_start, extent_bytes, extent_trim_state,
3402 if (fatal_signal_pending(current)) {
3414 * If we break out of trimming a bitmap prematurely, we should reset the
3415 * trimming bit. In a rather contrived case, it's possible to race here, so
3416 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3418 * start = start of bitmap
3419 * end = near end of bitmap
3421 * Thread 1: Thread 2:
3422 * trim_bitmaps(start)
3424 * end_trimming_bitmap()
3425 * reset_trimming_bitmap()
3427 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3429 struct btrfs_free_space *entry;
3431 spin_lock(&ctl->tree_lock);
3432 entry = tree_search_offset(ctl, offset, 1, 0);
3434 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3435 spin_unlock(&ctl->tree_lock);
3438 static void end_trimming_bitmap(struct btrfs_free_space *entry)
3440 if (btrfs_free_space_trimming_bitmap(entry))
3441 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3444 static int trim_bitmaps(struct btrfs_block_group *block_group,
3445 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3447 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3448 struct btrfs_free_space *entry;
3452 u64 offset = offset_to_bitmap(ctl, start);
3454 while (offset < end) {
3455 bool next_bitmap = false;
3456 struct btrfs_trim_range trim_entry;
3458 mutex_lock(&ctl->cache_writeout_mutex);
3459 spin_lock(&ctl->tree_lock);
3461 if (ctl->free_space < minlen) {
3462 spin_unlock(&ctl->tree_lock);
3463 mutex_unlock(&ctl->cache_writeout_mutex);
3467 entry = tree_search_offset(ctl, offset, 1, 0);
3468 if (!entry || btrfs_free_space_trimmed(entry)) {
3469 spin_unlock(&ctl->tree_lock);
3470 mutex_unlock(&ctl->cache_writeout_mutex);
3476 * Async discard bitmap trimming begins by setting the start
3477 * to be key.objectid and the offset_to_bitmap() aligns to the
3478 * start of the bitmap. This lets us know we are fully
3479 * scanning the bitmap rather than only some portion of it.
3481 if (start == offset)
3482 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3485 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3486 if (ret2 || start >= end) {
3488 * This keeps the invariant that all bytes are trimmed
3489 * if BTRFS_TRIM_STATE_TRIMMED is set on a bitmap.
3491 if (ret2 && !minlen)
3492 end_trimming_bitmap(entry);
3494 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3495 spin_unlock(&ctl->tree_lock);
3496 mutex_unlock(&ctl->cache_writeout_mutex);
3501 bytes = min(bytes, end - start);
3502 if (bytes < minlen) {
3503 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3504 spin_unlock(&ctl->tree_lock);
3505 mutex_unlock(&ctl->cache_writeout_mutex);
3509 bitmap_clear_bits(ctl, entry, start, bytes);
3510 if (entry->bytes == 0)
3511 free_bitmap(ctl, entry);
3513 spin_unlock(&ctl->tree_lock);
3514 trim_entry.start = start;
3515 trim_entry.bytes = bytes;
3516 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3517 mutex_unlock(&ctl->cache_writeout_mutex);
3519 ret = do_trimming(block_group, total_trimmed, start, bytes,
3520 start, bytes, 0, &trim_entry);
3522 reset_trimming_bitmap(ctl, offset);
3527 offset += BITS_PER_BITMAP * ctl->unit;
3533 if (fatal_signal_pending(current)) {
3534 if (start != offset)
3535 reset_trimming_bitmap(ctl, offset);
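/*
 * Sketch of the bitmap alignment check used above: offset_to_bitmap() rounds
 * an offset down to the start of the bitmap covering it, and only when the
 * trim start equals that rounded value is the whole bitmap being scanned.
 * This standalone model assumes a 4KiB page and a 4KiB unit, giving 128MiB
 * per bitmap; it does not reuse the kernel helpers.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_UNIT		4096ULL
#define TOY_BITS_PER_BITMAP	(4096ULL * 8)	/* PAGE_SIZE * 8 bits */
#define TOY_BYTES_PER_BITMAP	(TOY_BITS_PER_BITMAP * TOY_UNIT)

static uint64_t toy_offset_to_bitmap(uint64_t block_group_start, uint64_t offset)
{
	uint64_t bytes_per = TOY_BYTES_PER_BITMAP;

	return block_group_start +
	       (offset - block_group_start) / bytes_per * bytes_per;
}

int main(void)
{
	uint64_t bg_start = 1ULL << 30;			/* block group at 1GiB */
	uint64_t start = bg_start + (200ULL << 20);	/* trim from +200MiB   */
	uint64_t offset = toy_offset_to_bitmap(bg_start, start);

	/* start != offset here, so only part of that bitmap would be scanned */
	printf("bitmap starts at +%lluMiB, trim start at +%lluMiB, full scan: %d\n",
	       (unsigned long long)((offset - bg_start) >> 20),
	       (unsigned long long)((start - bg_start) >> 20),
	       start == offset);
	return 0;
}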
3546 void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
3548 atomic_inc(&cache->trimming);
3551 void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
3553 struct btrfs_fs_info *fs_info = block_group->fs_info;
3554 struct extent_map_tree *em_tree;
3555 struct extent_map *em;
3558 spin_lock(&block_group->lock);
3559 cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3560 block_group->removed);
3561 spin_unlock(&block_group->lock);
3564 mutex_lock(&fs_info->chunk_mutex);
3565 em_tree = &fs_info->mapping_tree;
3566 write_lock(&em_tree->lock);
3567 em = lookup_extent_mapping(em_tree, block_group->start,
3569 BUG_ON(!em); /* logic error, can't happen */
3570 remove_extent_mapping(em_tree, em);
3571 write_unlock(&em_tree->lock);
3572 mutex_unlock(&fs_info->chunk_mutex);
3574 /* once for us and once for the tree */
3575 free_extent_map(em);
3576 free_extent_map(em);
3579 * We've left one free space entry and other tasks trimming
3580 * this block group have each left one entry. Free them.
3582 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3586 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3587 u64 *trimmed, u64 start, u64 end, u64 minlen)
3589 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3595 spin_lock(&block_group->lock);
3596 if (block_group->removed) {
3597 spin_unlock(&block_group->lock);
3600 btrfs_get_block_group_trimming(block_group);
3601 spin_unlock(&block_group->lock);
3603 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3607 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3608 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3609 /* If we ended in the middle of a bitmap, reset the trimming flag */
3611 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3613 btrfs_put_block_group_trimming(block_group);
3618 * Find the left-most item in the cache tree, and then return the
3619 * smallest inode number in the item.
3621 * Note: the returned inode number may not be the smallest one in
3622 * the tree, if the left-most item is a bitmap.
3624 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3626 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3627 struct btrfs_free_space *entry = NULL;
3630 spin_lock(&ctl->tree_lock);
3632 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3635 entry = rb_entry(rb_first(&ctl->free_space_offset),
3636 struct btrfs_free_space, offset_index);
3638 if (!entry->bitmap) {
3639 ino = entry->offset;
3641 unlink_free_space(ctl, entry);
3645 kmem_cache_free(btrfs_free_space_cachep, entry);
3647 link_free_space(ctl, entry);
3653 ret = search_bitmap(ctl, entry, &offset, &count, true);
3654 /* Logic error; Should be empty if it can't find anything */
3658 bitmap_clear_bits(ctl, entry, offset, 1);
3659 if (entry->bytes == 0)
3660 free_bitmap(ctl, entry);
3663 spin_unlock(&ctl->tree_lock);
3668 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3669 struct btrfs_path *path)
3671 struct inode *inode = NULL;
3673 spin_lock(&root->ino_cache_lock);
3674 if (root->ino_cache_inode)
3675 inode = igrab(root->ino_cache_inode);
3676 spin_unlock(&root->ino_cache_lock);
3680 inode = __lookup_free_space_inode(root, path, 0);
3684 spin_lock(&root->ino_cache_lock);
3685 if (!btrfs_fs_closing(root->fs_info))
3686 root->ino_cache_inode = igrab(inode);
3687 spin_unlock(&root->ino_cache_lock);
3692 int create_free_ino_inode(struct btrfs_root *root,
3693 struct btrfs_trans_handle *trans,
3694 struct btrfs_path *path)
3696 return __create_free_space_inode(root, trans, path,
3697 BTRFS_FREE_INO_OBJECTID, 0);
3700 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3702 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3703 struct btrfs_path *path;
3704 struct inode *inode;
3706 u64 root_gen = btrfs_root_generation(&root->root_item);
3708 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3712 * If we're unmounting then just return, since this does a search on the
3713 * normal root and not the commit root and we could deadlock.
3715 if (btrfs_fs_closing(fs_info))
3718 path = btrfs_alloc_path();
3722 inode = lookup_free_ino_inode(root, path);
3726 if (root_gen != BTRFS_I(inode)->generation)
3729 ret = __load_free_space_cache(root, inode, ctl, path, 0);
3733 "failed to load free ino cache for root %llu",
3734 root->root_key.objectid);
3738 btrfs_free_path(path);
3742 int btrfs_write_out_ino_cache(struct btrfs_root *root,
3743 struct btrfs_trans_handle *trans,
3744 struct btrfs_path *path,
3745 struct inode *inode)
3747 struct btrfs_fs_info *fs_info = root->fs_info;
3748 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3750 struct btrfs_io_ctl io_ctl;
3751 bool release_metadata = true;
3753 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3756 memset(&io_ctl, 0, sizeof(io_ctl));
3757 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3760 * At this point writepages() didn't error out, so our metadata
3761 * reservation is released when the writeback finishes, at
3762 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
3763 * with or without an error.
3765 release_metadata = false;
3766 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
3770 if (release_metadata)
3771 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3772 inode->i_size, true);
3775 "failed to write free ino cache for root %llu",
3776 root->root_key.objectid);
3783 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3785 * Use this if you need to make a bitmap or extent entry specifically; it
3786 * doesn't do any of the merging that add_free_space does. This acts a lot like
3787 * how the free space cache loading stuff works, so you can get really weird
3790 int test_add_free_space_entry(struct btrfs_block_group *cache,
3791 u64 offset, u64 bytes, bool bitmap)
3793 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3794 struct btrfs_free_space *info = NULL, *bitmap_info;
3796 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
3802 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3808 spin_lock(&ctl->tree_lock);
3809 info->offset = offset;
3810 info->bytes = bytes;
3811 info->max_extent_size = 0;
3812 ret = link_free_space(ctl, info);
3813 spin_unlock(&ctl->tree_lock);
3815 kmem_cache_free(btrfs_free_space_cachep, info);
3820 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
3822 kmem_cache_free(btrfs_free_space_cachep, info);
3827 spin_lock(&ctl->tree_lock);
3828 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3833 add_new_bitmap(ctl, info, offset);
3838 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
3841 bytes -= bytes_added;
3842 offset += bytes_added;
3843 spin_unlock(&ctl->tree_lock);
3849 kmem_cache_free(btrfs_free_space_cachep, info);
3851 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
3856 * Checks to see if the given range is in the free space cache. This is really
3857 * just used to check the absence of space, so if there is free space in the
3858 * range at all we will return 1.
3860 int test_check_exists(struct btrfs_block_group *cache,
3861 u64 offset, u64 bytes)
3863 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3864 struct btrfs_free_space *info;
3867 spin_lock(&ctl->tree_lock);
3868 info = tree_search_offset(ctl, offset, 0, 0);
3870 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3878 u64 bit_off, bit_bytes;
3880 struct btrfs_free_space *tmp;
3883 bit_bytes = ctl->unit;
3884 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3886 if (bit_off == offset) {
3889 } else if (bit_off > offset &&
3890 offset + bytes > bit_off) {
3896 n = rb_prev(&info->offset_index);
3898 tmp = rb_entry(n, struct btrfs_free_space,
3900 if (tmp->offset + tmp->bytes < offset)
3902 if (offset + bytes < tmp->offset) {
3903 n = rb_prev(&tmp->offset_index);
3910 n = rb_next(&info->offset_index);
3912 tmp = rb_entry(n, struct btrfs_free_space,
3914 if (offset + bytes < tmp->offset)
3916 if (tmp->offset + tmp->bytes < offset) {
3917 n = rb_next(&tmp->offset_index);
3928 if (info->offset == offset) {
3933 if (offset > info->offset && offset < info->offset + info->bytes)
3936 spin_unlock(&ctl->tree_lock);
3939 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */