// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
                                 bool may_use_included)
{
        ASSERT(s_info);
        return s_info->bytes_used + s_info->bytes_reserved +
                s_info->bytes_pinned + s_info->bytes_readonly +
                (may_use_included ? s_info->bytes_may_use : 0);
}
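
/*
 * Illustrative sketch (not part of the original file): callers typically
 * derive "free" space by subtracting this sum from total_bytes while
 * holding space_info->lock, clamping at zero because bytes_may_use can
 * overcommit past the total (see may_commit_transaction() below).  The
 * helper name is hypothetical.
 */
static u64 example_space_info_free(struct btrfs_space_info *s_info)
{
        u64 used = btrfs_space_info_used(s_info, true);

        /* used may exceed total_bytes when we have overcommitted. */
        return used < s_info->total_bytes ? s_info->total_bytes - used : 0;
}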

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *space_info;
        int i;
        int ret;

        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
        if (!space_info)
                return -ENOMEM;

        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
                                  GFP_KERNEL);
        if (ret) {
                kfree(space_info);
                return ret;
        }

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&space_info->block_groups[i]);
        init_rwsem(&space_info->groups_sem);
        spin_lock_init(&space_info->lock);
        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
        INIT_LIST_HEAD(&space_info->ro_bgs);
        INIT_LIST_HEAD(&space_info->tickets);
        INIT_LIST_HEAD(&space_info->priority_tickets);

        ret = btrfs_sysfs_add_space_info_type(info, space_info);
        if (ret)
                return ret;

        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;

        return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}

void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             u64 bytes_readonly,
                             struct btrfs_space_info **space_info)
{
        struct btrfs_space_info *found;
        int factor;

        factor = btrfs_bg_type_to_factor(flags);

        found = btrfs_find_space_info(info, flags);
        ASSERT(found);
        spin_lock(&found->lock);
        found->total_bytes += total_bytes;
        found->disk_total += total_bytes * factor;
        found->bytes_used += bytes_used;
        found->disk_used += bytes_used * factor;
        found->bytes_readonly += bytes_readonly;
        if (total_bytes > 0)
                found->full = 0;
        btrfs_try_granting_tickets(info, found);
        spin_unlock(&found->lock);
        *space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
        return (global->size << 1);
}

static int can_overcommit(struct btrfs_fs_info *fs_info,
                          struct btrfs_space_info *space_info, u64 bytes,
                          enum btrfs_reserve_flush_enum flush)
{
        u64 profile;
        u64 avail;
        u64 used;
        int factor;

        /* Don't overcommit when in mixed mode. */
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
                profile = btrfs_system_alloc_profile(fs_info);
        else
                profile = btrfs_metadata_alloc_profile(fs_info);

        used = btrfs_space_info_used(space_info, true);
        avail = atomic64_read(&fs_info->free_chunk_space);

        /*
         * If we have dup, raid1 or raid10 then only half of the free
         * space is actually usable.  For raid56, the space info used
         * doesn't include the parity drive, so we don't have to
         * change the math.
         */
        factor = btrfs_bg_type_to_factor(profile);
        avail = div_u64(avail, factor);

        /*
         * If we aren't flushing all things, let us overcommit up to
         * 1/2 of the space.  If we can flush, don't let us overcommit
         * too much; let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        if (used + bytes < space_info->total_bytes + avail)
                return 1;
        return 0;
}
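
/*
 * Worked example (illustrative, not from the original source): with a
 * RAID1 metadata profile btrfs_bg_type_to_factor() returns 2, so 8GiB of
 * free_chunk_space yields avail = 4GiB.  A BTRFS_RESERVE_FLUSH_ALL caller
 * may then overcommit by avail >> 3 = 512MiB, while any other flush mode
 * is allowed the more generous avail >> 1 = 2GiB.
 */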

/*
 * This is for space we already have accounted in space_info->bytes_may_use,
 * so basically when we're returning space from block_rsvs.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info)
{
        struct list_head *head;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

        lockdep_assert_held(&space_info->lock);

        head = &space_info->priority_tickets;
again:
        while (!list_empty(head)) {
                struct reserve_ticket *ticket;
                u64 used = btrfs_space_info_used(space_info, true);

                ticket = list_first_entry(head, struct reserve_ticket, list);

                /* Check and see if our ticket can be satisfied now. */
                if ((used + ticket->bytes <= space_info->total_bytes) ||
                    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        list_del_init(&ticket->list);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        break;
                }
        }

        if (head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
}
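
/*
 * Example scenario (illustrative): tickets are granted strictly in FIFO
 * order and the loop above stops at the first ticket it cannot satisfy.
 * So with total_bytes = 10GiB, used = 9GiB, and a 2GiB ticket queued ahead
 * of a 512MiB ticket, neither is granted even though the smaller one would
 * fit; maybe_fail_all_tickets() below exists to break exactly this kind of
 * logjam once flushing has been exhausted.
 */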

#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
                   __rsv->size, __rsv->reserved);                       \
        spin_unlock(&__rsv->lock);                                      \
} while (0)

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *info)
{
        lockdep_assert_held(&info->lock);

        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
                   info->flags,
                   info->total_bytes - btrfs_space_info_used(info, true),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly);

        DUMP_BLOCK_RSV(fs_info, global_block_rsv);
        DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
        DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups)
{
        struct btrfs_block_group *cache;
        int index = 0;

        spin_lock(&info->lock);
        __btrfs_dump_space_info(fs_info, info);
        spin_unlock(&info->lock);

        if (!dump_block_groups)
                return;

        down_read(&info->groups_sem);
again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                btrfs_info(fs_info,
                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
                        cache->start, cache->length, cache->used, cache->pinned,
                        cache->reserved, cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
        if (++index < BTRFS_NR_RAID_TYPES)
                goto again;
        up_read(&info->groups_sem);
}
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
                                         unsigned long nr_pages, int nr_items)
{
        struct super_block *sb = fs_info->sb;

        if (down_read_trylock(&sb->s_umount)) {
                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
                up_read(&sb->s_umount);
        } else {
                /*
                 * We needn't worry about the filesystem going from r/w to r/o
                 * even though we don't acquire the ->s_umount mutex, because
                 * the filesystem guarantees that the delalloc inode list is
                 * empty once the filesystem is read-only (all dirty pages
                 * have been written to disk).
                 */
                btrfs_start_delalloc_roots(fs_info, nr_items);
                if (!current->journal_info)
                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
        }
}

static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
                                        u64 to_reclaim)
{
        u64 bytes;
        u64 nr;

        bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
        nr = div64_u64(to_reclaim, bytes);
        if (!nr)
                nr = 1;
        return nr;
}
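
/*
 * Worked example (illustrative): btrfs_calc_insert_metadata_size(fs_info, 1)
 * is nodesize * BTRFS_MAX_LEVEL * 2, i.e. 16KiB * 8 * 2 = 256KiB with the
 * default 16KiB nodesize, so asking to reclaim 4MiB translates into
 * 4MiB / 256KiB = 16 items (and never less than 1).
 */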

#define EXTENT_SIZE_PER_ITEM    SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
{
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        u64 dio_bytes;
        u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
        int loops;

        /* Calculate the number of items we need to flush for this reservation. */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
        dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        if (delalloc_bytes == 0 && dio_bytes == 0) {
                if (trans)
                        return;
                if (wait_ordered)
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                return;
        }

        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
         * that likely won't give us the space back we need.
         */
        if (dio_bytes > delalloc_bytes)
                wait_ordered = true;

        loops = 0;
        while ((delalloc_bytes || dio_bytes) && loops < 3) {
                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

                /*
                 * Trigger inode writeback for up to nr_pages.  This will
                 * invoke the ->writepages callback and start delalloc filling
                 * (btrfs_run_delalloc_range()).
                 */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

                /*
                 * We need to wait for the compressed pages to start before
                 * we continue.
                 */
                async_pages = atomic_read(&fs_info->async_delalloc_pages);
                if (!async_pages)
                        goto skip_async;

                /*
                 * Calculate how many compressed pages we want to be written
                 * before we continue.  I.e. if there are more async pages
                 * than we require, wait_event will wait until nr_pages are
                 * written.
                 */
                if (async_pages <= nr_pages)
                        async_pages = 0;
                else
                        async_pages -= nr_pages;

                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)async_pages);
skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
                spin_unlock(&space_info->lock);

                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
                dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        }
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the filesystem
 * @space_info - the space_info we're allocating for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_trans_handle *trans;
        u64 bytes_needed;
        u64 reclaim_bytes = 0;
        u64 cur_free_bytes = 0;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;

        spin_lock(&space_info->lock);
        cur_free_bytes = btrfs_space_info_used(space_info, true);
        if (cur_free_bytes < space_info->total_bytes)
                cur_free_bytes = space_info->total_bytes - cur_free_bytes;
        else
                cur_free_bytes = 0;

        if (!list_empty(&space_info->priority_tickets))
                ticket = list_first_entry(&space_info->priority_tickets,
                                          struct reserve_ticket, list);
        else if (!list_empty(&space_info->tickets))
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
        bytes_needed = (ticket) ? ticket->bytes : 0;

        if (bytes_needed > cur_free_bytes)
                bytes_needed -= cur_free_bytes;
        else
                bytes_needed = 0;
        spin_unlock(&space_info->lock);

        if (!bytes_needed)
                return 0;

        trans = btrfs_join_transaction(fs_info->extent_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /*
         * See if there is enough pinned space to make this reservation, or if
         * we have block groups that are going to be freed, allowing us to
         * possibly do a chunk allocation the next loop through.
         */
        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
            __percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;

        /*
         * See if there is some space in the delayed insertion reservation for
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
                goto enospc;

        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
        spin_unlock(&delayed_rsv->lock);

        spin_lock(&delayed_refs_rsv->lock);
        reclaim_bytes += delayed_refs_rsv->reserved;
        spin_unlock(&delayed_refs_rsv->lock);
        if (reclaim_bytes >= bytes_needed)
                goto commit;
        bytes_needed -= reclaim_bytes;

        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
                goto enospc;

commit:
        return btrfs_commit_transaction(trans);
enospc:
        btrfs_end_transaction(trans);
        return -ENOSPC;
}
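
/*
 * Worked example (illustrative): if the head ticket wants 4MiB and the
 * space_info has 1MiB free, bytes_needed is 3MiB.  Committing is worth it
 * only if at least 3MiB is pinned (returned to the free pool at commit
 * time) or can be reclaimed from the delayed rsvs; otherwise the commit
 * is skipped and -ENOSPC is returned.
 */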

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                        struct btrfs_space_info *space_info, u64 num_bytes,
                        int state)
{
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int nr;
        int ret = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
                else
                        nr = -1;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                if (state == FLUSH_DELAYED_REFS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
                else
                        nr = 0;
                btrfs_run_delayed_refs(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_chunk_alloc(trans,
                                btrfs_metadata_alloc_profile(fs_info),
                                (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
                                        CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case RUN_DELAYED_IPUTS:
                /*
                 * If we have pending delayed iputs then we could free up a
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);
                break;
        case COMMIT_TRANS:
                ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                ret);
        return;
}
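
/*
 * Note (illustrative, based on the btrfs_flush_state enum in space-info.h
 * for this kernel): the async reclaim worker below walks these states in
 * increasing order of cost, roughly FLUSH_DELAYED_ITEMS_NR ->
 * FLUSH_DELAYED_ITEMS -> FLUSH_DELAYED_REFS_NR -> FLUSH_DELAYED_REFS ->
 * FLUSH_DELALLOC -> FLUSH_DELALLOC_WAIT -> ALLOC_CHUNK ->
 * ALLOC_CHUNK_FORCE -> RUN_DELAYED_IPUTS -> COMMIT_TRANS, restarting from
 * the cheapest state whenever a ticket gets satisfied.
 */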

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket;
        u64 used;
        u64 expected;
        u64 to_reclaim = 0;

        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
                to_reclaim += ticket->bytes;
        if (to_reclaim)
                return to_reclaim;

        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
        if (can_overcommit(fs_info, space_info, to_reclaim,
                           BTRFS_RESERVE_FLUSH_ALL))
                return 0;

        used = btrfs_space_info_used(space_info, true);

        if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
                expected = div_factor_fine(space_info->total_bytes, 95);
        else
                expected = div_factor_fine(space_info->total_bytes, 90);

        if (used > expected)
                to_reclaim = used - expected;
        else
                to_reclaim = 0;
        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
                                     space_info->bytes_reserved);
        return to_reclaim;
}
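
/*
 * Worked example (illustrative): div_factor_fine(x, n) is x * n / 100.
 * With total_bytes = 10GiB and used = 9.8GiB, expected is 9.5GiB when a
 * 1MiB overcommit would still succeed (95%) and 9GiB otherwise (90%), so
 * to_reclaim becomes 0.3GiB or 0.8GiB, clamped to what is actually held
 * in bytes_may_use + bytes_reserved.
 */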

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        u64 used)
{
        u64 thresh = div_factor_fine(space_info->total_bytes, 98);

        /* If we're just plain full then async reclaim just slows us down. */
        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
                return 0;

        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
                return 0;

        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                                   struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket;
        u64 tickets_id = space_info->tickets_id;
        u64 first_ticket_bytes = 0;

        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
                __btrfs_dump_space_info(fs_info, space_info);
        }

        while (!list_empty(&space_info->tickets) &&
               tickets_id == space_info->tickets_id) {
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);

                /*
                 * may_commit_transaction will avoid committing the transaction
                 * if it doesn't feel like the space reclaimed by the commit
                 * would result in the ticket succeeding.  However if we have a
                 * smaller ticket in the queue it may be small enough to be
                 * satisfied by committing the transaction, so if any
                 * subsequent ticket is smaller than the first ticket go ahead
                 * and send us back for another loop through the enospc flushing
                 * code.
                 */
                if (first_ticket_bytes == 0)
                        first_ticket_bytes = ticket->bytes;
                else if (first_ticket_bytes > ticket->bytes)
                        return true;

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_info(fs_info, "failing ticket with %llu bytes",
                                   ticket->bytes);

                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);

                /*
                 * We're just throwing tickets away, so more flushing may not
                 * trip over btrfs_try_granting_tickets, so we need to call it
                 * here to see if we can make progress with the next ticket in
                 * the list.
                 */
                btrfs_try_granting_tickets(fs_info, space_info);
        }
        return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                flush_space(fs_info, space_info, to_reclaim, flush_state);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info);
                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space.  Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim.  We would rather use that than possibly create
                 * an underutilized metadata chunk.  So if this is our first
                 * run through the flushing state machine skip ALLOC_CHUNK_FORCE
                 * and commit the transaction.  If nothing has changed the next
                 * go around then we can force a chunk allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (maybe_fail_all_tickets(fs_info, space_info)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELAYED_REFS_NR,
        FLUSH_DELAYED_REFS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                struct reserve_ticket *ticket,
                                const enum btrfs_flush_state *states,
                                int states_nr)
{
        u64 to_reclaim;
        int flush_state;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
        if (!to_reclaim) {
                spin_unlock(&space_info->lock);
                return;
        }
        spin_unlock(&space_info->lock);

        flush_state = 0;
        do {
                flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
                        spin_unlock(&space_info->lock);
                        return;
                }
                spin_unlock(&space_info->lock);
        } while (flush_state < states_nr);
}

static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                struct reserve_ticket *ticket)
{
        DEFINE_WAIT(wait);
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
                if (ret) {
                        /*
                         * Delete us from the list.  After we unlock the space
                         * info, we don't want the async reclaim job to reserve
                         * space for this ticket.  If that happened, the
                         * ticket's task would not know that space was reserved
                         * despite getting an error, resulting in a space leak
                         * (bytes_may_use counter of our space_info).
                         */
                        list_del_init(&ticket->list);
                        ticket->error = -EINTR;
                        break;
                }
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 struct reserve_ticket *ticket,
                                 enum btrfs_reserve_flush_enum flush)
{
        int ret;

        switch (flush) {
        case BTRFS_RESERVE_FLUSH_ALL:
                wait_reserve_ticket(fs_info, space_info, ticket);
                break;
        case BTRFS_RESERVE_FLUSH_LIMIT:
                priority_reclaim_metadata_space(fs_info, space_info, ticket,
                                                priority_flush_states,
                                                ARRAY_SIZE(priority_flush_states));
                break;
        case BTRFS_RESERVE_FLUSH_EVICT:
                priority_reclaim_metadata_space(fs_info, space_info, ticket,
                                                evict_flush_states,
                                                ARRAY_SIZE(evict_flush_states));
                break;
        default:
                ASSERT(0);
                break;
        }

        spin_lock(&space_info->lock);
        ret = ticket->error;
        if (ticket->bytes || ticket->error) {
                /*
                 * Need to delete here for priority tickets. For regular tickets
                 * either the async reclaim job deletes the ticket from the list
                 * or we delete it ourselves at wait_reserve_ticket().
                 */
                list_del_init(&ticket->list);
                if (!ret)
                        ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);
        ASSERT(list_empty(&ticket->list));
        /*
         * Check that we can't have an error set if the reservation succeeded,
         * as that would confuse tasks and lead them to error out without
         * releasing reserved space (if an error happens the expectation is that
         * space wasn't reserved at all).
         */
        ASSERT(!(ticket->bytes == 0 && ticket->error));
        return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the given space_info.
 * If there is not enough space it will make an attempt to flush out space to
 * make room.  It will do this by flushing delalloc if possible or committing
 * the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush)
{
        struct reserve_ticket ticket;
        u64 used;
        int ret = 0;
        bool pending_tickets;

        ASSERT(orig_bytes);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

        spin_lock(&space_info->lock);
        ret = -ENOSPC;
        used = btrfs_space_info_used(space_info, true);
        pending_tickets = !list_empty(&space_info->tickets) ||
                !list_empty(&space_info->priority_tickets);

        /*
         * Carry on if we have enough space (short-circuit) OR call
         * can_overcommit() to ensure we can overcommit to continue.
         */
        if (!pending_tickets &&
            ((used + orig_bytes <= space_info->total_bytes) ||
             can_overcommit(fs_info, space_info, orig_bytes, flush))) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                ret = 0;
        }

        /*
         * If we couldn't make a reservation then setup our reservation ticket
         * and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq,
                                           &fs_info->async_reclaim_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    need_do_async_reclaim(fs_info, space_info, used) &&
                    !work_busy(&fs_info->async_reclaim_work)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush, "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->async_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                return ret;

        return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
                                 struct btrfs_block_rsv *block_rsv,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;

        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                       orig_bytes, flush);
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                if (block_rsv != global_rsv &&
                    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_dump_space_info(fs_info, block_rsv->space_info,
                                              orig_bytes, 0);
        }
        return ret;
}
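
/*
 * Usage sketch (illustrative, not part of the original file): a typical
 * caller sizes the reservation with btrfs_calc_insert_metadata_size() and
 * charges it against the block_rsv it is working with.  The helper name
 * and the choice of trans_block_rsv here are hypothetical.
 */
static int example_reserve_one_item(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

        /* Allow the full flushing machinery, including a commit. */
        return btrfs_reserve_metadata_bytes(root, &fs_info->trans_block_rsv,
                                            bytes, BTRFS_RESERVE_FLUSH_ALL);
}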