drivers/md/dm-cache-target.c

   1 /*
   2  * Copyright (C) 2012 Red Hat. All rights reserved.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm.h"
   8 #include "dm-bio-prison.h"
   9 #include "dm-bio-record.h"
  10 #include "dm-cache-metadata.h"
  11
  12 #include <linux/dm-io.h>
  13 #include <linux/dm-kcopyd.h>
  14 #include <linux/jiffies.h>
  15 #include <linux/init.h>
  16 #include <linux/mempool.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/vmalloc.h>
  20
  21 #define DM_MSG_PREFIX "cache"
  22
  23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
  24         "A percentage of time allocated for copying to and/or from cache");
  25
  26 /*----------------------------------------------------------------*/
  27
  28 #define IOT_RESOLUTION 4
  29
  30 struct io_tracker {
  31         spinlock_t lock;
  32
  33         /*
  34          * Sectors of in-flight IO.
  35          */
  36         sector_t in_flight;
  37
  38         /*
  39          * The time, in jiffies, when this device became idle (if it is
  40          * indeed idle).
  41          */
  42         unsigned long idle_time;
  43         unsigned long last_update_time;
  44 };
  45
  46 static void iot_init(struct io_tracker *iot)
  47 {
  48         spin_lock_init(&iot->lock);
  49         iot->in_flight = 0ul;
  50         iot->idle_time = 0ul;
  51         iot->last_update_time = jiffies;
  52 }
  53
  54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  55 {
  56         if (iot->in_flight)
  57                 return false;
  58
  59         return time_after(jiffies, iot->idle_time + jifs);
  60 }
  61
  62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  63 {
  64         bool r;
  65         unsigned long flags;
  66
  67         spin_lock_irqsave(&iot->lock, flags);
  68         r = __iot_idle_for(iot, jifs);
  69         spin_unlock_irqrestore(&iot->lock, flags);
  70
  71         return r;
  72 }
  73
  74 static void iot_io_begin(struct io_tracker *iot, sector_t len)
  75 {
  76         unsigned long flags;
  77
  78         spin_lock_irqsave(&iot->lock, flags);
  79         iot->in_flight += len;
  80         spin_unlock_irqrestore(&iot->lock, flags);
  81 }
  82
  83 static void __iot_io_end(struct io_tracker *iot, sector_t len)
  84 {
  85         iot->in_flight -= len;
  86         if (!iot->in_flight)
  87                 iot->idle_time = jiffies;
  88 }
  89
  90 static void iot_io_end(struct io_tracker *iot, sector_t len)
  91 {
  92         unsigned long flags;
  93
  94         spin_lock_irqsave(&iot->lock, flags);
  95         __iot_io_end(iot, len);
  96         spin_unlock_irqrestore(&iot->lock, flags);
  97 }
  98
  99 /*----------------------------------------------------------------*/
 100
 101 /*
 102  * Glossary:
 103  *
 104  * oblock: index of an origin block
 105  * cblock: index of a cache block
 106  * promotion: movement of a block from origin to cache
 107  * demotion: movement of a block from cache to origin
 108  * migration: movement of a block between the origin and cache device,
 109  *            either direction
 110  */
 111
 112 /*----------------------------------------------------------------*/
 113
 114 /*
 115  * There are a couple of places where we let a bio run, but want to do some
 116  * work before calling its endio function.  We do this by temporarily
 117  * changing the endio fn.
 118  */
 119 struct dm_hook_info {
 120         bio_end_io_t *bi_end_io;
 121 };
 122
 123 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
 124                         bio_end_io_t *bi_end_io, void *bi_private)
 125 {
 126         h->bi_end_io = bio->bi_end_io;
 127
 128         bio->bi_end_io = bi_end_io;
 129         bio->bi_private = bi_private;
 130 }
 131
 132 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 133 {
 134         bio->bi_end_io = h->bi_end_io;
 135 }
 136
 137 /*----------------------------------------------------------------*/
 138
 139 #define MIGRATION_POOL_SIZE 128
 140 #define COMMIT_PERIOD HZ
 141 #define MIGRATION_COUNT_WINDOW 10
 142
 143 /*
 144  * The block size of the device holding cache data must be
 145  * between 32KB and 1GB.
 146  */
 147 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 148 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 149
 150 enum cache_metadata_mode {
 151         CM_WRITE,               /* metadata may be changed */
 152         CM_READ_ONLY,           /* metadata may not be changed */
 153         CM_FAIL
 154 };
 155
 156 enum cache_io_mode {
 157         /*
 158          * Data is written to cached blocks only.  These blocks are marked
 159          * dirty.  If you lose the cache device you will lose data.
 160          * Potential performance increase for both reads and writes.
 161          */
 162         CM_IO_WRITEBACK,
 163
 164         /*
 165          * Data is written to both cache and origin.  Blocks are never
 166          * dirty.  Potential performance benfit for reads only.
 167          */
 168         CM_IO_WRITETHROUGH,
 169
 170         /*
 171          * A degraded mode useful for various cache coherency situations
 172          * (eg, rolling back snapshots).  Reads and writes always go to the
 173          * origin.  If a write goes to a cached oblock, then the cache
 174          * block is invalidated.
 175          */
 176         CM_IO_PASSTHROUGH
 177 };
 178
 179 struct cache_features {
 180         enum cache_metadata_mode mode;
 181         enum cache_io_mode io_mode;
 182         unsigned metadata_version;
 183 };
 184
 185 struct cache_stats {
 186         atomic_t read_hit;
 187         atomic_t read_miss;
 188         atomic_t write_hit;
 189         atomic_t write_miss;
 190         atomic_t demotion;
 191         atomic_t promotion;
 192         atomic_t copies_avoided;
 193         atomic_t cache_cell_clash;
 194         atomic_t commit_count;
 195         atomic_t discard_count;
 196 };
 197
 198 /*
 199  * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
 200  * the one-past-the-end value.
 201  */
 202 struct cblock_range {
 203         dm_cblock_t begin;
 204         dm_cblock_t end;
 205 };
 206
 207 struct invalidation_request {
 208         struct list_head list;
 209         struct cblock_range *cblocks;
 210
 211         atomic_t complete;
 212         int err;
 213
 214         wait_queue_head_t result_wait;
 215 };
 216
 217 struct cache {
 218         struct dm_target *ti;
 219         struct dm_target_callbacks callbacks;
 220
 221         struct dm_cache_metadata *cmd;
 222
 223         /*
 224          * Metadata is written to this device.
 225          */
 226         struct dm_dev *metadata_dev;
 227
 228         /*
 229          * The slower of the two data devices.  Typically a spindle.
 230          */
 231         struct dm_dev *origin_dev;
 232
 233         /*
 234          * The faster of the two data devices.  Typically an SSD.
 235          */
 236         struct dm_dev *cache_dev;
 237
 238         /*
 239          * Size of the origin device in _complete_ blocks and native sectors.
 240          */
 241         dm_oblock_t origin_blocks;
 242         sector_t origin_sectors;
 243
 244         /*
 245          * Size of the cache device in blocks.
 246          */
 247         dm_cblock_t cache_size;
 248
 249         /*
 250          * Fields for converting from sectors to blocks.
 251          */
 252         sector_t sectors_per_block;
 253         int sectors_per_block_shift;
 254
 255         spinlock_t lock;
 256         struct list_head deferred_cells;
 257         struct bio_list deferred_bios;
 258         struct bio_list deferred_flush_bios;
 259         struct bio_list deferred_writethrough_bios;
 260         struct list_head quiesced_migrations;
 261         struct list_head completed_migrations;
 262         struct list_head need_commit_migrations;
 263         sector_t migration_threshold;
 264         wait_queue_head_t migration_wait;
 265         atomic_t nr_allocated_migrations;
 266
 267         /*
 268          * The number of in flight migrations that are performing
 269          * background io. eg, promotion, writeback.
 270          */
 271         atomic_t nr_io_migrations;
 272
 273         wait_queue_head_t quiescing_wait;
 274         atomic_t quiescing;
 275         atomic_t quiescing_ack;
 276
 277         /*
 278          * cache_size entries, dirty if set
 279          */
 280         atomic_t nr_dirty;
 281         unsigned long *dirty_bitset;
 282
 283         /*
 284          * origin_blocks entries, discarded if set.
 285          */
 286         dm_dblock_t discard_nr_blocks;
 287         unsigned long *discard_bitset;
 288         uint32_t discard_block_size; /* a power of 2 times sectors per block */
 289
 290         /*
 291          * Rather than reconstructing the table line for the status we just
 292          * save it and regurgitate.
 293          */
 294         unsigned nr_ctr_args;
 295         const char **ctr_args;
 296
 297         struct dm_kcopyd_client *copier;
 298         struct workqueue_struct *wq;
 299         struct work_struct worker;
 300
 301         struct delayed_work waker;
 302         unsigned long last_commit_jiffies;
 303
 304         struct dm_bio_prison *prison;
 305         struct dm_deferred_set *all_io_ds;
 306
 307         mempool_t *migration_pool;
 308
 309         struct dm_cache_policy *policy;
 310         unsigned policy_nr_args;
 311
 312         bool need_tick_bio:1;
 313         bool sized:1;
 314         bool invalidate:1;
 315         bool commit_requested:1;
 316         bool loaded_mappings:1;
 317         bool loaded_discards:1;
 318
 319         /*
 320          * Cache features such as write-through.
 321          */
 322         struct cache_features features;
 323
 324         struct cache_stats stats;
 325
 326         /*
 327          * Invalidation fields.
 328          */
 329         spinlock_t invalidation_lock;
 330         struct list_head invalidation_requests;
 331
 332         struct io_tracker origin_tracker;
 333 };
 334
 335 struct per_bio_data {
 336         bool tick:1;
 337         unsigned req_nr:2;
 338         struct dm_deferred_entry *all_io_entry;
 339         struct dm_hook_info hook_info;
 340         sector_t len;
 341
 342         /*
 343          * writethrough fields.  These MUST remain at the end of this
 344          * structure and the 'cache' member must be the first as it
 345          * is used to determine the offset of the writethrough fields.
 346          */
 347         struct cache *cache;
 348         dm_cblock_t cblock;
 349         struct dm_bio_details bio_details;
 350 };
 351
 352 struct dm_cache_migration {
 353         struct list_head list;
 354         struct cache *cache;
 355
 356         unsigned long start_jiffies;
 357         dm_oblock_t old_oblock;
 358         dm_oblock_t new_oblock;
 359         dm_cblock_t cblock;
 360
 361         bool err:1;
 362         bool discard:1;
 363         bool writeback:1;
 364         bool demote:1;
 365         bool promote:1;
 366         bool requeue_holder:1;
 367         bool invalidate:1;
 368
 369         struct dm_bio_prison_cell *old_ocell;
 370         struct dm_bio_prison_cell *new_ocell;
 371 };
 372
 373 /*
 374  * Processing a bio in the worker thread may require these memory
 375  * allocations.  We prealloc to avoid deadlocks (the same worker thread
 376  * frees them back to the mempool).
 377  */
 378 struct prealloc {
 379         struct dm_cache_migration *mg;
 380         struct dm_bio_prison_cell *cell1;
 381         struct dm_bio_prison_cell *cell2;
 382 };
 383
 384 static enum cache_metadata_mode get_cache_mode(struct cache *cache);
 385
 386 static void wake_worker(struct cache *cache)
 387 {
 388         queue_work(cache->wq, &cache->worker);
 389 }
 390
 391 /*----------------------------------------------------------------*/
 392
 393 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
 394 {
 395         /* FIXME: change to use a local slab. */
 396         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
 397 }
 398
 399 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
 400 {
 401         dm_bio_prison_free_cell(cache->prison, cell);
 402 }
 403
 404 static struct dm_cache_migration *alloc_migration(struct cache *cache)
 405 {
 406         struct dm_cache_migration *mg;
 407
 408         mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
 409         if (mg) {
 410                 mg->cache = cache;
 411                 atomic_inc(&mg->cache->nr_allocated_migrations);
 412         }
 413
 414         return mg;
 415 }
 416
 417 static void free_migration(struct dm_cache_migration *mg)
 418 {
 419         struct cache *cache = mg->cache;
 420
 421         if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 422                 wake_up(&cache->migration_wait);
 423
 424         mempool_free(mg, cache->migration_pool);
 425 }
 426
 427 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
 428 {
 429         if (!p->mg) {
 430                 p->mg = alloc_migration(cache);
 431                 if (!p->mg)
 432                         return -ENOMEM;
 433         }
 434
 435         if (!p->cell1) {
 436                 p->cell1 = alloc_prison_cell(cache);
 437                 if (!p->cell1)
 438                         return -ENOMEM;
 439         }
 440
 441         if (!p->cell2) {
 442                 p->cell2 = alloc_prison_cell(cache);
 443                 if (!p->cell2)
 444                         return -ENOMEM;
 445         }
 446
 447         return 0;
 448 }
 449
 450 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
 451 {
 452         if (p->cell2)
 453                 free_prison_cell(cache, p->cell2);
 454
 455         if (p->cell1)
 456                 free_prison_cell(cache, p->cell1);
 457
 458         if (p->mg)
 459                 free_migration(p->mg);
 460 }
 461
 462 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
 463 {
 464         struct dm_cache_migration *mg = p->mg;
 465
 466         BUG_ON(!mg);
 467         p->mg = NULL;
 468
 469         return mg;
 470 }
 471
 472 /*
 473  * You must have a cell within the prealloc struct to return.  If not this
 474  * function will BUG() rather than returning NULL.
 475  */
 476 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
 477 {
 478         struct dm_bio_prison_cell *r = NULL;
 479
 480         if (p->cell1) {
 481                 r = p->cell1;
 482                 p->cell1 = NULL;
 483
 484         } else if (p->cell2) {
 485                 r = p->cell2;
 486                 p->cell2 = NULL;
 487         } else
 488                 BUG();
 489
 490         return r;
 491 }
 492
 493 /*
 494  * You can't have more than two cells in a prealloc struct.  BUG() will be
 495  * called if you try and overfill.
 496  */
 497 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
 498 {
 499         if (!p->cell2)
 500                 p->cell2 = cell;
 501
 502         else if (!p->cell1)
 503                 p->cell1 = cell;
 504
 505         else
 506                 BUG();
 507 }
 508
 509 /*----------------------------------------------------------------*/
 510
 511 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
 512 {
 513         key->virtual = 0;
 514         key->dev = 0;
 515         key->block_begin = from_oblock(begin);
 516         key->block_end = from_oblock(end);
 517 }
 518
 519 /*
 520  * The caller hands in a preallocated cell, and a free function for it.
 521  * The cell will be freed if there's an error, or if it wasn't used because
 522  * a cell with that key already exists.
 523  */
 524 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
 525
 526 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
 527                             struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
 528                             cell_free_fn free_fn, void *free_context,
 529                             struct dm_bio_prison_cell **cell_result)
 530 {
 531         int r;
 532         struct dm_cell_key key;
 533
 534         build_key(oblock_begin, oblock_end, &key);
 535         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
 536         if (r)
 537                 free_fn(free_context, cell_prealloc);
 538
 539         return r;
 540 }
 541
 542 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
 543                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
 544                       cell_free_fn free_fn, void *free_context,
 545                       struct dm_bio_prison_cell **cell_result)
 546 {
 547         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
 548         return bio_detain_range(cache, oblock, end, bio,
 549                                 cell_prealloc, free_fn, free_context, cell_result);
 550 }
 551
 552 static int get_cell(struct cache *cache,
 553                     dm_oblock_t oblock,
 554                     struct prealloc *structs,
 555                     struct dm_bio_prison_cell **cell_result)
 556 {
 557         int r;
 558         struct dm_cell_key key;
 559         struct dm_bio_prison_cell *cell_prealloc;
 560
 561         cell_prealloc = prealloc_get_cell(structs);
 562
 563         build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
 564         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
 565         if (r)
 566                 prealloc_put_cell(structs, cell_prealloc);
 567
 568         return r;
 569 }
 570
 571 /*----------------------------------------------------------------*/
 572
 573 static bool is_dirty(struct cache *cache, dm_cblock_t b)
 574 {
 575         return test_bit(from_cblock(b), cache->dirty_bitset);
 576 }
 577
 578 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
 579 {
 580         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 581                 atomic_inc(&cache->nr_dirty);
 582                 policy_set_dirty(cache->policy, oblock);
 583         }
 584 }
 585
 586 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
 587 {
 588         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 589                 policy_clear_dirty(cache->policy, oblock);
 590                 if (atomic_dec_return(&cache->nr_dirty) == 0)
 591                         dm_table_event(cache->ti->table);
 592         }
 593 }
 594
 595 /*----------------------------------------------------------------*/
 596
 597 static bool block_size_is_power_of_two(struct cache *cache)
 598 {
 599         return cache->sectors_per_block_shift >= 0;
 600 }
 601
 602 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
 603 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
 604 __always_inline
 605 #endif
 606 static dm_block_t block_div(dm_block_t b, uint32_t n)
 607 {
 608         do_div(b, n);
 609
 610         return b;
 611 }
 612
 613 static dm_block_t oblocks_per_dblock(struct cache *cache)
 614 {
 615         dm_block_t oblocks = cache->discard_block_size;
 616
 617         if (block_size_is_power_of_two(cache))
 618                 oblocks >>= cache->sectors_per_block_shift;
 619         else
 620                 oblocks = block_div(oblocks, cache->sectors_per_block);
 621
 622         return oblocks;
 623 }
 624
 625 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 626 {
 627         return to_dblock(block_div(from_oblock(oblock),
 628                                    oblocks_per_dblock(cache)));
 629 }
 630
 631 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
 632 {
 633         return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
 634 }
 635
 636 static void set_discard(struct cache *cache, dm_dblock_t b)
 637 {
 638         unsigned long flags;
 639
 640         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 641         atomic_inc(&cache->stats.discard_count);
 642
 643         spin_lock_irqsave(&cache->lock, flags);
 644         set_bit(from_dblock(b), cache->discard_bitset);
 645         spin_unlock_irqrestore(&cache->lock, flags);
 646 }
 647
 648 static void clear_discard(struct cache *cache, dm_dblock_t b)
 649 {
 650         unsigned long flags;
 651
 652         spin_lock_irqsave(&cache->lock, flags);
 653         clear_bit(from_dblock(b), cache->discard_bitset);
 654         spin_unlock_irqrestore(&cache->lock, flags);
 655 }
 656
 657 static bool is_discarded(struct cache *cache, dm_dblock_t b)
 658 {
 659         int r;
 660         unsigned long flags;
 661
 662         spin_lock_irqsave(&cache->lock, flags);
 663         r = test_bit(from_dblock(b), cache->discard_bitset);
 664         spin_unlock_irqrestore(&cache->lock, flags);
 665
 666         return r;
 667 }
 668
 669 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 670 {
 671         int r;
 672         unsigned long flags;
 673
 674         spin_lock_irqsave(&cache->lock, flags);
 675         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 676                      cache->discard_bitset);
 677         spin_unlock_irqrestore(&cache->lock, flags);
 678
 679         return r;
 680 }
 681
 682 /*----------------------------------------------------------------*/
 683
 684 static void load_stats(struct cache *cache)
 685 {
 686         struct dm_cache_statistics stats;
 687
 688         dm_cache_metadata_get_stats(cache->cmd, &stats);
 689         atomic_set(&cache->stats.read_hit, stats.read_hits);
 690         atomic_set(&cache->stats.read_miss, stats.read_misses);
 691         atomic_set(&cache->stats.write_hit, stats.write_hits);
 692         atomic_set(&cache->stats.write_miss, stats.write_misses);
 693 }
 694
 695 static void save_stats(struct cache *cache)
 696 {
 697         struct dm_cache_statistics stats;
 698
 699         if (get_cache_mode(cache) >= CM_READ_ONLY)
 700                 return;
 701
 702         stats.read_hits = atomic_read(&cache->stats.read_hit);
 703         stats.read_misses = atomic_read(&cache->stats.read_miss);
 704         stats.write_hits = atomic_read(&cache->stats.write_hit);
 705         stats.write_misses = atomic_read(&cache->stats.write_miss);
 706
 707         dm_cache_metadata_set_stats(cache->cmd, &stats);
 708 }
 709
 710 /*----------------------------------------------------------------
 711  * Per bio data
 712  *--------------------------------------------------------------*/
 713
 714 /*
 715  * If using writeback, leave out struct per_bio_data's writethrough fields.
 716  */
 717 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
 718 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
 719
 720 static bool writethrough_mode(struct cache_features *f)
 721 {
 722         return f->io_mode == CM_IO_WRITETHROUGH;
 723 }
 724
 725 static bool writeback_mode(struct cache_features *f)
 726 {
 727         return f->io_mode == CM_IO_WRITEBACK;
 728 }
 729
 730 static bool passthrough_mode(struct cache_features *f)
 731 {
 732         return f->io_mode == CM_IO_PASSTHROUGH;
 733 }
 734
 735 static size_t get_per_bio_data_size(struct cache *cache)
 736 {
 737         return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
 738 }
 739
 740 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
 741 {
 742         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
 743         BUG_ON(!pb);
 744         return pb;
 745 }
 746
 747 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
 748 {
 749         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
 750
 751         pb->tick = false;
 752         pb->req_nr = dm_bio_get_target_bio_nr(bio);
 753         pb->all_io_entry = NULL;
 754         pb->len = 0;
 755
 756         return pb;
 757 }
 758
 759 /*----------------------------------------------------------------
 760  * Remapping
 761  *--------------------------------------------------------------*/
 762 static void remap_to_origin(struct cache *cache, struct bio *bio)
 763 {
 764         bio->bi_bdev = cache->origin_dev->bdev;
 765 }
 766
 767 static void remap_to_cache(struct cache *cache, struct bio *bio,
 768                            dm_cblock_t cblock)
 769 {
 770         sector_t bi_sector = bio->bi_iter.bi_sector;
 771         sector_t block = from_cblock(cblock);
 772
 773         bio->bi_bdev = cache->cache_dev->bdev;
 774         if (!block_size_is_power_of_two(cache))
 775                 bio->bi_iter.bi_sector =
 776                         (block * cache->sectors_per_block) +
 777                         sector_div(bi_sector, cache->sectors_per_block);
 778         else
 779                 bio->bi_iter.bi_sector =
 780                         (block << cache->sectors_per_block_shift) |
 781                         (bi_sector & (cache->sectors_per_block - 1));
 782 }
 783
 784 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 785 {
 786         unsigned long flags;
 787         size_t pb_data_size = get_per_bio_data_size(cache);
 788         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 789
 790         spin_lock_irqsave(&cache->lock, flags);
 791         if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 792             bio_op(bio) != REQ_OP_DISCARD) {
 793                 pb->tick = true;
 794                 cache->need_tick_bio = false;
 795         }
 796         spin_unlock_irqrestore(&cache->lock, flags);
 797 }
 798
 799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 800                                   dm_oblock_t oblock)
 801 {
 802         check_if_tick_bio_needed(cache, bio);
 803         remap_to_origin(cache, bio);
 804         if (bio_data_dir(bio) == WRITE)
 805                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 806 }
 807
 808 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 809                                  dm_oblock_t oblock, dm_cblock_t cblock)
 810 {
 811         check_if_tick_bio_needed(cache, bio);
 812         remap_to_cache(cache, bio, cblock);
 813         if (bio_data_dir(bio) == WRITE) {
 814                 set_dirty(cache, oblock, cblock);
 815                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 816         }
 817 }
 818
 819 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 820 {
 821         sector_t block_nr = bio->bi_iter.bi_sector;
 822
 823         if (!block_size_is_power_of_two(cache))
 824                 (void) sector_div(block_nr, cache->sectors_per_block);
 825         else
 826                 block_nr >>= cache->sectors_per_block_shift;
 827
 828         return to_oblock(block_nr);
 829 }
 830
 831 /*
 832  * You must increment the deferred set whilst the prison cell is held.  To
 833  * encourage this, we ask for 'cell' to be passed in.
 834  */
 835 static void inc_ds(struct cache *cache, struct bio *bio,
 836                    struct dm_bio_prison_cell *cell)
 837 {
 838         size_t pb_data_size = get_per_bio_data_size(cache);
 839         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 840
 841         BUG_ON(!cell);
 842         BUG_ON(pb->all_io_entry);
 843
 844         pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 845 }
 846
 847 static bool accountable_bio(struct cache *cache, struct bio *bio)
 848 {
 849         return ((bio->bi_bdev == cache->origin_dev->bdev) &&
 850                 bio_op(bio) != REQ_OP_DISCARD);
 851 }
 852
 853 static void accounted_begin(struct cache *cache, struct bio *bio)
 854 {
 855         size_t pb_data_size = get_per_bio_data_size(cache);
 856         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 857
 858         if (accountable_bio(cache, bio)) {
 859                 pb->len = bio_sectors(bio);
 860                 iot_io_begin(&cache->origin_tracker, pb->len);
 861         }
 862 }
 863
 864 static void accounted_complete(struct cache *cache, struct bio *bio)
 865 {
 866         size_t pb_data_size = get_per_bio_data_size(cache);
 867         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 868
 869         iot_io_end(&cache->origin_tracker, pb->len);
 870 }
 871
 872 static void accounted_request(struct cache *cache, struct bio *bio)
 873 {
 874         accounted_begin(cache, bio);
 875         generic_make_request(bio);
 876 }
 877
 878 static void issue(struct cache *cache, struct bio *bio)
 879 {
 880         unsigned long flags;
 881
 882         if (!op_is_flush(bio->bi_opf)) {
 883                 accounted_request(cache, bio);
 884                 return;
 885         }
 886
 887         /*
 888          * Batch together any bios that trigger commits and then issue a
 889          * single commit for them in do_worker().
 890          */
 891         spin_lock_irqsave(&cache->lock, flags);
 892         cache->commit_requested = true;
 893         bio_list_add(&cache->deferred_flush_bios, bio);
 894         spin_unlock_irqrestore(&cache->lock, flags);
 895 }
 896
 897 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
 898 {
 899         inc_ds(cache, bio, cell);
 900         issue(cache, bio);
 901 }
 902
 903 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 904 {
 905         unsigned long flags;
 906
 907         spin_lock_irqsave(&cache->lock, flags);
 908         bio_list_add(&cache->deferred_writethrough_bios, bio);
 909         spin_unlock_irqrestore(&cache->lock, flags);
 910
 911         wake_worker(cache);
 912 }
 913
 914 static void writethrough_endio(struct bio *bio)
 915 {
 916         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
 917
 918         dm_unhook_bio(&pb->hook_info, bio);
 919
 920         if (bio->bi_error) {
 921                 bio_endio(bio);
 922                 return;
 923         }
 924
 925         dm_bio_restore(&pb->bio_details, bio);
 926         remap_to_cache(pb->cache, bio, pb->cblock);
 927
 928         /*
 929          * We can't issue this bio directly, since we're in interrupt
 930          * context.  So it gets put on a bio list for processing by the
 931          * worker thread.
 932          */
 933         defer_writethrough_bio(pb->cache, bio);
 934 }
 935
 936 /*
 937  * When running in writethrough mode we need to send writes to clean blocks
 938  * to both the cache and origin devices.  In future we'd like to clone the
 939  * bio and send them in parallel, but for now we're doing them in
 940  * series as this is easier.
 941  */
 942 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 943                                        dm_oblock_t oblock, dm_cblock_t cblock)
 944 {
 945         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
 946
 947         pb->cache = cache;
 948         pb->cblock = cblock;
 949         dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
 950         dm_bio_record(&pb->bio_details, bio);
 951
 952         remap_to_origin_clear_discard(pb->cache, bio, oblock);
 953 }
 954
 955 /*----------------------------------------------------------------
 956  * Failure modes
 957  *--------------------------------------------------------------*/
 958 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 959 {
 960         return cache->features.mode;
 961 }
 962
 963 static const char *cache_device_name(struct cache *cache)
 964 {
 965         return dm_device_name(dm_table_get_md(cache->ti->table));
 966 }
 967
 968 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 969 {
 970         const char *descs[] = {
 971                 "write",
 972                 "read-only",
 973                 "fail"
 974         };
 975
 976         dm_table_event(cache->ti->table);
 977         DMINFO("%s: switching cache to %s mode",
 978                cache_device_name(cache), descs[(int)mode]);
 979 }
 980
 981 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 982 {
 983         bool needs_check;
 984         enum cache_metadata_mode old_mode = get_cache_mode(cache);
 985
 986         if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
 987                 DMERR("%s: unable to read needs_check flag, setting failure mode.",
 988                       cache_device_name(cache));
 989                 new_mode = CM_FAIL;
 990         }
 991
 992         if (new_mode == CM_WRITE && needs_check) {
 993                 DMERR("%s: unable to switch cache to write mode until repaired.",
 994                       cache_device_name(cache));
 995                 if (old_mode != new_mode)
 996                         new_mode = old_mode;
 997                 else
 998                         new_mode = CM_READ_ONLY;
 999         }
1000
1001         /* Never move out of fail mode */
1002         if (old_mode == CM_FAIL)
1003                 new_mode = CM_FAIL;
1004
1005         switch (new_mode) {
1006         case CM_FAIL:
1007         case CM_READ_ONLY:
1008                 dm_cache_metadata_set_read_only(cache->cmd);
1009                 break;
1010
1011         case CM_WRITE:
1012                 dm_cache_metadata_set_read_write(cache->cmd);
1013                 break;
1014         }
1015
1016         cache->features.mode = new_mode;
1017
1018         if (new_mode != old_mode)
1019                 notify_mode_switch(cache, new_mode);
1020 }
1021
1022 static void abort_transaction(struct cache *cache)
1023 {
1024         const char *dev_name = cache_device_name(cache);
1025
1026         if (get_cache_mode(cache) >= CM_READ_ONLY)
1027                 return;
1028
1029         if (dm_cache_metadata_set_needs_check(cache->cmd)) {
1030                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1031                 set_cache_mode(cache, CM_FAIL);
1032         }
1033
1034         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1035         if (dm_cache_metadata_abort(cache->cmd)) {
1036                 DMERR("%s: failed to abort metadata transaction", dev_name);
1037                 set_cache_mode(cache, CM_FAIL);
1038         }
1039 }
1040
1041 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1042 {
1043         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1044                     cache_device_name(cache), op, r);
1045         abort_transaction(cache);
1046         set_cache_mode(cache, CM_READ_ONLY);
1047 }
1048
1049 /*----------------------------------------------------------------
1050  * Migration processing
1051  *
1052  * Migration covers moving data from the origin device to the cache, or
1053  * vice versa.
1054  *--------------------------------------------------------------*/
1055 static void inc_io_migrations(struct cache *cache)
1056 {
1057         atomic_inc(&cache->nr_io_migrations);
1058 }
1059
1060 static void dec_io_migrations(struct cache *cache)
1061 {
1062         atomic_dec(&cache->nr_io_migrations);
1063 }
1064
1065 static bool discard_or_flush(struct bio *bio)
1066 {
1067         return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1068 }
1069
1070 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
1071 {
1072         if (discard_or_flush(cell->holder)) {
1073                 /*
1074                  * We have to handle these bios individually.
1075                  */
1076                 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1077                 free_prison_cell(cache, cell);
1078         } else
1079                 list_add_tail(&cell->user_list, &cache->deferred_cells);
1080 }
1081
1082 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1083 {
1084         unsigned long flags;
1085
1086         if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1087                 /*
1088                  * There was no prisoner to promote to holder, the
1089                  * cell has been released.
1090                  */
1091                 free_prison_cell(cache, cell);
1092                 return;
1093         }
1094
1095         spin_lock_irqsave(&cache->lock, flags);
1096         __cell_defer(cache, cell);
1097         spin_unlock_irqrestore(&cache->lock, flags);
1098
1099         wake_worker(cache);
1100 }
1101
1102 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
1103 {
1104         dm_cell_error(cache->prison, cell, err);
1105         free_prison_cell(cache, cell);
1106 }
1107
1108 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
1109 {
1110         cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
1111 }
1112
1113 static void free_io_migration(struct dm_cache_migration *mg)
1114 {
1115         struct cache *cache = mg->cache;
1116
1117         dec_io_migrations(cache);
1118         free_migration(mg);
1119         wake_worker(cache);
1120 }
1121
1122 static void migration_failure(struct dm_cache_migration *mg)
1123 {
1124         struct cache *cache = mg->cache;
1125         const char *dev_name = cache_device_name(cache);
1126
1127         if (mg->writeback) {
1128                 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1129                 set_dirty(cache, mg->old_oblock, mg->cblock);
1130                 cell_defer(cache, mg->old_ocell, false);
1131
1132         } else if (mg->demote) {
1133                 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1134                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1135
1136                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1137                 if (mg->promote)
1138                         cell_defer(cache, mg->new_ocell, true);
1139         } else {
1140                 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1141                 policy_remove_mapping(cache->policy, mg->new_oblock);
1142                 cell_defer(cache, mg->new_ocell, true);
1143         }
1144
1145         free_io_migration(mg);
1146 }
1147
1148 static void migration_success_pre_commit(struct dm_cache_migration *mg)
1149 {
1150         int r;
1151         unsigned long flags;
1152         struct cache *cache = mg->cache;
1153
1154         if (mg->writeback) {
1155                 clear_dirty(cache, mg->old_oblock, mg->cblock);
1156                 cell_defer(cache, mg->old_ocell, false);
1157                 free_io_migration(mg);
1158                 return;
1159
1160         } else if (mg->demote) {
1161                 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1162                 if (r) {
1163                         DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1164                                     cache_device_name(cache));
1165                         metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1166                         policy_force_mapping(cache->policy, mg->new_oblock,
1167                                              mg->old_oblock);
1168                         if (mg->promote)
1169                                 cell_defer(cache, mg->new_ocell, true);
1170                         free_io_migration(mg);
1171                         return;
1172                 }
1173         } else {
1174                 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1175                 if (r) {
1176                         DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1177                                     cache_device_name(cache));
1178                         metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1179                         policy_remove_mapping(cache->policy, mg->new_oblock);
1180                         free_io_migration(mg);
1181                         return;
1182                 }
1183         }
1184
1185         spin_lock_irqsave(&cache->lock, flags);
1186         list_add_tail(&mg->list, &cache->need_commit_migrations);
1187         cache->commit_requested = true;
1188         spin_unlock_irqrestore(&cache->lock, flags);
1189 }
1190
1191 static void migration_success_post_commit(struct dm_cache_migration *mg)
1192 {
1193         unsigned long flags;
1194         struct cache *cache = mg->cache;
1195
1196         if (mg->writeback) {
1197                 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1198                              cache_device_name(cache));
1199                 return;
1200
1201         } else if (mg->demote) {
1202                 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1203
1204                 if (mg->promote) {
1205                         mg->demote = false;
1206
1207                         spin_lock_irqsave(&cache->lock, flags);
1208                         list_add_tail(&mg->list, &cache->quiesced_migrations);
1209                         spin_unlock_irqrestore(&cache->lock, flags);
1210
1211                 } else {
1212                         if (mg->invalidate)
1213                                 policy_remove_mapping(cache->policy, mg->old_oblock);
1214                         free_io_migration(mg);
1215                 }
1216
1217         } else {
1218                 if (mg->requeue_holder) {
1219                         clear_dirty(cache, mg->new_oblock, mg->cblock);
1220                         cell_defer(cache, mg->new_ocell, true);
1221                 } else {
1222                         /*
1223                          * The block was promoted via an overwrite, so it's dirty.
1224                          */
1225                         set_dirty(cache, mg->new_oblock, mg->cblock);
1226                         bio_endio(mg->new_ocell->holder);
1227                         cell_defer(cache, mg->new_ocell, false);
1228                 }
1229                 free_io_migration(mg);
1230         }
1231 }
1232
1233 static void copy_complete(int read_err, unsigned long write_err, void *context)
1234 {
1235         unsigned long flags;
1236         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1237         struct cache *cache = mg->cache;
1238
1239         if (read_err || write_err)
1240                 mg->err = true;
1241
1242         spin_lock_irqsave(&cache->lock, flags);
1243         list_add_tail(&mg->list, &cache->completed_migrations);
1244         spin_unlock_irqrestore(&cache->lock, flags);
1245
1246         wake_worker(cache);
1247 }
1248
1249 static void issue_copy(struct dm_cache_migration *mg)
1250 {
1251         int r;
1252         struct dm_io_region o_region, c_region;
1253         struct cache *cache = mg->cache;
1254         sector_t cblock = from_cblock(mg->cblock);
1255
1256         o_region.bdev = cache->origin_dev->bdev;
1257         o_region.count = cache->sectors_per_block;
1258
1259         c_region.bdev = cache->cache_dev->bdev;
1260         c_region.sector = cblock * cache->sectors_per_block;
1261         c_region.count = cache->sectors_per_block;
1262
1263         if (mg->writeback || mg->demote) {
1264                 /* demote */
1265                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1266                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1267         } else {
1268                 /* promote */
1269                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1270                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1271         }
1272
1273         if (r < 0) {
1274                 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
1275                 migration_failure(mg);
1276         }
1277 }
1278
1279 static void overwrite_endio(struct bio *bio)
1280 {
1281         struct dm_cache_migration *mg = bio->bi_private;
1282         struct cache *cache = mg->cache;
1283         size_t pb_data_size = get_per_bio_data_size(cache);
1284         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1285         unsigned long flags;
1286
1287         dm_unhook_bio(&pb->hook_info, bio);
1288
1289         if (bio->bi_error)
1290                 mg->err = true;
1291
1292         mg->requeue_holder = false;
1293
1294         spin_lock_irqsave(&cache->lock, flags);
1295         list_add_tail(&mg->list, &cache->completed_migrations);
1296         spin_unlock_irqrestore(&cache->lock, flags);
1297
1298         wake_worker(cache);
1299 }
1300
1301 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1302 {
1303         size_t pb_data_size = get_per_bio_data_size(mg->cache);
1304         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1305
1306         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1307         remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1308
1309         /*
1310          * No need to inc_ds() here, since the cell will be held for the
1311          * duration of the io.
1312          */
1313         accounted_request(mg->cache, bio);
1314 }
1315
1316 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1317 {
1318         return (bio_data_dir(bio) == WRITE) &&
1319                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1320 }
1321
1322 static void avoid_copy(struct dm_cache_migration *mg)
1323 {
1324         atomic_inc(&mg->cache->stats.copies_avoided);
1325         migration_success_pre_commit(mg);
1326 }
1327
1328 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1329                                      dm_dblock_t *b, dm_dblock_t *e)
1330 {
1331         sector_t sb = bio->bi_iter.bi_sector;
1332         sector_t se = bio_end_sector(bio);
1333
1334         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1335
1336         if (se - sb < cache->discard_block_size)
1337                 *e = *b;
1338         else
1339                 *e = to_dblock(block_div(se, cache->discard_block_size));
1340 }
1341
1342 static void issue_discard(struct dm_cache_migration *mg)
1343 {
1344         dm_dblock_t b, e;
1345         struct bio *bio = mg->new_ocell->holder;
1346         struct cache *cache = mg->cache;
1347
1348         calc_discard_block_range(cache, bio, &b, &e);
1349         while (b != e) {
1350                 set_discard(cache, b);
1351                 b = to_dblock(from_dblock(b) + 1);
1352         }
1353
1354         bio_endio(bio);
1355         cell_defer(cache, mg->new_ocell, false);
1356         free_migration(mg);
1357         wake_worker(cache);
1358 }
1359
1360 static void issue_copy_or_discard(struct dm_cache_migration *mg)
1361 {
1362         bool avoid;
1363         struct cache *cache = mg->cache;
1364
1365         if (mg->discard) {
1366                 issue_discard(mg);
1367                 return;
1368         }
1369
1370         if (mg->writeback || mg->demote)
1371                 avoid = !is_dirty(cache, mg->cblock) ||
1372                         is_discarded_oblock(cache, mg->old_oblock);
1373         else {
1374                 struct bio *bio = mg->new_ocell->holder;
1375
1376                 avoid = is_discarded_oblock(cache, mg->new_oblock);
1377
1378                 if (writeback_mode(&cache->features) &&
1379                     !avoid && bio_writes_complete_block(cache, bio)) {
1380                         issue_overwrite(mg, bio);
1381                         return;
1382                 }
1383         }
1384
1385         avoid ? avoid_copy(mg) : issue_copy(mg);
1386 }
1387
1388 static void complete_migration(struct dm_cache_migration *mg)
1389 {
1390         if (mg->err)
1391                 migration_failure(mg);
1392         else
1393                 migration_success_pre_commit(mg);
1394 }
1395
1396 static void process_migrations(struct cache *cache, struct list_head *head,
1397                                void (*fn)(struct dm_cache_migration *))
1398 {
1399         unsigned long flags;
1400         struct list_head list;
1401         struct dm_cache_migration *mg, *tmp;
1402
1403         INIT_LIST_HEAD(&list);
1404         spin_lock_irqsave(&cache->lock, flags);
1405         list_splice_init(head, &list);
1406         spin_unlock_irqrestore(&cache->lock, flags);
1407
1408         list_for_each_entry_safe(mg, tmp, &list, list)
1409                 fn(mg);
1410 }
1411
1412 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1413 {
1414         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1415 }
1416
1417 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1418 {
1419         unsigned long flags;
1420         struct cache *cache = mg->cache;
1421
1422         spin_lock_irqsave(&cache->lock, flags);
1423         __queue_quiesced_migration(mg);
1424         spin_unlock_irqrestore(&cache->lock, flags);
1425
1426         wake_worker(cache);
1427 }
1428
1429 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1430 {
1431         unsigned long flags;
1432         struct dm_cache_migration *mg, *tmp;
1433
1434         spin_lock_irqsave(&cache->lock, flags);
1435         list_for_each_entry_safe(mg, tmp, work, list)
1436                 __queue_quiesced_migration(mg);
1437         spin_unlock_irqrestore(&cache->lock, flags);
1438
1439         wake_worker(cache);
1440 }
1441
1442 static void check_for_quiesced_migrations(struct cache *cache,
1443                                           struct per_bio_data *pb)
1444 {
1445         struct list_head work;
1446
1447         if (!pb->all_io_entry)
1448                 return;
1449
1450         INIT_LIST_HEAD(&work);
1451         dm_deferred_entry_dec(pb->all_io_entry, &work);
1452
1453         if (!list_empty(&work))
1454                 queue_quiesced_migrations(cache, &work);
1455 }
1456
1457 static void quiesce_migration(struct dm_cache_migration *mg)
1458 {
1459         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1460                 queue_quiesced_migration(mg);
1461 }
1462
1463 static void promote(struct cache *cache, struct prealloc *structs,
1464                     dm_oblock_t oblock, dm_cblock_t cblock,
1465                     struct dm_bio_prison_cell *cell)
1466 {
1467         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1468
1469         mg->err = false;
1470         mg->discard = false;
1471         mg->writeback = false;
1472         mg->demote = false;
1473         mg->promote = true;
1474         mg->requeue_holder = true;
1475         mg->invalidate = false;
1476         mg->cache = cache;
1477         mg->new_oblock = oblock;
1478         mg->cblock = cblock;
1479         mg->old_ocell = NULL;
1480         mg->new_ocell = cell;
1481         mg->start_jiffies = jiffies;
1482
1483         inc_io_migrations(cache);
1484         quiesce_migration(mg);
1485 }
1486
1487 static void writeback(struct cache *cache, struct prealloc *structs,
1488                       dm_oblock_t oblock, dm_cblock_t cblock,
1489                       struct dm_bio_prison_cell *cell)
1490 {
1491         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1492
1493         mg->err = false;
1494         mg->discard = false;
1495         mg->writeback = true;
1496         mg->demote = false;
1497         mg->promote = false;
1498         mg->requeue_holder = true;
1499         mg->invalidate = false;
1500         mg->cache = cache;
1501         mg->old_oblock = oblock;
1502         mg->cblock = cblock;
1503         mg->old_ocell = cell;
1504         mg->new_ocell = NULL;
1505         mg->start_jiffies = jiffies;
1506
1507         inc_io_migrations(cache);
1508         quiesce_migration(mg);
1509 }
1510
1511 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1512                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1513                                 dm_cblock_t cblock,
1514                                 struct dm_bio_prison_cell *old_ocell,
1515                                 struct dm_bio_prison_cell *new_ocell)
1516 {
1517         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1518
1519         mg->err = false;
1520         mg->discard = false;
1521         mg->writeback = false;
1522         mg->demote = true;
1523         mg->promote = true;
1524         mg->requeue_holder = true;
1525         mg->invalidate = false;
1526         mg->cache = cache;
1527         mg->old_oblock = old_oblock;
1528         mg->new_oblock = new_oblock;
1529         mg->cblock = cblock;
1530         mg->old_ocell = old_ocell;
1531         mg->new_ocell = new_ocell;
1532         mg->start_jiffies = jiffies;
1533
1534         inc_io_migrations(cache);
1535         quiesce_migration(mg);
1536 }
1537
1538 /*
1539  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1540  * block are thrown away.
1541  */
1542 static void invalidate(struct cache *cache, struct prealloc *structs,
1543                        dm_oblock_t oblock, dm_cblock_t cblock,
1544                        struct dm_bio_prison_cell *cell)
1545 {
1546         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1547
1548         mg->err = false;
1549         mg->discard = false;
1550         mg->writeback = false;
1551         mg->demote = true;
1552         mg->promote = false;
1553         mg->requeue_holder = true;
1554         mg->invalidate = true;
1555         mg->cache = cache;
1556         mg->old_oblock = oblock;
1557         mg->cblock = cblock;
1558         mg->old_ocell = cell;
1559         mg->new_ocell = NULL;
1560         mg->start_jiffies = jiffies;
1561
1562         inc_io_migrations(cache);
1563         quiesce_migration(mg);
1564 }
1565
1566 static void discard(struct cache *cache, struct prealloc *structs,
1567                     struct dm_bio_prison_cell *cell)
1568 {
1569         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1570
1571         mg->err = false;
1572         mg->discard = true;
1573         mg->writeback = false;
1574         mg->demote = false;
1575         mg->promote = false;
1576         mg->requeue_holder = false;
1577         mg->invalidate = false;
1578         mg->cache = cache;
1579         mg->old_ocell = NULL;
1580         mg->new_ocell = cell;
1581         mg->start_jiffies = jiffies;
1582
1583         quiesce_migration(mg);
1584 }
1585
1586 /*----------------------------------------------------------------
1587  * bio processing
1588  *--------------------------------------------------------------*/
1589 static void defer_bio(struct cache *cache, struct bio *bio)
1590 {
1591         unsigned long flags;
1592
1593         spin_lock_irqsave(&cache->lock, flags);
1594         bio_list_add(&cache->deferred_bios, bio);
1595         spin_unlock_irqrestore(&cache->lock, flags);
1596
1597         wake_worker(cache);
1598 }
1599
1600 static void process_flush_bio(struct cache *cache, struct bio *bio)
1601 {
1602         size_t pb_data_size = get_per_bio_data_size(cache);
1603         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1604
1605         BUG_ON(bio->bi_iter.bi_size);
1606         if (!pb->req_nr)
1607                 remap_to_origin(cache, bio);
1608         else
1609                 remap_to_cache(cache, bio, 0);
1610
1611         /*
1612          * REQ_PREFLUSH is not directed at any particular block so we don't
1613          * need to inc_ds().  REQ_FUA's are split into a write + REQ_PREFLUSH
1614          * by dm-core.
1615          */
1616         issue(cache, bio);
1617 }
1618
1619 static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1620                                 struct bio *bio)
1621 {
1622         int r;
1623         dm_dblock_t b, e;
1624         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1625
1626         calc_discard_block_range(cache, bio, &b, &e);
1627         if (b == e) {
1628                 bio_endio(bio);
1629                 return;
1630         }
1631
1632         cell_prealloc = prealloc_get_cell(structs);
1633         r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1634                              (cell_free_fn) prealloc_put_cell,
1635                              structs, &new_ocell);
1636         if (r > 0)
1637                 return;
1638
1639         discard(cache, structs, new_ocell);
1640 }
1641
1642 static bool spare_migration_bandwidth(struct cache *cache)
1643 {
1644         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1645                 cache->sectors_per_block;
1646         return current_volume < cache->migration_threshold;
1647 }
1648
1649 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1650 {
1651         atomic_inc(bio_data_dir(bio) == READ ?
1652                    &cache->stats.read_hit : &cache->stats.write_hit);
1653 }
1654
1655 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1656 {
1657         atomic_inc(bio_data_dir(bio) == READ ?
1658                    &cache->stats.read_miss : &cache->stats.write_miss);
1659 }
1660
1661 /*----------------------------------------------------------------*/
1662
1663 struct inc_detail {
1664         struct cache *cache;
1665         struct bio_list bios_for_issue;
1666         struct bio_list unhandled_bios;
1667         bool any_writes;
1668 };
1669
1670 static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1671 {
1672         struct bio *bio;
1673         struct inc_detail *detail = context;
1674         struct cache *cache = detail->cache;
1675
1676         inc_ds(cache, cell->holder, cell);
1677         if (bio_data_dir(cell->holder) == WRITE)
1678                 detail->any_writes = true;
1679
1680         while ((bio = bio_list_pop(&cell->bios))) {
1681                 if (discard_or_flush(bio)) {
1682                         bio_list_add(&detail->unhandled_bios, bio);
1683                         continue;
1684                 }
1685
1686                 if (bio_data_dir(bio) == WRITE)
1687                         detail->any_writes = true;
1688
1689                 bio_list_add(&detail->bios_for_issue, bio);
1690                 inc_ds(cache, bio, cell);
1691         }
1692 }
1693
1694 // FIXME: refactor these two
1695 static void remap_cell_to_origin_clear_discard(struct cache *cache,
1696                                                struct dm_bio_prison_cell *cell,
1697                                                dm_oblock_t oblock, bool issue_holder)
1698 {
1699         struct bio *bio;
1700         unsigned long flags;
1701         struct inc_detail detail;
1702
1703         detail.cache = cache;
1704         bio_list_init(&detail.bios_for_issue);
1705         bio_list_init(&detail.unhandled_bios);
1706         detail.any_writes = false;
1707
1708         spin_lock_irqsave(&cache->lock, flags);
1709         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1710         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1711         spin_unlock_irqrestore(&cache->lock, flags);
1712
1713         remap_to_origin(cache, cell->holder);
1714         if (issue_holder)
1715                 issue(cache, cell->holder);
1716         else
1717                 accounted_begin(cache, cell->holder);
1718
1719         if (detail.any_writes)
1720                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1721
1722         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1723                 remap_to_origin(cache, bio);
1724                 issue(cache, bio);
1725         }
1726
1727         free_prison_cell(cache, cell);
1728 }
1729
1730 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
1731                                       dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
1732 {
1733         struct bio *bio;
1734         unsigned long flags;
1735         struct inc_detail detail;
1736
1737         detail.cache = cache;
1738         bio_list_init(&detail.bios_for_issue);
1739         bio_list_init(&detail.unhandled_bios);
1740         detail.any_writes = false;
1741
1742         spin_lock_irqsave(&cache->lock, flags);
1743         dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1744         bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1745         spin_unlock_irqrestore(&cache->lock, flags);
1746
1747         remap_to_cache(cache, cell->holder, cblock);
1748         if (issue_holder)
1749                 issue(cache, cell->holder);
1750         else
1751                 accounted_begin(cache, cell->holder);
1752
1753         if (detail.any_writes) {
1754                 set_dirty(cache, oblock, cblock);
1755                 clear_discard(cache, oblock_to_dblock(cache, oblock));
1756         }
1757
1758         while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1759                 remap_to_cache(cache, bio, cblock);
1760                 issue(cache, bio);
1761         }
1762
1763         free_prison_cell(cache, cell);
1764 }
1765
1766 /*----------------------------------------------------------------*/
1767
1768 struct old_oblock_lock {
1769         struct policy_locker locker;
1770         struct cache *cache;
1771         struct prealloc *structs;
1772         struct dm_bio_prison_cell *cell;
1773 };
1774
1775 static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1776 {
1777         /* This should never be called */
1778         BUG();
1779         return 0;
1780 }
1781
1782 static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
1783 {
1784         struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
1785         struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
1786
1787         return bio_detain(l->cache, b, NULL, cell_prealloc,
1788                           (cell_free_fn) prealloc_put_cell,
1789                           l->structs, &l->cell);
1790 }
1791
1792 static void process_cell(struct cache *cache, struct prealloc *structs,
1793                          struct dm_bio_prison_cell *new_ocell)
1794 {
1795         int r;
1796         bool release_cell = true;
1797         struct bio *bio = new_ocell->holder;
1798         dm_oblock_t block = get_bio_block(cache, bio);
1799         struct policy_result lookup_result;
1800         bool passthrough = passthrough_mode(&cache->features);
1801         bool fast_promotion, can_migrate;
1802         struct old_oblock_lock ool;
1803
1804         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1805         can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1806
1807         ool.locker.fn = cell_locker;
1808         ool.cache = cache;
1809         ool.structs = structs;
1810         ool.cell = NULL;
1811         r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1812                        bio, &ool.locker, &lookup_result);
1813
1814         if (r == -EWOULDBLOCK)
1815                 /* migration has been denied */
1816                 lookup_result.op = POLICY_MISS;
1817
1818         switch (lookup_result.op) {
1819         case POLICY_HIT:
1820                 if (passthrough) {
1821                         inc_miss_counter(cache, bio);
1822
1823                         /*
1824                          * Passthrough always maps to the origin,
1825                          * invalidating any cache blocks that are written
1826                          * to.
1827                          */
1828
1829                         if (bio_data_dir(bio) == WRITE) {
1830                                 atomic_inc(&cache->stats.demotion);
1831                                 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1832                                 release_cell = false;
1833
1834                         } else {
1835                                 /* FIXME: factor out issue_origin() */
1836                                 remap_to_origin_clear_discard(cache, bio, block);
1837                                 inc_and_issue(cache, bio, new_ocell);
1838                         }
1839                 } else {
1840                         inc_hit_counter(cache, bio);
1841
1842                         if (bio_data_dir(bio) == WRITE &&
1843                             writethrough_mode(&cache->features) &&
1844                             !is_dirty(cache, lookup_result.cblock)) {
1845                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1846                                 inc_and_issue(cache, bio, new_ocell);
1847
1848                         } else {
1849                                 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1850                                 release_cell = false;
1851                         }
1852                 }
1853
1854                 break;
1855
1856         case POLICY_MISS:
1857                 inc_miss_counter(cache, bio);
1858                 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1859                 release_cell = false;
1860                 break;
1861
1862         case POLICY_NEW:
1863                 atomic_inc(&cache->stats.promotion);
1864                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1865                 release_cell = false;
1866                 break;
1867
1868         case POLICY_REPLACE:
1869                 atomic_inc(&cache->stats.demotion);
1870                 atomic_inc(&cache->stats.promotion);
1871                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1872                                     block, lookup_result.cblock,
1873                                     ool.cell, new_ocell);
1874                 release_cell = false;
1875                 break;
1876
1877         default:
1878                 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1879                             cache_device_name(cache), __func__,
1880                             (unsigned) lookup_result.op);
1881                 bio_io_error(bio);
1882         }
1883
1884         if (release_cell)
1885                 cell_defer(cache, new_ocell, false);
1886 }
1887
1888 static void process_bio(struct cache *cache, struct prealloc *structs,
1889                         struct bio *bio)
1890 {
1891         int r;
1892         dm_oblock_t block = get_bio_block(cache, bio);
1893         struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1894
1895         /*
1896          * Check to see if that block is currently migrating.
1897          */
1898         cell_prealloc = prealloc_get_cell(structs);
1899         r = bio_detain(cache, block, bio, cell_prealloc,
1900                        (cell_free_fn) prealloc_put_cell,
1901                        structs, &new_ocell);
1902         if (r > 0)
1903                 return;
1904
1905         process_cell(cache, structs, new_ocell);
1906 }
1907
1908 static int need_commit_due_to_time(struct cache *cache)
1909 {
1910         return jiffies < cache->last_commit_jiffies ||
1911                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1912 }
1913
1914 /*
1915  * A non-zero return indicates read_only or fail_io mode.
1916  */
1917 static int commit(struct cache *cache, bool clean_shutdown)
1918 {
1919         int r;
1920
1921         if (get_cache_mode(cache) >= CM_READ_ONLY)
1922                 return -EINVAL;
1923
1924         atomic_inc(&cache->stats.commit_count);
1925         r = dm_cache_commit(cache->cmd, clean_shutdown);
1926         if (r)
1927                 metadata_operation_failed(cache, "dm_cache_commit", r);
1928
1929         return r;
1930 }
1931
1932 static int commit_if_needed(struct cache *cache)
1933 {
1934         int r = 0;
1935
1936         if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1937             dm_cache_changed_this_transaction(cache->cmd)) {
1938                 r = commit(cache, false);
1939                 cache->commit_requested = false;
1940                 cache->last_commit_jiffies = jiffies;
1941         }
1942
1943         return r;
1944 }
1945
1946 static void process_deferred_bios(struct cache *cache)
1947 {
1948         bool prealloc_used = false;
1949         unsigned long flags;
1950         struct bio_list bios;
1951         struct bio *bio;
1952         struct prealloc structs;
1953
1954         memset(&structs, 0, sizeof(structs));
1955         bio_list_init(&bios);
1956
1957         spin_lock_irqsave(&cache->lock, flags);
1958         bio_list_merge(&bios, &cache->deferred_bios);
1959         bio_list_init(&cache->deferred_bios);
1960         spin_unlock_irqrestore(&cache->lock, flags);
1961
1962         while (!bio_list_empty(&bios)) {
1963                 /*
1964                  * If we've got no free migration structs, and processing
1965                  * this bio might require one, we pause until there are some
1966                  * prepared mappings to process.
1967                  */
1968                 prealloc_used = true;
1969                 if (prealloc_data_structs(cache, &structs)) {
1970                         spin_lock_irqsave(&cache->lock, flags);
1971                         bio_list_merge(&cache->deferred_bios, &bios);
1972                         spin_unlock_irqrestore(&cache->lock, flags);
1973                         break;
1974                 }
1975
1976                 bio = bio_list_pop(&bios);
1977
1978                 if (bio->bi_opf & REQ_PREFLUSH)
1979                         process_flush_bio(cache, bio);
1980                 else if (bio_op(bio) == REQ_OP_DISCARD)
1981                         process_discard_bio(cache, &structs, bio);
1982                 else
1983                         process_bio(cache, &structs, bio);
1984         }
1985
1986         if (prealloc_used)
1987                 prealloc_free_structs(cache, &structs);
1988 }
1989
1990 static void process_deferred_cells(struct cache *cache)
1991 {
1992         bool prealloc_used = false;
1993         unsigned long flags;
1994         struct dm_bio_prison_cell *cell, *tmp;
1995         struct list_head cells;
1996         struct prealloc structs;
1997
1998         memset(&structs, 0, sizeof(structs));
1999
2000         INIT_LIST_HEAD(&cells);
2001
2002         spin_lock_irqsave(&cache->lock, flags);
2003         list_splice_init(&cache->deferred_cells, &cells);
2004         spin_unlock_irqrestore(&cache->lock, flags);
2005
2006         list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2007                 /*
2008                  * If we've got no free migration structs, and processing
2009                  * this bio might require one, we pause until there are some
2010                  * prepared mappings to process.
2011                  */
2012                 prealloc_used = true;
2013                 if (prealloc_data_structs(cache, &structs)) {
2014                         spin_lock_irqsave(&cache->lock, flags);
2015                         list_splice(&cells, &cache->deferred_cells);
2016                         spin_unlock_irqrestore(&cache->lock, flags);
2017                         break;
2018                 }
2019
2020                 process_cell(cache, &structs, cell);
2021         }
2022
2023         if (prealloc_used)
2024                 prealloc_free_structs(cache, &structs);
2025 }
2026
2027 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
2028 {
2029         unsigned long flags;
2030         struct bio_list bios;
2031         struct bio *bio;
2032
2033         bio_list_init(&bios);
2034
2035         spin_lock_irqsave(&cache->lock, flags);
2036         bio_list_merge(&bios, &cache->deferred_flush_bios);
2037         bio_list_init(&cache->deferred_flush_bios);
2038         spin_unlock_irqrestore(&cache->lock, flags);
2039
2040         /*
2041          * These bios have already been through inc_ds()
2042          */
2043         while ((bio = bio_list_pop(&bios)))
2044                 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
2045 }
2046
2047 static void process_deferred_writethrough_bios(struct cache *cache)
2048 {
2049         unsigned long flags;
2050         struct bio_list bios;
2051         struct bio *bio;
2052
2053         bio_list_init(&bios);
2054
2055         spin_lock_irqsave(&cache->lock, flags);
2056         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
2057         bio_list_init(&cache->deferred_writethrough_bios);
2058         spin_unlock_irqrestore(&cache->lock, flags);
2059
2060         /*
2061          * These bios have already been through inc_ds()
2062          */
2063         while ((bio = bio_list_pop(&bios)))
2064                 accounted_request(cache, bio);
2065 }
2066
2067 static void writeback_some_dirty_blocks(struct cache *cache)
2068 {
2069         bool prealloc_used = false;
2070         dm_oblock_t oblock;
2071         dm_cblock_t cblock;
2072         struct prealloc structs;
2073         struct dm_bio_prison_cell *old_ocell;
2074         bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2075
2076         memset(&structs, 0, sizeof(structs));
2077
2078         while (spare_migration_bandwidth(cache)) {
2079                 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2080                         break; /* no work to do */
2081
2082                 prealloc_used = true;
2083                 if (prealloc_data_structs(cache, &structs) ||
2084                     get_cell(cache, oblock, &structs, &old_ocell)) {
2085                         policy_set_dirty(cache->policy, oblock);
2086                         break;
2087                 }
2088
2089                 writeback(cache, &structs, oblock, cblock, old_ocell);
2090         }
2091
2092         if (prealloc_used)
2093                 prealloc_free_structs(cache, &structs);
2094 }
2095
2096 /*----------------------------------------------------------------
2097  * Invalidations.
2098  * Dropping something from the cache *without* writing back.
2099  *--------------------------------------------------------------*/
2100
2101 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2102 {
2103         int r = 0;
2104         uint64_t begin = from_cblock(req->cblocks->begin);
2105         uint64_t end = from_cblock(req->cblocks->end);
2106
2107         while (begin != end) {
2108                 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2109                 if (!r) {
2110                         r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2111                         if (r) {
2112                                 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2113                                 break;
2114                         }
2115
2116                 } else if (r == -ENODATA) {
2117                         /* harmless, already unmapped */
2118                         r = 0;
2119
2120                 } else {
2121                         DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2122                         break;
2123                 }
2124
2125                 begin++;
2126         }
2127
2128         cache->commit_requested = true;
2129
2130         req->err = r;
2131         atomic_set(&req->complete, 1);
2132
2133         wake_up(&req->result_wait);
2134 }
2135
2136 static void process_invalidation_requests(struct cache *cache)
2137 {
2138         struct list_head list;
2139         struct invalidation_request *req, *tmp;
2140
2141         INIT_LIST_HEAD(&list);
2142         spin_lock(&cache->invalidation_lock);
2143         list_splice_init(&cache->invalidation_requests, &list);
2144         spin_unlock(&cache->invalidation_lock);
2145
2146         list_for_each_entry_safe (req, tmp, &list, list)
2147                 process_invalidation_request(cache, req);
2148 }
2149
2150 /*----------------------------------------------------------------
2151  * Main worker loop
2152  *--------------------------------------------------------------*/
2153 static bool is_quiescing(struct cache *cache)
2154 {
2155         return atomic_read(&cache->quiescing);
2156 }
2157
2158 static void ack_quiescing(struct cache *cache)
2159 {
2160         if (is_quiescing(cache)) {
2161                 atomic_inc(&cache->quiescing_ack);
2162                 wake_up(&cache->quiescing_wait);
2163         }
2164 }
2165
2166 static void wait_for_quiescing_ack(struct cache *cache)
2167 {
2168         wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2169 }
2170
2171 static void start_quiescing(struct cache *cache)
2172 {
2173         atomic_inc(&cache->quiescing);
2174         wait_for_quiescing_ack(cache);
2175 }
2176
2177 static void stop_quiescing(struct cache *cache)
2178 {
2179         atomic_set(&cache->quiescing, 0);
2180         atomic_set(&cache->quiescing_ack, 0);
2181 }
2182
2183 static void wait_for_migrations(struct cache *cache)
2184 {
2185         wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2186 }
2187
2188 static void stop_worker(struct cache *cache)
2189 {
2190         cancel_delayed_work(&cache->waker);
2191         flush_workqueue(cache->wq);
2192 }
2193
2194 static void requeue_deferred_cells(struct cache *cache)
2195 {
2196         unsigned long flags;
2197         struct list_head cells;
2198         struct dm_bio_prison_cell *cell, *tmp;
2199
2200         INIT_LIST_HEAD(&cells);
2201         spin_lock_irqsave(&cache->lock, flags);
2202         list_splice_init(&cache->deferred_cells, &cells);
2203         spin_unlock_irqrestore(&cache->lock, flags);
2204
2205         list_for_each_entry_safe(cell, tmp, &cells, user_list)
2206                 cell_requeue(cache, cell);
2207 }
2208
2209 static void requeue_deferred_bios(struct cache *cache)
2210 {
2211         struct bio *bio;
2212         struct bio_list bios;
2213
2214         bio_list_init(&bios);
2215         bio_list_merge(&bios, &cache->deferred_bios);
2216         bio_list_init(&cache->deferred_bios);
2217
2218         while ((bio = bio_list_pop(&bios))) {
2219                 bio->bi_error = DM_ENDIO_REQUEUE;
2220                 bio_endio(bio);
2221         }
2222 }
2223
2224 static int more_work(struct cache *cache)
2225 {
2226         if (is_quiescing(cache))
2227                 return !list_empty(&cache->quiesced_migrations) ||
2228                         !list_empty(&cache->completed_migrations) ||
2229                         !list_empty(&cache->need_commit_migrations);
2230         else
2231                 return !bio_list_empty(&cache->deferred_bios) ||
2232                         !list_empty(&cache->deferred_cells) ||
2233                         !bio_list_empty(&cache->deferred_flush_bios) ||
2234                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
2235                         !list_empty(&cache->quiesced_migrations) ||
2236                         !list_empty(&cache->completed_migrations) ||
2237                         !list_empty(&cache->need_commit_migrations) ||
2238                         cache->invalidate;
2239 }
2240
2241 static void do_worker(struct work_struct *ws)
2242 {
2243         struct cache *cache = container_of(ws, struct cache, worker);
2244
2245         do {
2246                 if (!is_quiescing(cache)) {
2247                         writeback_some_dirty_blocks(cache);
2248                         process_deferred_writethrough_bios(cache);
2249                         process_deferred_bios(cache);
2250                         process_deferred_cells(cache);
2251                         process_invalidation_requests(cache);
2252                 }
2253
2254                 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2255                 process_migrations(cache, &cache->completed_migrations, complete_migration);
2256
2257                 if (commit_if_needed(cache)) {
2258                         process_deferred_flush_bios(cache, false);
2259                         process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2260                 } else {
2261                         process_deferred_flush_bios(cache, true);
2262                         process_migrations(cache, &cache->need_commit_migrations,
2263                                            migration_success_post_commit);
2264                 }
2265
2266                 ack_quiescing(cache);
2267
2268         } while (more_work(cache));
2269 }
2270
2271 /*
2272  * We want to commit periodically so that not too much
2273  * unwritten metadata builds up.
2274  */
2275 static void do_waker(struct work_struct *ws)
2276 {
2277         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2278         policy_tick(cache->policy, true);
2279         wake_worker(cache);
2280         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2281 }
2282
2283 /*----------------------------------------------------------------*/
2284
2285 static int is_congested(struct dm_dev *dev, int bdi_bits)
2286 {
2287         struct request_queue *q = bdev_get_queue(dev->bdev);
2288         return bdi_congested(q->backing_dev_info, bdi_bits);
2289 }
2290
2291 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2292 {
2293         struct cache *cache = container_of(cb, struct cache, callbacks);
2294
2295         return is_congested(cache->origin_dev, bdi_bits) ||
2296                 is_congested(cache->cache_dev, bdi_bits);
2297 }
2298
2299 /*----------------------------------------------------------------
2300  * Target methods
2301  *--------------------------------------------------------------*/
2302
2303 /*
2304  * This function gets called on the error paths of the constructor, so we
2305  * have to cope with a partially initialised struct.
2306  */
2307 static void destroy(struct cache *cache)
2308 {
2309         unsigned i;
2310
2311         mempool_destroy(cache->migration_pool);
2312
2313         if (cache->all_io_ds)
2314                 dm_deferred_set_destroy(cache->all_io_ds);
2315
2316         if (cache->prison)
2317                 dm_bio_prison_destroy(cache->prison);
2318
2319         if (cache->wq)
2320                 destroy_workqueue(cache->wq);
2321
2322         if (cache->dirty_bitset)
2323                 free_bitset(cache->dirty_bitset);
2324
2325         if (cache->discard_bitset)
2326                 free_bitset(cache->discard_bitset);
2327
2328         if (cache->copier)
2329                 dm_kcopyd_client_destroy(cache->copier);
2330
2331         if (cache->cmd)
2332                 dm_cache_metadata_close(cache->cmd);
2333
2334         if (cache->metadata_dev)
2335                 dm_put_device(cache->ti, cache->metadata_dev);
2336
2337         if (cache->origin_dev)
2338                 dm_put_device(cache->ti, cache->origin_dev);
2339
2340         if (cache->cache_dev)
2341                 dm_put_device(cache->ti, cache->cache_dev);
2342
2343         if (cache->policy)
2344                 dm_cache_policy_destroy(cache->policy);
2345
2346         for (i = 0; i < cache->nr_ctr_args ; i++)
2347                 kfree(cache->ctr_args[i]);
2348         kfree(cache->ctr_args);
2349
2350         kfree(cache);
2351 }
2352
2353 static void cache_dtr(struct dm_target *ti)
2354 {
2355         struct cache *cache = ti->private;
2356
2357         destroy(cache);
2358 }
2359
2360 static sector_t get_dev_size(struct dm_dev *dev)
2361 {
2362         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2363 }
2364
2365 /*----------------------------------------------------------------*/
2366
2367 /*
2368  * Construct a cache device mapping.
2369  *
2370  * cache <metadata dev> <cache dev> <origin dev> <block size>
2371  *       <#feature args> [<feature arg>]*
2372  *       <policy> <#policy args> [<policy arg>]*
2373  *
2374  * metadata dev    : fast device holding the persistent metadata
2375  * cache dev       : fast device holding cached data blocks
2376  * origin dev      : slow device holding original data blocks
2377  * block size      : cache unit size in sectors
2378  *
2379  * #feature args   : number of feature arguments passed
2380  * feature args    : writethrough.  (The default is writeback.)
2381  *
2382  * policy          : the replacement policy to use
2383  * #policy args    : an even number of policy arguments corresponding
2384  *                   to key/value pairs passed to the policy
2385  * policy args     : key/value pairs passed to the policy
2386  *                   E.g. 'sequential_threshold 1024'
2387  *                   See cache-policies.txt for details.
2388  *
2389  * Optional feature arguments are:
2390  *   writethrough  : write through caching that prohibits cache block
2391  *                   content from being different from origin block content.
2392  *                   Without this argument, the default behaviour is to write
2393  *                   back cache block contents later for performance reasons,
2394  *                   so they may differ from the corresponding origin blocks.
2395  */
2396 struct cache_args {
2397         struct dm_target *ti;
2398
2399         struct dm_dev *metadata_dev;
2400
2401         struct dm_dev *cache_dev;
2402         sector_t cache_sectors;
2403
2404         struct dm_dev *origin_dev;
2405         sector_t origin_sectors;
2406
2407         uint32_t block_size;
2408
2409         const char *policy_name;
2410         int policy_argc;
2411         const char **policy_argv;
2412
2413         struct cache_features features;
2414 };
2415
2416 static void destroy_cache_args(struct cache_args *ca)
2417 {
2418         if (ca->metadata_dev)
2419                 dm_put_device(ca->ti, ca->metadata_dev);
2420
2421         if (ca->cache_dev)
2422                 dm_put_device(ca->ti, ca->cache_dev);
2423
2424         if (ca->origin_dev)
2425                 dm_put_device(ca->ti, ca->origin_dev);
2426
2427         kfree(ca);
2428 }
2429
2430 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2431 {
2432         if (!as->argc) {
2433                 *error = "Insufficient args";
2434                 return false;
2435         }
2436
2437         return true;
2438 }
2439
2440 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2441                               char **error)
2442 {
2443         int r;
2444         sector_t metadata_dev_size;
2445         char b[BDEVNAME_SIZE];
2446
2447         if (!at_least_one_arg(as, error))
2448                 return -EINVAL;
2449
2450         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2451                           &ca->metadata_dev);
2452         if (r) {
2453                 *error = "Error opening metadata device";
2454                 return r;
2455         }
2456
2457         metadata_dev_size = get_dev_size(ca->metadata_dev);
2458         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2459                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2460                        bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2461
2462         return 0;
2463 }
2464
2465 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2466                            char **error)
2467 {
2468         int r;
2469
2470         if (!at_least_one_arg(as, error))
2471                 return -EINVAL;
2472
2473         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2474                           &ca->cache_dev);
2475         if (r) {
2476                 *error = "Error opening cache device";
2477                 return r;
2478         }
2479         ca->cache_sectors = get_dev_size(ca->cache_dev);
2480
2481         return 0;
2482 }
2483
2484 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2485                             char **error)
2486 {
2487         int r;
2488
2489         if (!at_least_one_arg(as, error))
2490                 return -EINVAL;
2491
2492         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2493                           &ca->origin_dev);
2494         if (r) {
2495                 *error = "Error opening origin device";
2496                 return r;
2497         }
2498
2499         ca->origin_sectors = get_dev_size(ca->origin_dev);
2500         if (ca->ti->len > ca->origin_sectors) {
2501                 *error = "Device size larger than cached device";
2502                 return -EINVAL;
2503         }
2504
2505         return 0;
2506 }
2507
2508 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2509                             char **error)
2510 {
2511         unsigned long block_size;
2512
2513         if (!at_least_one_arg(as, error))
2514                 return -EINVAL;
2515
2516         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2517             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2518             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2519             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2520                 *error = "Invalid data block size";
2521                 return -EINVAL;
2522         }
2523
2524         if (block_size > ca->cache_sectors) {
2525                 *error = "Data block size is larger than the cache device";
2526                 return -EINVAL;
2527         }
2528
2529         ca->block_size = block_size;
2530
2531         return 0;
2532 }
2533
2534 static void init_features(struct cache_features *cf)
2535 {
2536         cf->mode = CM_WRITE;
2537         cf->io_mode = CM_IO_WRITEBACK;
2538         cf->metadata_version = 1;
2539 }
2540
2541 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2542                           char **error)
2543 {
2544         static struct dm_arg _args[] = {
2545                 {0, 2, "Invalid number of cache feature arguments"},
2546         };
2547
2548         int r;
2549         unsigned argc;
2550         const char *arg;
2551         struct cache_features *cf = &ca->features;
2552
2553         init_features(cf);
2554
2555         r = dm_read_arg_group(_args, as, &argc, error);
2556         if (r)
2557                 return -EINVAL;
2558
2559         while (argc--) {
2560                 arg = dm_shift_arg(as);
2561
2562                 if (!strcasecmp(arg, "writeback"))
2563                         cf->io_mode = CM_IO_WRITEBACK;
2564
2565                 else if (!strcasecmp(arg, "writethrough"))
2566                         cf->io_mode = CM_IO_WRITETHROUGH;
2567
2568                 else if (!strcasecmp(arg, "passthrough"))
2569                         cf->io_mode = CM_IO_PASSTHROUGH;
2570
2571                 else if (!strcasecmp(arg, "metadata2"))
2572                         cf->metadata_version = 2;
2573
2574                 else {
2575                         *error = "Unrecognised cache feature requested";
2576                         return -EINVAL;
2577                 }
2578         }
2579
2580         return 0;
2581 }
2582
2583 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2584                         char **error)
2585 {
2586         static struct dm_arg _args[] = {
2587                 {0, 1024, "Invalid number of policy arguments"},
2588         };
2589
2590         int r;
2591
2592         if (!at_least_one_arg(as, error))
2593                 return -EINVAL;
2594
2595         ca->policy_name = dm_shift_arg(as);
2596
2597         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2598         if (r)
2599                 return -EINVAL;
2600
2601         ca->policy_argv = (const char **)as->argv;
2602         dm_consume_args(as, ca->policy_argc);
2603
2604         return 0;
2605 }
2606
2607 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2608                             char **error)
2609 {
2610         int r;
2611         struct dm_arg_set as;
2612
2613         as.argc = argc;
2614         as.argv = argv;
2615
2616         r = parse_metadata_dev(ca, &as, error);
2617         if (r)
2618                 return r;
2619
2620         r = parse_cache_dev(ca, &as, error);
2621         if (r)
2622                 return r;
2623
2624         r = parse_origin_dev(ca, &as, error);
2625         if (r)
2626                 return r;
2627
2628         r = parse_block_size(ca, &as, error);
2629         if (r)
2630                 return r;
2631
2632         r = parse_features(ca, &as, error);
2633         if (r)
2634                 return r;
2635
2636         r = parse_policy(ca, &as, error);
2637         if (r)
2638                 return r;
2639
2640         return 0;
2641 }
2642
2643 /*----------------------------------------------------------------*/
2644
2645 static struct kmem_cache *migration_cache;
2646
2647 #define NOT_CORE_OPTION 1
2648
2649 static int process_config_option(struct cache *cache, const char *key, const char *value)
2650 {
2651         unsigned long tmp;
2652
2653         if (!strcasecmp(key, "migration_threshold")) {
2654                 if (kstrtoul(value, 10, &tmp))
2655                         return -EINVAL;
2656
2657                 cache->migration_threshold = tmp;
2658                 return 0;
2659         }
2660
2661         return NOT_CORE_OPTION;
2662 }
2663
2664 static int set_config_value(struct cache *cache, const char *key, const char *value)
2665 {
2666         int r = process_config_option(cache, key, value);
2667
2668         if (r == NOT_CORE_OPTION)
2669                 r = policy_set_config_value(cache->policy, key, value);
2670
2671         if (r)
2672                 DMWARN("bad config value for %s: %s", key, value);
2673
2674         return r;
2675 }
2676
2677 static int set_config_values(struct cache *cache, int argc, const char **argv)
2678 {
2679         int r = 0;
2680
2681         if (argc & 1) {
2682                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2683                 return -EINVAL;
2684         }
2685
2686         while (argc) {
2687                 r = set_config_value(cache, argv[0], argv[1]);
2688                 if (r)
2689                         break;
2690
2691                 argc -= 2;
2692                 argv += 2;
2693         }
2694
2695         return r;
2696 }
2697
2698 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2699                                char **error)
2700 {
2701         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2702                                                            cache->cache_size,
2703                                                            cache->origin_sectors,
2704                                                            cache->sectors_per_block);
2705         if (IS_ERR(p)) {
2706                 *error = "Error creating cache's policy";
2707                 return PTR_ERR(p);
2708         }
2709         cache->policy = p;
2710
2711         return 0;
2712 }
2713
2714 /*
2715  * We want the discard block size to be at least the size of the cache
2716  * block size and have no more than 2^14 discard blocks across the origin.
2717  */
2718 #define MAX_DISCARD_BLOCKS (1 << 14)
2719
2720 static bool too_many_discard_blocks(sector_t discard_block_size,
2721                                     sector_t origin_size)
2722 {
2723         (void) sector_div(origin_size, discard_block_size);
2724
2725         return origin_size > MAX_DISCARD_BLOCKS;
2726 }
2727
2728 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2729                                              sector_t origin_size)
2730 {
2731         sector_t discard_block_size = cache_block_size;
2732
2733         if (origin_size)
2734                 while (too_many_discard_blocks(discard_block_size, origin_size))
2735                         discard_block_size *= 2;
2736
2737         return discard_block_size;
2738 }
2739
2740 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2741 {
2742         dm_block_t nr_blocks = from_cblock(size);
2743
2744         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2745                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2746                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2747                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2748                              (unsigned long long) nr_blocks);
2749
2750         cache->cache_size = size;
2751 }
2752
2753 #define DEFAULT_MIGRATION_THRESHOLD 2048
2754
2755 static int cache_create(struct cache_args *ca, struct cache **result)
2756 {
2757         int r = 0;
2758         char **error = &ca->ti->error;
2759         struct cache *cache;
2760         struct dm_target *ti = ca->ti;
2761         dm_block_t origin_blocks;
2762         struct dm_cache_metadata *cmd;
2763         bool may_format = ca->features.mode == CM_WRITE;
2764
2765         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2766         if (!cache)
2767                 return -ENOMEM;
2768
2769         cache->ti = ca->ti;
2770         ti->private = cache;
2771         ti->num_flush_bios = 2;
2772         ti->flush_supported = true;
2773
2774         ti->num_discard_bios = 1;
2775         ti->discards_supported = true;
2776         ti->split_discard_bios = false;
2777
2778         cache->features = ca->features;
2779         ti->per_io_data_size = get_per_bio_data_size(cache);
2780
2781         cache->callbacks.congested_fn = cache_is_congested;
2782         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2783
2784         cache->metadata_dev = ca->metadata_dev;
2785         cache->origin_dev = ca->origin_dev;
2786         cache->cache_dev = ca->cache_dev;
2787
2788         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2789
2790         /* FIXME: factor out this whole section */
2791         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2792         origin_blocks = block_div(origin_blocks, ca->block_size);
2793         cache->origin_blocks = to_oblock(origin_blocks);
2794
2795         cache->sectors_per_block = ca->block_size;
2796         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2797                 r = -EINVAL;
2798                 goto bad;
2799         }
2800
2801         if (ca->block_size & (ca->block_size - 1)) {
2802                 dm_block_t cache_size = ca->cache_sectors;
2803
2804                 cache->sectors_per_block_shift = -1;
2805                 cache_size = block_div(cache_size, ca->block_size);
2806                 set_cache_size(cache, to_cblock(cache_size));
2807         } else {
2808                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2809                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2810         }
2811
2812         r = create_cache_policy(cache, ca, error);
2813         if (r)
2814                 goto bad;
2815
2816         cache->policy_nr_args = ca->policy_argc;
2817         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2818
2819         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2820         if (r) {
2821                 *error = "Error setting cache policy's config values";
2822                 goto bad;
2823         }
2824
2825         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2826                                      ca->block_size, may_format,
2827                                      dm_cache_policy_get_hint_size(cache->policy),
2828                                      ca->features.metadata_version);
2829         if (IS_ERR(cmd)) {
2830                 *error = "Error creating metadata object";
2831                 r = PTR_ERR(cmd);
2832                 goto bad;
2833         }
2834         cache->cmd = cmd;
2835         set_cache_mode(cache, CM_WRITE);
2836         if (get_cache_mode(cache) != CM_WRITE) {
2837                 *error = "Unable to get write access to metadata, please check/repair metadata.";
2838                 r = -EINVAL;
2839                 goto bad;
2840         }
2841
2842         if (passthrough_mode(&cache->features)) {
2843                 bool all_clean;
2844
2845                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2846                 if (r) {
2847                         *error = "dm_cache_metadata_all_clean() failed";
2848                         goto bad;
2849                 }
2850
2851                 if (!all_clean) {
2852                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2853                         r = -EINVAL;
2854                         goto bad;
2855                 }
2856         }
2857
2858         spin_lock_init(&cache->lock);
2859         INIT_LIST_HEAD(&cache->deferred_cells);
2860         bio_list_init(&cache->deferred_bios);
2861         bio_list_init(&cache->deferred_flush_bios);
2862         bio_list_init(&cache->deferred_writethrough_bios);
2863         INIT_LIST_HEAD(&cache->quiesced_migrations);
2864         INIT_LIST_HEAD(&cache->completed_migrations);
2865         INIT_LIST_HEAD(&cache->need_commit_migrations);
2866         atomic_set(&cache->nr_allocated_migrations, 0);
2867         atomic_set(&cache->nr_io_migrations, 0);
2868         init_waitqueue_head(&cache->migration_wait);
2869
2870         init_waitqueue_head(&cache->quiescing_wait);
2871         atomic_set(&cache->quiescing, 0);
2872         atomic_set(&cache->quiescing_ack, 0);
2873
2874         r = -ENOMEM;
2875         atomic_set(&cache->nr_dirty, 0);
2876         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2877         if (!cache->dirty_bitset) {
2878                 *error = "could not allocate dirty bitset";
2879                 goto bad;
2880         }
2881         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2882
2883         cache->discard_block_size =
2884                 calculate_discard_block_size(cache->sectors_per_block,
2885                                              cache->origin_sectors);
2886         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2887                                                               cache->discard_block_size));
2888         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2889         if (!cache->discard_bitset) {
2890                 *error = "could not allocate discard bitset";
2891                 goto bad;
2892         }
2893         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2894
2895         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2896         if (IS_ERR(cache->copier)) {
2897                 *error = "could not create kcopyd client";
2898                 r = PTR_ERR(cache->copier);
2899                 goto bad;
2900         }
2901
2902         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2903         if (!cache->wq) {
2904                 *error = "could not create workqueue for metadata object";
2905                 goto bad;
2906         }
2907         INIT_WORK(&cache->worker, do_worker);
2908         INIT_DELAYED_WORK(&cache->waker, do_waker);
2909         cache->last_commit_jiffies = jiffies;
2910
2911         cache->prison = dm_bio_prison_create();
2912         if (!cache->prison) {
2913                 *error = "could not create bio prison";
2914                 goto bad;
2915         }
2916
2917         cache->all_io_ds = dm_deferred_set_create();
2918         if (!cache->all_io_ds) {
2919                 *error = "could not create all_io deferred set";
2920                 goto bad;
2921         }
2922
2923         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2924                                                          migration_cache);
2925         if (!cache->migration_pool) {
2926                 *error = "Error creating cache's migration mempool";
2927                 goto bad;
2928         }
2929
2930         cache->need_tick_bio = true;
2931         cache->sized = false;
2932         cache->invalidate = false;
2933         cache->commit_requested = false;
2934         cache->loaded_mappings = false;
2935         cache->loaded_discards = false;
2936
2937         load_stats(cache);
2938
2939         atomic_set(&cache->stats.demotion, 0);
2940         atomic_set(&cache->stats.promotion, 0);
2941         atomic_set(&cache->stats.copies_avoided, 0);
2942         atomic_set(&cache->stats.cache_cell_clash, 0);
2943         atomic_set(&cache->stats.commit_count, 0);
2944         atomic_set(&cache->stats.discard_count, 0);
2945
2946         spin_lock_init(&cache->invalidation_lock);
2947         INIT_LIST_HEAD(&cache->invalidation_requests);
2948
2949         iot_init(&cache->origin_tracker);
2950
2951         *result = cache;
2952         return 0;
2953
2954 bad:
2955         destroy(cache);
2956         return r;
2957 }
2958
2959 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2960 {
2961         unsigned i;
2962         const char **copy;
2963
2964         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2965         if (!copy)
2966                 return -ENOMEM;
2967         for (i = 0; i < argc; i++) {
2968                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2969                 if (!copy[i]) {
2970                         while (i--)
2971                                 kfree(copy[i]);
2972                         kfree(copy);
2973                         return -ENOMEM;
2974                 }
2975         }
2976
2977         cache->nr_ctr_args = argc;
2978         cache->ctr_args = copy;
2979
2980         return 0;
2981 }
2982
2983 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2984 {
2985         int r = -EINVAL;
2986         struct cache_args *ca;
2987         struct cache *cache = NULL;
2988
2989         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2990         if (!ca) {
2991                 ti->error = "Error allocating memory for cache";
2992                 return -ENOMEM;
2993         }
2994         ca->ti = ti;
2995
2996         r = parse_cache_args(ca, argc, argv, &ti->error);
2997         if (r)
2998                 goto out;
2999
3000         r = cache_create(ca, &cache);
3001         if (r)
3002                 goto out;
3003
3004         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
3005         if (r) {
3006                 destroy(cache);
3007                 goto out;
3008         }
3009
3010         ti->private = cache;
3011
3012 out:
3013         destroy_cache_args(ca);
3014         return r;
3015 }
3016
3017 /*----------------------------------------------------------------*/
3018
3019 static int cache_map(struct dm_target *ti, struct bio *bio)
3020 {
3021         struct cache *cache = ti->private;
3022
3023         int r;
3024         struct dm_bio_prison_cell *cell = NULL;
3025         dm_oblock_t block = get_bio_block(cache, bio);
3026         size_t pb_data_size = get_per_bio_data_size(cache);
3027         bool can_migrate = false;
3028         bool fast_promotion;
3029         struct policy_result lookup_result;
3030         struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3031         struct old_oblock_lock ool;
3032
3033         ool.locker.fn = null_locker;
3034
3035         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3036                 /*
3037                  * This can only occur if the io goes to a partial block at
3038                  * the end of the origin device.  We don't cache these.
3039                  * Just remap to the origin and carry on.
3040                  */
3041                 remap_to_origin(cache, bio);
3042                 accounted_begin(cache, bio);
3043                 return DM_MAPIO_REMAPPED;
3044         }
3045
3046         if (discard_or_flush(bio)) {
3047                 defer_bio(cache, bio);
3048                 return DM_MAPIO_SUBMITTED;
3049         }
3050
3051         /*
3052          * Check to see if that block is currently migrating.
3053          */
3054         cell = alloc_prison_cell(cache);
3055         if (!cell) {
3056                 defer_bio(cache, bio);
3057                 return DM_MAPIO_SUBMITTED;
3058         }
3059
3060         r = bio_detain(cache, block, bio, cell,
3061                        (cell_free_fn) free_prison_cell,
3062                        cache, &cell);
3063         if (r) {
3064                 if (r < 0)
3065                         defer_bio(cache, bio);
3066
3067                 return DM_MAPIO_SUBMITTED;
3068         }
3069
3070         fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3071
3072         r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3073                        bio, &ool.locker, &lookup_result);
3074         if (r == -EWOULDBLOCK) {
3075                 cell_defer(cache, cell, true);
3076                 return DM_MAPIO_SUBMITTED;
3077
3078         } else if (r) {
3079                 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3080                             cache_device_name(cache), r);
3081                 cell_defer(cache, cell, false);
3082                 bio_io_error(bio);
3083                 return DM_MAPIO_SUBMITTED;
3084         }
3085
3086         r = DM_MAPIO_REMAPPED;
3087         switch (lookup_result.op) {
3088         case POLICY_HIT:
3089                 if (passthrough_mode(&cache->features)) {
3090                         if (bio_data_dir(bio) == WRITE) {
3091                                 /*
3092                                  * We need to invalidate this block, so
3093                                  * defer for the worker thread.
3094                                  */
3095                                 cell_defer(cache, cell, true);
3096                                 r = DM_MAPIO_SUBMITTED;
3097
3098                         } else {
3099                                 inc_miss_counter(cache, bio);
3100                                 remap_to_origin_clear_discard(cache, bio, block);
3101                                 accounted_begin(cache, bio);
3102                                 inc_ds(cache, bio, cell);
3103                                 // FIXME: we want to remap hits or misses straight
3104                                 // away rather than passing over to the worker.
3105                                 cell_defer(cache, cell, false);
3106                         }
3107
3108                 } else {
3109                         inc_hit_counter(cache, bio);
3110                         if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3111                             !is_dirty(cache, lookup_result.cblock)) {
3112                                 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3113                                 accounted_begin(cache, bio);
3114                                 inc_ds(cache, bio, cell);
3115                                 cell_defer(cache, cell, false);
3116
3117                         } else
3118                                 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3119                 }
3120                 break;
3121
3122         case POLICY_MISS:
3123                 inc_miss_counter(cache, bio);
3124                 if (pb->req_nr != 0) {
3125                         /*
3126                          * This is a duplicate writethrough io that is no
3127                          * longer needed because the block has been demoted.
3128                          */
3129                         bio_endio(bio);
3130                         // FIXME: remap everything as a miss
3131                         cell_defer(cache, cell, false);
3132                         r = DM_MAPIO_SUBMITTED;
3133
3134                 } else
3135                         remap_cell_to_origin_clear_discard(cache, cell, block, false);
3136                 break;
3137
3138         default:
3139                 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3140                             cache_device_name(cache), __func__,
3141                             (unsigned) lookup_result.op);
3142                 cell_defer(cache, cell, false);
3143                 bio_io_error(bio);
3144                 r = DM_MAPIO_SUBMITTED;
3145         }
3146
3147         return r;
3148 }
3149
3150 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3151 {
3152         struct cache *cache = ti->private;
3153         unsigned long flags;
3154         size_t pb_data_size = get_per_bio_data_size(cache);
3155         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
3156
3157         if (pb->tick) {
3158                 policy_tick(cache->policy, false);
3159
3160                 spin_lock_irqsave(&cache->lock, flags);
3161                 cache->need_tick_bio = true;
3162                 spin_unlock_irqrestore(&cache->lock, flags);
3163         }
3164
3165         check_for_quiesced_migrations(cache, pb);
3166         accounted_complete(cache, bio);
3167
3168         return 0;
3169 }
3170
3171 static int write_dirty_bitset(struct cache *cache)
3172 {
3173         int r;
3174
3175         if (get_cache_mode(cache) >= CM_READ_ONLY)
3176                 return -EINVAL;
3177
3178         r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
3179         if (r)
3180                 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
3181
3182         return r;
3183 }
3184
3185 static int write_discard_bitset(struct cache *cache)
3186 {
3187         unsigned i, r;
3188
3189         if (get_cache_mode(cache) >= CM_READ_ONLY)
3190                 return -EINVAL;
3191
3192         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
3193                                            cache->discard_nr_blocks);
3194         if (r) {
3195                 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
3196                 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
3197                 return r;
3198         }
3199
3200         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
3201                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
3202                                          is_discarded(cache, to_dblock(i)));
3203                 if (r) {
3204                         metadata_operation_failed(cache, "dm_cache_set_discard", r);
3205                         return r;
3206                 }
3207         }
3208
3209         return 0;
3210 }
3211
3212 static int write_hints(struct cache *cache)
3213 {
3214         int r;
3215
3216         if (get_cache_mode(cache) >= CM_READ_ONLY)
3217                 return -EINVAL;
3218
3219         r = dm_cache_write_hints(cache->cmd, cache->policy);
3220         if (r) {
3221                 metadata_operation_failed(cache, "dm_cache_write_hints", r);
3222                 return r;
3223         }
3224
3225         return 0;
3226 }
3227
3228 /*
3229  * returns true on success
3230  */
3231 static bool sync_metadata(struct cache *cache)
3232 {
3233         int r1, r2, r3, r4;
3234
3235         r1 = write_dirty_bitset(cache);
3236         if (r1)
3237                 DMERR("%s: could not write dirty bitset", cache_device_name(cache));
3238
3239         r2 = write_discard_bitset(cache);
3240         if (r2)
3241                 DMERR("%s: could not write discard bitset", cache_device_name(cache));
3242
3243         save_stats(cache);
3244
3245         r3 = write_hints(cache);
3246         if (r3)
3247                 DMERR("%s: could not write hints", cache_device_name(cache));
3248
3249         /*
3250          * If writing the above metadata failed, we still commit, but don't
3251          * set the clean shutdown flag.  This will effectively force every
3252          * dirty bit to be set on reload.
3253          */
3254         r4 = commit(cache, !r1 && !r2 && !r3);
3255         if (r4)
3256                 DMERR("%s: could not write cache metadata", cache_device_name(cache));
3257
3258         return !r1 && !r2 && !r3 && !r4;
3259 }
3260
3261 static void cache_postsuspend(struct dm_target *ti)
3262 {
3263         struct cache *cache = ti->private;
3264
3265         start_quiescing(cache);
3266         wait_for_migrations(cache);
3267         stop_worker(cache);
3268         requeue_deferred_bios(cache);
3269         requeue_deferred_cells(cache);
3270         stop_quiescing(cache);
3271
3272         if (get_cache_mode(cache) == CM_WRITE)
3273                 (void) sync_metadata(cache);
3274 }
3275
3276 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3277                         bool dirty, uint32_t hint, bool hint_valid)
3278 {
3279         int r;
3280         struct cache *cache = context;
3281
3282         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
3283         if (r)
3284                 return r;
3285
3286         if (dirty)
3287                 set_dirty(cache, oblock, cblock);
3288         else
3289                 clear_dirty(cache, oblock, cblock);
3290
3291         return 0;
3292 }
3293
3294 /*
3295  * The discard block size in the on disk metadata is not
3296  * neccessarily the same as we're currently using.  So we have to
3297  * be careful to only set the discarded attribute if we know it
3298  * covers a complete block of the new size.
3299  */
3300 struct discard_load_info {
3301         struct cache *cache;
3302
3303         /*
3304          * These blocks are sized using the on disk dblock size, rather
3305          * than the current one.
3306          */
3307         dm_block_t block_size;
3308         dm_block_t discard_begin, discard_end;
3309 };
3310
3311 static void discard_load_info_init(struct cache *cache,
3312                                    struct discard_load_info *li)
3313 {
3314         li->cache = cache;
3315         li->discard_begin = li->discard_end = 0;
3316 }
3317
3318 static void set_discard_range(struct discard_load_info *li)
3319 {
3320         sector_t b, e;
3321
3322         if (li->discard_begin == li->discard_end)
3323                 return;
3324
3325         /*
3326          * Convert to sectors.
3327          */
3328         b = li->discard_begin * li->block_size;
3329         e = li->discard_end * li->block_size;
3330
3331         /*
3332          * Then convert back to the current dblock size.
3333          */
3334         b = dm_sector_div_up(b, li->cache->discard_block_size);
3335         sector_div(e, li->cache->discard_block_size);
3336
3337         /*
3338          * The origin may have shrunk, so we need to check we're still in
3339          * bounds.
3340          */
3341         if (e > from_dblock(li->cache->discard_nr_blocks))
3342                 e = from_dblock(li->cache->discard_nr_blocks);
3343
3344         for (; b < e; b++)
3345                 set_discard(li->cache, to_dblock(b));
3346 }
3347
3348 static int load_discard(void *context, sector_t discard_block_size,
3349                         dm_dblock_t dblock, bool discard)
3350 {
3351         struct discard_load_info *li = context;
3352
3353         li->block_size = discard_block_size;
3354
3355         if (discard) {
3356                 if (from_dblock(dblock) == li->discard_end)
3357                         /*
3358                          * We're already in a discard range, just extend it.
3359                          */
3360                         li->discard_end = li->discard_end + 1ULL;
3361
3362                 else {
3363                         /*
3364                          * Emit the old range and start a new one.
3365                          */
3366                         set_discard_range(li);
3367                         li->discard_begin = from_dblock(dblock);
3368                         li->discard_end = li->discard_begin + 1ULL;
3369                 }
3370         } else {
3371                 set_discard_range(li);
3372                 li->discard_begin = li->discard_end = 0;
3373         }
3374
3375         return 0;
3376 }
3377
3378 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3379 {
3380         sector_t size = get_dev_size(cache->cache_dev);
3381         (void) sector_div(size, cache->sectors_per_block);
3382         return to_cblock(size);
3383 }
3384
3385 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3386 {
3387         if (from_cblock(new_size) > from_cblock(cache->cache_size))
3388                 return true;
3389
3390         /*
3391          * We can't drop a dirty block when shrinking the cache.
3392          */
3393         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3394                 new_size = to_cblock(from_cblock(new_size) + 1);
3395                 if (is_dirty(cache, new_size)) {
3396                         DMERR("%s: unable to shrink cache; cache block %llu is dirty",
3397                               cache_device_name(cache),
3398                               (unsigned long long) from_cblock(new_size));
3399                         return false;
3400                 }
3401         }
3402
3403         return true;
3404 }
3405
3406 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3407 {
3408         int r;
3409
3410         r = dm_cache_resize(cache->cmd, new_size);
3411         if (r) {
3412                 DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3413                 metadata_operation_failed(cache, "dm_cache_resize", r);
3414                 return r;
3415         }
3416
3417         set_cache_size(cache, new_size);
3418
3419         return 0;
3420 }
3421
3422 static int cache_preresume(struct dm_target *ti)
3423 {
3424         int r = 0;
3425         struct cache *cache = ti->private;
3426         dm_cblock_t csize = get_cache_dev_size(cache);
3427
3428         /*
3429          * Check to see if the cache has resized.
3430          */
3431         if (!cache->sized) {
3432                 r = resize_cache_dev(cache, csize);
3433                 if (r)
3434                         return r;
3435
3436                 cache->sized = true;
3437
3438         } else if (csize != cache->cache_size) {
3439                 if (!can_resize(cache, csize))
3440                         return -EINVAL;
3441
3442                 r = resize_cache_dev(cache, csize);
3443                 if (r)
3444                         return r;
3445         }
3446
3447         if (!cache->loaded_mappings) {
3448                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3449                                            load_mapping, cache);
3450                 if (r) {
3451                         DMERR("%s: could not load cache mappings", cache_device_name(cache));
3452                         metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3453                         return r;
3454                 }
3455
3456                 cache->loaded_mappings = true;
3457         }
3458
3459         if (!cache->loaded_discards) {
3460                 struct discard_load_info li;
3461
3462                 /*
3463                  * The discard bitset could have been resized, or the
3464                  * discard block size changed.  To be safe we start by
3465                  * setting every dblock to not discarded.
3466                  */
3467                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3468
3469                 discard_load_info_init(cache, &li);
3470                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3471                 if (r) {
3472                         DMERR("%s: could not load origin discards", cache_device_name(cache));
3473                         metadata_operation_failed(cache, "dm_cache_load_discards", r);
3474                         return r;
3475                 }
3476                 set_discard_range(&li);
3477
3478                 cache->loaded_discards = true;
3479         }
3480
3481         return r;
3482 }
3483
3484 static void cache_resume(struct dm_target *ti)
3485 {
3486         struct cache *cache = ti->private;
3487
3488         cache->need_tick_bio = true;
3489         do_waker(&cache->waker.work);
3490 }
3491
3492 /*
3493  * Status format:
3494  *
3495  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3496  * <cache block size> <#used cache blocks>/<#total cache blocks>
3497  * <#read hits> <#read misses> <#write hits> <#write misses>
3498  * <#demotions> <#promotions> <#dirty>
3499  * <#features> <features>*
3500  * <#core args> <core args>
3501  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3502  */
3503 static void cache_status(struct dm_target *ti, status_type_t type,
3504                          unsigned status_flags, char *result, unsigned maxlen)
3505 {
3506         int r = 0;
3507         unsigned i;
3508         ssize_t sz = 0;
3509         dm_block_t nr_free_blocks_metadata = 0;
3510         dm_block_t nr_blocks_metadata = 0;
3511         char buf[BDEVNAME_SIZE];
3512         struct cache *cache = ti->private;
3513         dm_cblock_t residency;
3514         bool needs_check;
3515
3516         switch (type) {
3517         case STATUSTYPE_INFO:
3518                 if (get_cache_mode(cache) == CM_FAIL) {
3519                         DMEMIT("Fail");
3520                         break;
3521                 }
3522
3523                 /* Commit to ensure statistics aren't out-of-date */
3524                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3525                         (void) commit(cache, false);
3526
3527                 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3528                 if (r) {
3529                         DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3530                               cache_device_name(cache), r);
3531                         goto err;
3532                 }
3533
3534                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3535                 if (r) {
3536                         DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3537                               cache_device_name(cache), r);
3538                         goto err;
3539                 }
3540
3541                 residency = policy_residency(cache->policy);
3542
3543                 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3544                        (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3545                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3546                        (unsigned long long)nr_blocks_metadata,
3547                        (unsigned long long)cache->sectors_per_block,
3548                        (unsigned long long) from_cblock(residency),
3549                        (unsigned long long) from_cblock(cache->cache_size),
3550                        (unsigned) atomic_read(&cache->stats.read_hit),
3551                        (unsigned) atomic_read(&cache->stats.read_miss),
3552                        (unsigned) atomic_read(&cache->stats.write_hit),
3553                        (unsigned) atomic_read(&cache->stats.write_miss),
3554                        (unsigned) atomic_read(&cache->stats.demotion),
3555                        (unsigned) atomic_read(&cache->stats.promotion),
3556                        (unsigned long) atomic_read(&cache->nr_dirty));
3557
3558                 if (cache->features.metadata_version == 2)
3559                         DMEMIT("2 metadata2 ");
3560                 else
3561                         DMEMIT("1 ");
3562
3563                 if (writethrough_mode(&cache->features))
3564                         DMEMIT("writethrough ");
3565
3566                 else if (passthrough_mode(&cache->features))
3567                         DMEMIT("passthrough ");
3568
3569                 else if (writeback_mode(&cache->features))
3570                         DMEMIT("writeback ");
3571
3572                 else {
3573                         DMERR("%s: internal error: unknown io mode: %d",
3574                               cache_device_name(cache), (int) cache->features.io_mode);
3575                         goto err;
3576                 }
3577
3578                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3579
3580                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3581                 if (sz < maxlen) {
3582                         r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3583                         if (r)
3584                                 DMERR("%s: policy_emit_config_values returned %d",
3585                                       cache_device_name(cache), r);
3586                 }
3587
3588                 if (get_cache_mode(cache) == CM_READ_ONLY)
3589                         DMEMIT("ro ");
3590                 else
3591                         DMEMIT("rw ");
3592
3593                 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3594
3595                 if (r || needs_check)
3596                         DMEMIT("needs_check ");
3597                 else
3598                         DMEMIT("- ");
3599
3600                 break;
3601
3602         case STATUSTYPE_TABLE:
3603                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3604                 DMEMIT("%s ", buf);
3605                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3606                 DMEMIT("%s ", buf);
3607                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3608                 DMEMIT("%s", buf);
3609
3610                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3611                         DMEMIT(" %s", cache->ctr_args[i]);
3612                 if (cache->nr_ctr_args)
3613                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3614         }
3615
3616         return;
3617
3618 err:
3619         DMEMIT("Error");
3620 }
3621
3622 /*
3623  * A cache block range can take two forms:
3624  *
3625  * i) A single cblock, eg. '3456'
3626  * ii) A begin and end cblock with dots between, eg. 123-234
3627  */
3628 static int parse_cblock_range(struct cache *cache, const char *str,
3629                               struct cblock_range *result)
3630 {
3631         char dummy;
3632         uint64_t b, e;
3633         int r;
3634
3635         /*
3636          * Try and parse form (ii) first.
3637          */
3638         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3639         if (r < 0)
3640                 return r;
3641
3642         if (r == 2) {
3643                 result->begin = to_cblock(b);
3644                 result->end = to_cblock(e);
3645                 return 0;
3646         }
3647
3648         /*
3649          * That didn't work, try form (i).
3650          */
3651         r = sscanf(str, "%llu%c", &b, &dummy);
3652         if (r < 0)
3653                 return r;
3654
3655         if (r == 1) {
3656                 result->begin = to_cblock(b);
3657                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3658                 return 0;
3659         }
3660
3661         DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3662         return -EINVAL;
3663 }
3664
3665 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3666 {
3667         uint64_t b = from_cblock(range->begin);
3668         uint64_t e = from_cblock(range->end);
3669         uint64_t n = from_cblock(cache->cache_size);
3670
3671         if (b >= n) {
3672                 DMERR("%s: begin cblock out of range: %llu >= %llu",
3673                       cache_device_name(cache), b, n);
3674                 return -EINVAL;
3675         }
3676
3677         if (e > n) {
3678                 DMERR("%s: end cblock out of range: %llu > %llu",
3679                       cache_device_name(cache), e, n);
3680                 return -EINVAL;
3681         }
3682
3683         if (b >= e) {
3684                 DMERR("%s: invalid cblock range: %llu >= %llu",
3685                       cache_device_name(cache), b, e);
3686                 return -EINVAL;
3687         }
3688
3689         return 0;
3690 }
3691
3692 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3693 {
3694         struct invalidation_request req;
3695
3696         INIT_LIST_HEAD(&req.list);
3697         req.cblocks = range;
3698         atomic_set(&req.complete, 0);
3699         req.err = 0;
3700         init_waitqueue_head(&req.result_wait);
3701
3702         spin_lock(&cache->invalidation_lock);
3703         list_add(&req.list, &cache->invalidation_requests);
3704         spin_unlock(&cache->invalidation_lock);
3705         wake_worker(cache);
3706
3707         wait_event(req.result_wait, atomic_read(&req.complete));
3708         return req.err;
3709 }
3710
3711 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3712                                               const char **cblock_ranges)
3713 {
3714         int r = 0;
3715         unsigned i;
3716         struct cblock_range range;
3717
3718         if (!passthrough_mode(&cache->features)) {
3719                 DMERR("%s: cache has to be in passthrough mode for invalidation",
3720                       cache_device_name(cache));
3721                 return -EPERM;
3722         }
3723
3724         for (i = 0; i < count; i++) {
3725                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3726                 if (r)
3727                         break;
3728
3729                 r = validate_cblock_range(cache, &range);
3730                 if (r)
3731                         break;
3732
3733                 /*
3734                  * Pass begin and end origin blocks to the worker and wake it.
3735                  */
3736                 r = request_invalidation(cache, &range);
3737                 if (r)
3738                         break;
3739         }
3740
3741         return r;
3742 }
3743
3744 /*
3745  * Supports
3746  *      "<key> <value>"
3747  * and
3748  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
3749  *
3750  * The key migration_threshold is supported by the cache target core.
3751  */
3752 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3753 {
3754         struct cache *cache = ti->private;
3755
3756         if (!argc)
3757                 return -EINVAL;
3758
3759         if (get_cache_mode(cache) >= CM_READ_ONLY) {
3760                 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3761                       cache_device_name(cache));
3762                 return -EOPNOTSUPP;
3763         }
3764
3765         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3766                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3767
3768         if (argc != 2)
3769                 return -EINVAL;
3770
3771         return set_config_value(cache, argv[0], argv[1]);
3772 }
3773
3774 static int cache_iterate_devices(struct dm_target *ti,
3775                                  iterate_devices_callout_fn fn, void *data)
3776 {
3777         int r = 0;
3778         struct cache *cache = ti->private;
3779
3780         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3781         if (!r)
3782                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3783
3784         return r;
3785 }
3786
3787 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3788 {
3789         /*
3790          * FIXME: these limits may be incompatible with the cache device
3791          */
3792         limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3793                                             cache->origin_sectors);
3794         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3795 }
3796
3797 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3798 {
3799         struct cache *cache = ti->private;
3800         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3801
3802         /*
3803          * If the system-determined stacked limits are compatible with the
3804          * cache's blocksize (io_opt is a factor) do not override them.
3805          */
3806         if (io_opt_sectors < cache->sectors_per_block ||
3807             do_div(io_opt_sectors, cache->sectors_per_block)) {
3808                 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3809                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3810         }
3811         set_discard_limits(cache, limits);
3812 }
3813
3814 /*----------------------------------------------------------------*/
3815
3816 static struct target_type cache_target = {
3817         .name = "cache",
3818         .version = {1, 10, 0},
3819         .module = THIS_MODULE,
3820         .ctr = cache_ctr,
3821         .dtr = cache_dtr,
3822         .map = cache_map,
3823         .end_io = cache_end_io,
3824         .postsuspend = cache_postsuspend,
3825         .preresume = cache_preresume,
3826         .resume = cache_resume,
3827         .status = cache_status,
3828         .message = cache_message,
3829         .iterate_devices = cache_iterate_devices,
3830         .io_hints = cache_io_hints,
3831 };
3832
3833 static int __init dm_cache_init(void)
3834 {
3835         int r;
3836
3837         r = dm_register_target(&cache_target);
3838         if (r) {
3839                 DMERR("cache target registration failed: %d", r);
3840                 return r;
3841         }
3842
3843         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3844         if (!migration_cache) {
3845                 dm_unregister_target(&cache_target);
3846                 return -ENOMEM;
3847         }
3848
3849         return 0;
3850 }
3851
3852 static void __exit dm_cache_exit(void)
3853 {
3854         dm_unregister_target(&cache_target);
3855         kmem_cache_destroy(migration_cache);
3856 }
3857
3858 module_init(dm_cache_init);
3859 module_exit(dm_cache_exit);
3860
3861 MODULE_DESCRIPTION(DM_NAME " cache target");
3862 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3863 MODULE_LICENSE("GPL");