dm cache policy smq: allow demotions to happen even during continuous IO
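The behavioural core of the change is in queue_promotion(): when the cache allocator is empty it now treats the cache as idle for the free-target check, so demotions keep being queued even while IO never goes idle ("We always claim to be 'idle' to ensure some demotions happen with continuous loads"). The standalone sketch below is illustrative only -- toy types and counters, not the kernel code -- and walks through that decision using the patch's 25% FREE_TARGET; the real queue_demotion() also falls back to writeback when no clean entry is available, which the toy omits.

/*
 * Simplified sketch, not the kernel code: shows why queue_promotion() now
 * passes idle=true to free_target_met() when the allocator is empty, so
 * demotions are queued even under continuous load.  All types and counters
 * here are stand-ins for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define FREE_TARGET 25u                 /* percent, as in the patch */

struct toy_policy {
        unsigned cache_size;            /* total cache blocks */
        unsigned nr_allocated;          /* blocks currently allocated */
        unsigned demotions_queued;      /* stands in for btracker_nr_demotions_queued() */
};

static unsigned percent_to_target(struct toy_policy *mq, unsigned p)
{
        return mq->cache_size * p / 100u;
}

static bool free_target_met(struct toy_policy *mq, bool idle)
{
        unsigned nr_free = mq->cache_size - mq->nr_allocated;

        if (idle)
                return (nr_free + mq->demotions_queued) >=
                       percent_to_target(mq, FREE_TARGET);
        return true;    /* under load the free target counts as met */
}

static void queue_promotion(struct toy_policy *mq)
{
        if (mq->nr_allocated == mq->cache_size) {
                /*
                 * Claim to be 'idle' so the free target is actually
                 * checked, which lets demotions be queued while IO is
                 * continuous.  The promotion itself is skipped this time;
                 * the demotion makes room for a later attempt.
                 */
                if (!free_target_met(mq, true)) {
                        mq->demotions_queued++;
                        printf("queued a demotion (%u queued)\n",
                               mq->demotions_queued);
                }
                return;
        }
        printf("cache has room; promotion can proceed\n");
}

int main(void)
{
        struct toy_policy mq = { .cache_size = 100, .nr_allocated = 100 };
        unsigned i;

        /*
         * With a full cache, repeated promotion attempts queue demotions
         * until the 25% free target would be met (25 demotions here).
         */
        for (i = 0; i < 30; i++)
                queue_promotion(&mq);
        return 0;
}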
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f19c6930a67c512894e4cd14bf66e2fa1d3e3c33..d13d9edf8dfe4dc55d88065df4417c8f9ba94942 100644
@@ -4,8 +4,9 @@
  * This file is released under the GPL.
  */
 
-#include "dm-cache-policy.h"
+#include "dm-cache-background-tracker.h"
 #include "dm-cache-policy-internal.h"
+#include "dm-cache-policy.h"
 #include "dm.h"
 
 #include <linux/hash.h>
@@ -38,10 +39,11 @@ struct entry {
        unsigned hash_next:28;
        unsigned prev:28;
        unsigned next:28;
-       unsigned level:7;
+       unsigned level:6;
        bool dirty:1;
        bool allocated:1;
        bool sentinel:1;
+       bool pending_work:1;
 
        dm_oblock_t oblock;
 };
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q)
  */
 static void q_push(struct queue *q, struct entry *e)
 {
+       BUG_ON(e->pending_work);
+
        if (!e->sentinel)
                q->nr_elts++;
 
        l_add_tail(q->es, q->qs + e->level, e);
 }
 
+static void q_push_front(struct queue *q, struct entry *e)
+{
+       BUG_ON(e->pending_work);
+
+       if (!e->sentinel)
+               q->nr_elts++;
+
+       l_add_head(q->es, q->qs + e->level, e);
+}
+
 static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
 {
+       BUG_ON(e->pending_work);
+
        if (!e->sentinel)
                q->nr_elts++;
 
@@ -335,19 +351,6 @@ static struct entry *q_pop(struct queue *q)
        return e;
 }
 
-/*
- * Pops an entry from a level that is not past a sentinel.
- */
-static struct entry *q_pop_old(struct queue *q, unsigned max_level)
-{
-       struct entry *e = q_peek(q, max_level, false);
-
-       if (e)
-               q_del(q, e);
-
-       return e;
-}
-
 /*
  * This function assumes there is a non-sentinel entry to pop.  It's only
  * used by redistribute, so we know this is true.  It also doesn't adjust
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q)
                                break;
 
                        e->level = level + 1u;
-                       l_add_head(q->es, l_above, e);
+                       l_add_tail(q->es, l_above, e);
                }
        }
 }
 
-static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
+                     struct entry *s1, struct entry *s2)
 {
        struct entry *de;
-       unsigned new_level;
-
-       q_del(q, e);
+       unsigned sentinels_passed = 0;
+       unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
 
+       /* try and find an entry to swap with */
        if (extra_levels && (e->level < q->nr_levels - 1u)) {
-               new_level = min(q->nr_levels - 1u, e->level + extra_levels);
-               for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
-                       if (de->sentinel)
-                               continue;
+               for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
+                       sentinels_passed++;
 
+               if (de) {
                        q_del(q, de);
                        de->level = e->level;
+                       if (s1) {
+                               switch (sentinels_passed) {
+                               case 0:
+                                       q_push_before(q, s1, de);
+                                       break;
+
+                               case 1:
+                                       q_push_before(q, s2, de);
+                                       break;
 
-                       if (dest)
-                               q_push_before(q, dest, de);
-                       else
+                               default:
+                                       q_push(q, de);
+                               }
+                       } else
                                q_push(q, de);
-                       break;
                }
-
-               e->level = new_level;
        }
 
+       q_del(q, e);
+       e->level = new_level;
        q_push(q, e);
 }
 
-static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
-{
-       q_requeue_before(q, NULL, e, extra_levels);
-}
-
 /*----------------------------------------------------------------*/
 
 #define FP_SHIFT 8
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s)
 
 /*----------------------------------------------------------------*/
 
-struct hash_table {
+struct smq_hash_table {
        struct entry_space *es;
        unsigned long long hash_bits;
        unsigned *buckets;
@@ -560,7 +567,7 @@ struct hash_table {
  * All cache entries are stored in a chained hash table.  To save space we
  * use indexing again, and only store indexes to the next entry.
  */
-static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
 {
        unsigned i, nr_buckets;
 
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
        return 0;
 }
 
-static void h_exit(struct hash_table *ht)
+static void h_exit(struct smq_hash_table *ht)
 {
        vfree(ht->buckets);
 }
 
-static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
 {
        return to_entry(ht->es, ht->buckets[bucket]);
 }
 
-static struct entry *h_next(struct hash_table *ht, struct entry *e)
+static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
 {
        return to_entry(ht->es, e->hash_next);
 }
 
-static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
 {
        e->hash_next = ht->buckets[bucket];
        ht->buckets[bucket] = to_index(ht->es, e);
 }
 
-static void h_insert(struct hash_table *ht, struct entry *e)
+static void h_insert(struct smq_hash_table *ht, struct entry *e)
 {
        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
        __h_insert(ht, h, e);
 }
 
-static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
                                struct entry **prev)
 {
        struct entry *e;
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o
        return NULL;
 }
 
-static void __h_unlink(struct hash_table *ht, unsigned h,
+static void __h_unlink(struct smq_hash_table *ht, unsigned h,
                       struct entry *e, struct entry *prev)
 {
        if (prev)
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h,
 /*
  * Also moves each entry to the front of the bucket.
  */
-static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
 {
        struct entry *e, *prev;
        unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
        return e;
 }
 
-static void h_remove(struct hash_table *ht, struct entry *e)
+static void h_remove(struct smq_hash_table *ht, struct entry *e)
 {
        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
        struct entry *prev;
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e)
        e->next = INDEXER_NULL;
        e->prev = INDEXER_NULL;
        e->level = 0u;
+       e->dirty = true;        /* FIXME: audit */
        e->allocated = true;
+       e->sentinel = false;
+       e->pending_work = false;
 }
 
 static struct entry *alloc_entry(struct entry_alloc *ea)
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
 #define NR_HOTSPOT_LEVELS 64u
 #define NR_CACHE_LEVELS 64u
 
-#define WRITEBACK_PERIOD (10 * HZ)
-#define DEMOTE_PERIOD (60 * HZ)
+#define WRITEBACK_PERIOD (10ul * HZ)
+#define DEMOTE_PERIOD (60ul * HZ)
 
 #define HOTSPOT_UPDATE_PERIOD (HZ)
-#define CACHE_UPDATE_PERIOD (10u * HZ)
+#define CACHE_UPDATE_PERIOD (60ul * HZ)
 
 struct smq_policy {
        struct dm_cache_policy policy;
@@ -814,8 +824,8 @@ struct smq_policy {
         * The hash tables allows us to quickly find an entry by origin
         * block.
         */
-       struct hash_table table;
-       struct hash_table hotspot_table;
+       struct smq_hash_table table;
+       struct smq_hash_table hotspot_table;
 
        bool current_writeback_sentinels;
        unsigned long next_writeback_period;
@@ -828,6 +838,10 @@ struct smq_policy {
 
        unsigned long next_hotspot_period;
        unsigned long next_cache_period;
+
+       struct background_tracker *bg_work;
+
+       bool migrations_allowed;
 };
 
 /*----------------------------------------------------------------*/
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq)
 static void update_sentinels(struct smq_policy *mq)
 {
        if (time_after(jiffies, mq->next_writeback_period)) {
-               __update_writeback_sentinels(mq);
                mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
                mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+               __update_writeback_sentinels(mq);
        }
 
        if (time_after(jiffies, mq->next_demote_period)) {
-               __update_demote_sentinels(mq);
                mq->next_demote_period = jiffies + DEMOTE_PERIOD;
                mq->current_demote_sentinels = !mq->current_demote_sentinels;
+               __update_demote_sentinels(mq);
        }
 }
 
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq)
 
 /*----------------------------------------------------------------*/
 
-/*
- * These methods tie together the dirty queue, clean queue and hash table.
- */
-static void push_new(struct smq_policy *mq, struct entry *e)
+static void del_queue(struct smq_policy *mq, struct entry *e)
 {
-       struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
-       h_insert(&mq->table, e);
-       q_push(q, e);
+       q_del(e->dirty ? &mq->dirty : &mq->clean, e);
 }
 
-static void push(struct smq_policy *mq, struct entry *e)
+static void push_queue(struct smq_policy *mq, struct entry *e)
 {
-       struct entry *sentinel;
-
-       h_insert(&mq->table, e);
-
-       /*
-        * Punch this into the queue just in front of the sentinel, to
-        * ensure it's cleaned straight away.
-        */
-       if (e->dirty) {
-               sentinel = writeback_sentinel(mq, e->level);
-               q_push_before(&mq->dirty, sentinel, e);
-       } else {
-               sentinel = demote_sentinel(mq, e->level);
-               q_push_before(&mq->clean, sentinel, e);
-       }
+       if (e->dirty)
+               q_push(&mq->dirty, e);
+       else
+               q_push(&mq->clean, e);
 }
 
-/*
- * Removes an entry from cache.  Removes from the hash table.
- */
-static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+// !h, !q, a -> h, q, a
+static void push(struct smq_policy *mq, struct entry *e)
 {
-       q_del(q, e);
-       h_remove(&mq->table, e);
+       h_insert(&mq->table, e);
+       if (!e->pending_work)
+               push_queue(mq, e);
 }
 
-static void del(struct smq_policy *mq, struct entry *e)
+static void push_queue_front(struct smq_policy *mq, struct entry *e)
 {
-       __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+       if (e->dirty)
+               q_push_front(&mq->dirty, e);
+       else
+               q_push_front(&mq->clean, e);
 }
 
-static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+static void push_front(struct smq_policy *mq, struct entry *e)
 {
-       struct entry *e = q_pop_old(q, max_level);
-       if (e)
-               h_remove(&mq->table, e);
-       return e;
+       h_insert(&mq->table, e);
+       if (!e->pending_work)
+               push_queue_front(mq, e);
 }
 
 static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
 
 static void requeue(struct smq_policy *mq, struct entry *e)
 {
-       struct entry *sentinel;
+       /*
+        * Pending work has temporarily been taken out of the queues.
+        */
+       if (e->pending_work)
+               return;
 
        if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
-               if (e->dirty) {
-                       sentinel = writeback_sentinel(mq, e->level);
-                       q_requeue_before(&mq->dirty, sentinel, e, 1u);
-               } else {
-                       sentinel = demote_sentinel(mq, e->level);
-                       q_requeue_before(&mq->clean, sentinel, e, 1u);
+               if (!e->dirty) {
+                       q_requeue(&mq->clean, e, 1u, NULL, NULL);
+                       return;
                }
+
+               q_requeue(&mq->dirty, e, 1u,
+                         get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
+                         get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
        }
 }
 
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq)
        unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
                default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
 
+       threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
+
        /*
         * If the hotspot queue is performing badly then we have little
         * confidence that we know which blocks to promote.  So we cut down
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq)
        }
 
        mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
-       mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+       mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
 }
 
 /*
@@ -1095,34 +1101,146 @@ static void end_cache_period(struct smq_policy *mq)
        }
 }
 
-static int demote_cblock(struct smq_policy *mq,
-                        struct policy_locker *locker,
-                        dm_oblock_t *oblock)
+/*----------------------------------------------------------------*/
+
+/*
+ * Targets are given as a percentage.
+ */
+#define CLEAN_TARGET 25u
+#define FREE_TARGET 25u
+
+static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
+{
+       return from_cblock(mq->cache_size) * p / 100u;
+}
+
+static bool clean_target_met(struct smq_policy *mq, bool idle)
 {
-       struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
-       if (!demoted)
+       /*
+        * Cache entries may not be populated.  So we cannot rely on the
+        * size of the clean queue.
+        */
+       unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+
+       if (idle)
                /*
-                * We could get a block from mq->dirty, but that
-                * would add extra latency to the triggering bio as it
-                * waits for the writeback.  Better to not promote this
-                * time and hope there's a clean block next time this block
-                * is hit.
+                * We'd like to clean everything.
                 */
-               return -ENOSPC;
+               return q_size(&mq->dirty) == 0u;
+       else
+               return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
+                      percent_to_target(mq, CLEAN_TARGET);
+}
+
+static bool free_target_met(struct smq_policy *mq, bool idle)
+{
+       unsigned nr_free = from_cblock(mq->cache_size) -
+                          mq->cache_alloc.nr_allocated;
+
+       if (idle)
+               return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
+                      percent_to_target(mq, FREE_TARGET);
+       else
+               return true;
+}
+
+/*----------------------------------------------------------------*/
+
+static void mark_pending(struct smq_policy *mq, struct entry *e)
+{
+       BUG_ON(e->sentinel);
+       BUG_ON(!e->allocated);
+       BUG_ON(e->pending_work);
+       e->pending_work = true;
+}
+
+static void clear_pending(struct smq_policy *mq, struct entry *e)
+{
+       BUG_ON(!e->pending_work);
+       e->pending_work = false;
+}
+
+static void queue_writeback(struct smq_policy *mq)
+{
+       int r;
+       struct policy_work work;
+       struct entry *e;
+
+       e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
+       if (e) {
+               mark_pending(mq, e);
+               q_del(&mq->dirty, e);
+
+               work.op = POLICY_WRITEBACK;
+               work.oblock = e->oblock;
+               work.cblock = infer_cblock(mq, e);
+
+               r = btracker_queue(mq->bg_work, &work, NULL);
+               WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
+       }
+}
+
+static void queue_demotion(struct smq_policy *mq)
+{
+       struct policy_work work;
+       struct entry *e;
+
+       if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
+               return;
+
+       e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+       if (!e) {
+               if (!clean_target_met(mq, false))
+                       queue_writeback(mq);
+               return;
+       }
+
+       mark_pending(mq, e);
+       q_del(&mq->clean, e);
+
+       work.op = POLICY_DEMOTE;
+       work.oblock = e->oblock;
+       work.cblock = infer_cblock(mq, e);
+       btracker_queue(mq->bg_work, &work, NULL);
+}
+
+static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
+                           struct policy_work **workp)
+{
+       struct entry *e;
+       struct policy_work work;
+
+       if (!mq->migrations_allowed)
+               return;
 
-       if (locker->fn(locker, demoted->oblock))
+       if (allocator_empty(&mq->cache_alloc)) {
                /*
-                * We couldn't lock this block.
+                * We always claim to be 'idle' to ensure some demotions happen
+                * with continuous loads.
                 */
-               return -EBUSY;
+               if (!free_target_met(mq, true))
+                       queue_demotion(mq);
+               return;
+       }
 
-       del(mq, demoted);
-       *oblock = demoted->oblock;
-       free_entry(&mq->cache_alloc, demoted);
+       if (btracker_promotion_already_present(mq->bg_work, oblock))
+               return;
 
-       return 0;
+       /*
+        * We allocate the entry now to reserve the cblock.  If the
+        * background work is aborted we must remember to free it.
+        */
+       e = alloc_entry(&mq->cache_alloc);
+       BUG_ON(!e);
+       e->pending_work = true;
+       work.op = POLICY_PROMOTE;
+       work.oblock = oblock;
+       work.cblock = infer_cblock(mq, e);
+       btracker_queue(mq->bg_work, &work, workp);
 }
 
+/*----------------------------------------------------------------*/
+
 enum promote_result {
        PROMOTE_NOT,
        PROMOTE_TEMPORARY,
@@ -1137,49 +1255,18 @@ static enum promote_result maybe_promote(bool promote)
        return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
 }
 
-static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
-                                         bool fast_promote)
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
+                                         int data_dir, bool fast_promote)
 {
-       if (bio_data_dir(bio) == WRITE) {
+       if (data_dir == WRITE) {
                if (!allocator_empty(&mq->cache_alloc) && fast_promote)
                        return PROMOTE_TEMPORARY;
 
-               else
-                       return maybe_promote(hs_e->level >= mq->write_promote_level);
+               return maybe_promote(hs_e->level >= mq->write_promote_level);
        } else
                return maybe_promote(hs_e->level >= mq->read_promote_level);
 }
 
-static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
-                           struct policy_locker *locker,
-                           struct policy_result *result, enum promote_result pr)
-{
-       int r;
-       struct entry *e;
-
-       if (allocator_empty(&mq->cache_alloc)) {
-               result->op = POLICY_REPLACE;
-               r = demote_cblock(mq, locker, &result->old_oblock);
-               if (r) {
-                       result->op = POLICY_MISS;
-                       return;
-               }
-
-       } else
-               result->op = POLICY_NEW;
-
-       e = alloc_entry(&mq->cache_alloc);
-       BUG_ON(!e);
-       e->oblock = oblock;
-
-       if (pr == PROMOTE_TEMPORARY)
-               push(mq, e);
-       else
-               push_new(mq, e);
-
-       result->cblock = infer_cblock(mq, e);
-}
-
 static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
 {
        sector_t r = from_oblock(b);
@@ -1187,7 +1274,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
        return to_oblock(r);
 }
 
-static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
 {
        unsigned hi;
        dm_oblock_t hb = to_hblock(mq, b);
@@ -1199,7 +1286,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
                hi = get_index(&mq->hotspot_alloc, e);
                q_requeue(&mq->hotspot, e,
                          test_and_set_bit(hi, mq->hotspot_hit_bits) ?
-                         0u : mq->hotspot_level_jump);
+                         0u : mq->hotspot_level_jump,
+                         NULL, NULL);
 
        } else {
                stats_miss(&mq->hotspot_stats);
@@ -1225,47 +1313,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
        return e;
 }
 
-/*
- * Looks the oblock up in the hash table, then decides whether to put in
- * pre_cache, or cache etc.
- */
-static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
-              bool can_migrate, bool fast_promote,
-              struct policy_locker *locker, struct policy_result *result)
-{
-       struct entry *e, *hs_e;
-       enum promote_result pr;
-
-       hs_e = update_hotspot_queue(mq, oblock, bio);
-
-       e = h_lookup(&mq->table, oblock);
-       if (e) {
-               stats_level_accessed(&mq->cache_stats, e->level);
-
-               requeue(mq, e);
-               result->op = POLICY_HIT;
-               result->cblock = infer_cblock(mq, e);
-
-       } else {
-               stats_miss(&mq->cache_stats);
-
-               pr = should_promote(mq, hs_e, bio, fast_promote);
-               if (pr == PROMOTE_NOT)
-                       result->op = POLICY_MISS;
-
-               else {
-                       if (!can_migrate) {
-                               result->op = POLICY_MISS;
-                               return -EWOULDBLOCK;
-                       }
-
-                       insert_in_cache(mq, oblock, locker, result, pr);
-               }
-       }
-
-       return 0;
-}
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -1282,6 +1329,7 @@ static void smq_destroy(struct dm_cache_policy *p)
 {
        struct smq_policy *mq = to_smq_policy(p);
 
+       btracker_destroy(mq->bg_work);
        h_exit(&mq->hotspot_table);
        h_exit(&mq->table);
        free_bitset(mq->hotspot_hit_bits);
@@ -1290,234 +1338,247 @@ static void smq_destroy(struct dm_cache_policy *p)
        kfree(mq);
 }
 
-static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
-                  bool can_block, bool can_migrate, bool fast_promote,
-                  struct bio *bio, struct policy_locker *locker,
-                  struct policy_result *result)
-{
-       int r;
-       unsigned long flags;
-       struct smq_policy *mq = to_smq_policy(p);
-
-       result->op = POLICY_MISS;
-
-       spin_lock_irqsave(&mq->lock, flags);
-       r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
-       spin_unlock_irqrestore(&mq->lock, flags);
-
-       return r;
-}
+/*----------------------------------------------------------------*/
 
-static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
+                   int data_dir, bool fast_copy,
+                   struct policy_work **work, bool *background_work)
 {
-       int r;
-       unsigned long flags;
-       struct smq_policy *mq = to_smq_policy(p);
-       struct entry *e;
+       struct entry *e, *hs_e;
+       enum promote_result pr;
+
+       *background_work = false;
 
-       spin_lock_irqsave(&mq->lock, flags);
        e = h_lookup(&mq->table, oblock);
        if (e) {
+               stats_level_accessed(&mq->cache_stats, e->level);
+
+               requeue(mq, e);
                *cblock = infer_cblock(mq, e);
-               r = 0;
-       } else
-               r = -ENOENT;
-       spin_unlock_irqrestore(&mq->lock, flags);
+               return 0;
 
-       return r;
-}
+       } else {
+               stats_miss(&mq->cache_stats);
 
-static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
-{
-       struct entry *e;
+               /*
+                * The hotspot queue only gets updated with misses.
+                */
+               hs_e = update_hotspot_queue(mq, oblock);
 
-       e = h_lookup(&mq->table, oblock);
-       BUG_ON(!e);
+               pr = should_promote(mq, hs_e, data_dir, fast_copy);
+               if (pr != PROMOTE_NOT) {
+                       queue_promotion(mq, oblock, work);
+                       *background_work = true;
+               }
 
-       del(mq, e);
-       e->dirty = set;
-       push(mq, e);
+               return -ENOENT;
+       }
 }
 
-static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+                     int data_dir, bool fast_copy,
+                     bool *background_work)
 {
+       int r;
        unsigned long flags;
        struct smq_policy *mq = to_smq_policy(p);
 
        spin_lock_irqsave(&mq->lock, flags);
-       __smq_set_clear_dirty(mq, oblock, true);
+       r = __lookup(mq, oblock, cblock,
+                    data_dir, fast_copy,
+                    NULL, background_work);
        spin_unlock_irqrestore(&mq->lock, flags);
+
+       return r;
 }
 
-static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static int smq_lookup_with_work(struct dm_cache_policy *p,
+                               dm_oblock_t oblock, dm_cblock_t *cblock,
+                               int data_dir, bool fast_copy,
+                               struct policy_work **work)
 {
-       struct smq_policy *mq = to_smq_policy(p);
+       int r;
+       bool background_queued;
        unsigned long flags;
+       struct smq_policy *mq = to_smq_policy(p);
 
        spin_lock_irqsave(&mq->lock, flags);
-       __smq_set_clear_dirty(mq, oblock, false);
+       r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
        spin_unlock_irqrestore(&mq->lock, flags);
-}
 
-static unsigned random_level(dm_cblock_t cblock)
-{
-       return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+       return r;
 }
 
-static int smq_load_mapping(struct dm_cache_policy *p,
-                           dm_oblock_t oblock, dm_cblock_t cblock,
-                           uint32_t hint, bool hint_valid)
+static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
+                                  struct policy_work **result)
 {
+       int r;
+       unsigned long flags;
        struct smq_policy *mq = to_smq_policy(p);
-       struct entry *e;
 
-       e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
-       e->oblock = oblock;
-       e->dirty = false;       /* this gets corrected in a minute */
-       e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
-       push(mq, e);
-
-       return 0;
-}
+       spin_lock_irqsave(&mq->lock, flags);
+       r = btracker_issue(mq->bg_work, result);
+       if (r == -ENODATA) {
+               /* find some writeback work to do */
+               if (mq->migrations_allowed && !free_target_met(mq, idle))
+                       queue_demotion(mq);
 
-static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
-       struct smq_policy *mq = to_smq_policy(p);
-       struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+               else if (!clean_target_met(mq, idle))
+                       queue_writeback(mq);
 
-       if (!e->allocated)
-               return 0;
+               r = btracker_issue(mq->bg_work, result);
+       }
+       spin_unlock_irqrestore(&mq->lock, flags);
 
-       return e->level;
+       return r;
 }
 
-static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
-{
-       struct entry *e;
+/*
+ * We need to clear any pending work flags that have been set, and in the
+ * case of promotion free the entry for the destination cblock.
+ */
+static void __complete_background_work(struct smq_policy *mq,
+                                      struct policy_work *work,
+                                      bool success)
+{
+       struct entry *e = get_entry(&mq->cache_alloc,
+                                   from_cblock(work->cblock));
+
+       switch (work->op) {
+       case POLICY_PROMOTE:
+               // !h, !q, a
+               clear_pending(mq, e);
+               if (success) {
+                       e->oblock = work->oblock;
+                       push(mq, e);
+                       // h, q, a
+               } else {
+                       free_entry(&mq->cache_alloc, e);
+                       // !h, !q, !a
+               }
+               break;
 
-       e = h_lookup(&mq->table, oblock);
-       BUG_ON(!e);
+       case POLICY_DEMOTE:
+               // h, !q, a
+               if (success) {
+                       h_remove(&mq->table, e);
+                       free_entry(&mq->cache_alloc, e);
+                       // !h, !q, !a
+               } else {
+                       clear_pending(mq, e);
+                       push_queue(mq, e);
+                       // h, q, a
+               }
+               break;
 
-       del(mq, e);
-       free_entry(&mq->cache_alloc, e);
+       case POLICY_WRITEBACK:
+               // h, !q, a
+               clear_pending(mq, e);
+               push_queue(mq, e);
+               // h, q, a
+               break;
+       }
+
+       btracker_complete(mq->bg_work, work);
 }
 
-static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+static void smq_complete_background_work(struct dm_cache_policy *p,
+                                        struct policy_work *work,
+                                        bool success)
 {
-       struct smq_policy *mq = to_smq_policy(p);
        unsigned long flags;
+       struct smq_policy *mq = to_smq_policy(p);
 
        spin_lock_irqsave(&mq->lock, flags);
-       __remove_mapping(mq, oblock);
+       __complete_background_work(mq, work, success);
        spin_unlock_irqrestore(&mq->lock, flags);
 }
 
-static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+// in_hash(oblock) -> in_hash(oblock)
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
 {
        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
 
-       if (!e || !e->allocated)
-               return -ENODATA;
-
-       del(mq, e);
-       free_entry(&mq->cache_alloc, e);
-
-       return 0;
+       if (e->pending_work)
+               e->dirty = set;
+       else {
+               del_queue(mq, e);
+               e->dirty = set;
+               push_queue(mq, e);
+       }
 }
 
-static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
-       int r;
        unsigned long flags;
        struct smq_policy *mq = to_smq_policy(p);
 
        spin_lock_irqsave(&mq->lock, flags);
-       r = __remove_cblock(mq, cblock);
+       __smq_set_clear_dirty(mq, cblock, true);
        spin_unlock_irqrestore(&mq->lock, flags);
-
-       return r;
 }
 
-
-#define CLEAN_TARGET_CRITICAL 5u /* percent */
-
-static bool clean_target_met(struct smq_policy *mq, bool critical)
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
-       if (critical) {
-               /*
-                * Cache entries may not be populated.  So we're cannot rely on the
-                * size of the clean queue.
-                */
-               unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
-               unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+       struct smq_policy *mq = to_smq_policy(p);
+       unsigned long flags;
 
-               return nr_clean >= target;
-       } else
-               return !q_size(&mq->dirty);
+       spin_lock_irqsave(&mq->lock, flags);
+       __smq_set_clear_dirty(mq, cblock, false);
+       spin_unlock_irqrestore(&mq->lock, flags);
 }
 
-static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
-                               dm_cblock_t *cblock, bool critical_only)
+static unsigned random_level(dm_cblock_t cblock)
 {
-       struct entry *e = NULL;
-       bool target_met = clean_target_met(mq, critical_only);
-
-       if (critical_only)
-               /*
-                * Always try and keep the bottom level clean.
-                */
-               e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+       return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+}
 
-       else
-               e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+static int smq_load_mapping(struct dm_cache_policy *p,
+                           dm_oblock_t oblock, dm_cblock_t cblock,
+                           bool dirty, uint32_t hint, bool hint_valid)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+       struct entry *e;
 
-       if (!e)
-               return -ENODATA;
+       e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+       e->oblock = oblock;
+       e->dirty = dirty;
+       e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
+       e->pending_work = false;
 
-       *oblock = e->oblock;
-       *cblock = infer_cblock(mq, e);
-       e->dirty = false;
-       push_new(mq, e);
+       /*
+        * When we load mappings we push ahead of both sentinels in order to
+        * allow demotions and cleaning to occur immediately.
+        */
+       push_front(mq, e);
 
        return 0;
 }
 
-static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-                             dm_cblock_t *cblock, bool critical_only)
+static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
-       int r;
-       unsigned long flags;
        struct smq_policy *mq = to_smq_policy(p);
+       struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
 
-       spin_lock_irqsave(&mq->lock, flags);
-       r = __smq_writeback_work(mq, oblock, cblock, critical_only);
-       spin_unlock_irqrestore(&mq->lock, flags);
-
-       return r;
-}
-
-static void __force_mapping(struct smq_policy *mq,
-                           dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
-       struct entry *e = h_lookup(&mq->table, current_oblock);
+       if (!e->allocated)
+               return -ENODATA;
 
-       if (e) {
-               del(mq, e);
-               e->oblock = new_oblock;
-               e->dirty = true;
-               push(mq, e);
-       }
+       // FIXME: what if this block has pending background work?
+       del_queue(mq, e);
+       h_remove(&mq->table, e);
+       free_entry(&mq->cache_alloc, e);
+       return 0;
 }
 
-static void smq_force_mapping(struct dm_cache_policy *p,
-                             dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
-       unsigned long flags;
        struct smq_policy *mq = to_smq_policy(p);
+       struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
 
-       spin_lock_irqsave(&mq->lock, flags);
-       __force_mapping(mq, current_oblock, new_oblock);
-       spin_unlock_irqrestore(&mq->lock, flags);
+       if (!e->allocated)
+               return 0;
+
+       return e->level;
 }
 
 static dm_cblock_t smq_residency(struct dm_cache_policy *p)
@@ -1546,6 +1607,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
        spin_unlock_irqrestore(&mq->lock, flags);
 }
 
+static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+       mq->migrations_allowed = allow;
+}
+
 /*
  * smq has no config values, but the old mq policy did.  To avoid breaking
  * software we continue to accept these configurables for the mq policy,
@@ -1590,18 +1657,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
 {
        mq->policy.destroy = smq_destroy;
-       mq->policy.map = smq_map;
        mq->policy.lookup = smq_lookup;
+       mq->policy.lookup_with_work = smq_lookup_with_work;
+       mq->policy.get_background_work = smq_get_background_work;
+       mq->policy.complete_background_work = smq_complete_background_work;
        mq->policy.set_dirty = smq_set_dirty;
        mq->policy.clear_dirty = smq_clear_dirty;
        mq->policy.load_mapping = smq_load_mapping;
+       mq->policy.invalidate_mapping = smq_invalidate_mapping;
        mq->policy.get_hint = smq_get_hint;
-       mq->policy.remove_mapping = smq_remove_mapping;
-       mq->policy.remove_cblock = smq_remove_cblock;
-       mq->policy.writeback_work = smq_writeback_work;
-       mq->policy.force_mapping = smq_force_mapping;
        mq->policy.residency = smq_residency;
        mq->policy.tick = smq_tick;
+       mq->policy.allow_migrations = smq_allow_migrations;
 
        if (mimic_mq) {
                mq->policy.set_config_value = mq_set_config_value;
@@ -1633,7 +1700,8 @@ static void calc_hotspot_params(sector_t origin_size,
 static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
                                            sector_t origin_size,
                                            sector_t cache_block_size,
-                                           bool mimic_mq)
+                                           bool mimic_mq,
+                                           bool migrations_allowed)
 {
        unsigned i;
        unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1658,11 +1726,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
        }
 
        init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
-        for (i = 0; i < nr_sentinels_per_queue; i++)
+       for (i = 0; i < nr_sentinels_per_queue; i++)
                get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
 
        init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
-        for (i = 0; i < nr_sentinels_per_queue; i++)
+       for (i = 0; i < nr_sentinels_per_queue; i++)
                get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
 
        init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
@@ -1715,8 +1783,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
        mq->next_hotspot_period = jiffies;
        mq->next_cache_period = jiffies;
 
+       mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
+       if (!mq->bg_work)
+               goto bad_btracker;
+
+       mq->migrations_allowed = migrations_allowed;
+
        return &mq->policy;
 
+bad_btracker:
+       h_exit(&mq->hotspot_table);
 bad_alloc_hotspot_table:
        h_exit(&mq->table);
 bad_alloc_table:
@@ -1735,21 +1811,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
                                          sector_t origin_size,
                                          sector_t cache_block_size)
 {
-       return __smq_create(cache_size, origin_size, cache_block_size, false);
+       return __smq_create(cache_size, origin_size, cache_block_size, false, true);
 }
 
 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
                                         sector_t origin_size,
                                         sector_t cache_block_size)
 {
-       return __smq_create(cache_size, origin_size, cache_block_size, true);
+       return __smq_create(cache_size, origin_size, cache_block_size, true, true);
+}
+
+static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
+                                             sector_t origin_size,
+                                             sector_t cache_block_size)
+{
+       return __smq_create(cache_size, origin_size, cache_block_size, false, false);
 }
 
 /*----------------------------------------------------------------*/
 
 static struct dm_cache_policy_type smq_policy_type = {
        .name = "smq",
-       .version = {1, 5, 0},
+       .version = {2, 0, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = smq_create
@@ -1757,15 +1840,23 @@ static struct dm_cache_policy_type smq_policy_type = {
 
 static struct dm_cache_policy_type mq_policy_type = {
        .name = "mq",
-       .version = {1, 5, 0},
+       .version = {2, 0, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create,
 };
 
+static struct dm_cache_policy_type cleaner_policy_type = {
+       .name = "cleaner",
+       .version = {2, 0, 0},
+       .hint_size = 4,
+       .owner = THIS_MODULE,
+       .create = cleaner_create,
+};
+
 static struct dm_cache_policy_type default_policy_type = {
        .name = "default",
-       .version = {1, 5, 0},
+       .version = {2, 0, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = smq_create,
@@ -1785,23 +1876,36 @@ static int __init smq_init(void)
        r = dm_cache_policy_register(&mq_policy_type);
        if (r) {
                DMERR("register failed (as mq) %d", r);
-               dm_cache_policy_unregister(&smq_policy_type);
-               return -ENOMEM;
+               goto out_mq;
+       }
+
+       r = dm_cache_policy_register(&cleaner_policy_type);
+       if (r) {
+               DMERR("register failed (as cleaner) %d", r);
+               goto out_cleaner;
        }
 
        r = dm_cache_policy_register(&default_policy_type);
        if (r) {
                DMERR("register failed (as default) %d", r);
-               dm_cache_policy_unregister(&mq_policy_type);
-               dm_cache_policy_unregister(&smq_policy_type);
-               return -ENOMEM;
+               goto out_default;
        }
 
        return 0;
+
+out_default:
+       dm_cache_policy_unregister(&cleaner_policy_type);
+out_cleaner:
+       dm_cache_policy_unregister(&mq_policy_type);
+out_mq:
+       dm_cache_policy_unregister(&smq_policy_type);
+
+       return -ENOMEM;
 }
 
 static void __exit smq_exit(void)
 {
+       dm_cache_policy_unregister(&cleaner_policy_type);
        dm_cache_policy_unregister(&smq_policy_type);
        dm_cache_policy_unregister(&mq_policy_type);
        dm_cache_policy_unregister(&default_policy_type);
@@ -1816,3 +1920,4 @@ MODULE_DESCRIPTION("smq cache policy");
 
 MODULE_ALIAS("dm-cache-default");
 MODULE_ALIAS("dm-cache-mq");
+MODULE_ALIAS("dm-cache-cleaner");