diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 542b02d170f8bb99408aa7537bde91c10f977099..8aaa7eec7b74a24018795aae91e29781bdbc89ac 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
  */
 #define MIN_WRITEBACK_PAGES    (4096UL >> (PAGE_SHIFT - 10))
 
-struct wb_completion {
-       atomic_t                cnt;
-};
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -60,19 +56,6 @@ struct wb_writeback_work {
        struct wb_completion *done;     /* set if the caller waits */
 };
 
-/*
- * If one wants to wait for one or more wb_writeback_works, each work's
- * ->done should be set to a wb_completion defined using the following
- * macro.  Once all work items are issued with wb_queue_work(), the caller
- * can wait for the completion of all using wb_wait_for_completion().  Work
- * items which are waited upon aren't freed automatically on completion.
- */
-#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)                             \
-       struct wb_completion cmpl = {                                   \
-               .cnt            = ATOMIC_INIT(1),                       \
-       }
-
-
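
The struct and its ONSTACK initializer are not deleted outright; they move
out of fs/fs-writeback.c so that writeback works can be waited on from other
files.  A minimal sketch of the generalized definitions, inferred from the
DEFINE_WB_COMPLETION(cmpl, bdi) uses and done->waitq dereferences later in
this diff (the destination header is an assumption):

	/* sketch: the completion now carries its own wait queue */
	struct wb_completion {
		atomic_t		cnt;
		struct wait_queue_head	*waitq;
	};

	#define DEFINE_WB_COMPLETION(cmpl, bdi)				\
		struct wb_completion cmpl = {				\
			.cnt	= ATOMIC_INIT(1),			\
			.waitq	= &(bdi)->wb_waitq,		\
		}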
 /*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
@@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb,
        if (work->auto_free)
                kfree(work);
        if (done && atomic_dec_and_test(&done->cnt))
-               wake_up_all(&wb->bdi->wb_waitq);
+               wake_up_all(done->waitq);
 }
 
 static void wb_queue_work(struct bdi_writeback *wb,
@@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb,
 
 /**
  * wb_wait_for_completion - wait for completion of bdi_writeback_works
- * @bdi: bdi work items were issued to
  * @done: target wb_completion
  *
 * Wait for one or more work items issued with their ->done field
- * set to @done, which should have been defined with
- * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
- * work items are completed.  Work items which are waited upon aren't freed
+ * set to @done, which should have been initialized with
+ * DEFINE_WB_COMPLETION().  This function returns after all such work items
+ * are completed.  Work items which are waited upon aren't freed
  * automatically on completion.
  */
-static void wb_wait_for_completion(struct backing_dev_info *bdi,
-                                  struct wb_completion *done)
+void wb_wait_for_completion(struct wb_completion *done)
 {
        atomic_dec(&done->cnt);         /* put down the initial count */
-       wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+       wait_event(*done->waitq, !atomic_read(&done->cnt));
 }
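
The waiter no longer passes the bdi because the completion itself now
records which wait queue to sleep on.  A minimal usage sketch matching the
call sites updated later in this diff (the count is presumably raised once
per queued work in wb_queue_work(), and is dropped in
finish_writeback_work() above):

	DEFINE_WB_COMPLETION(done, bdi);	/* cnt = 1, waitq = &bdi->wb_waitq */

	work->done = &done;
	wb_queue_work(wb, work);		/* bumps done.cnt */

	wb_wait_for_completion(&done);		/* puts the initial count, sleeps
						 * until all queued works finish */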
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-/* parameters for foreign inode detection, see wb_detach_inode() */
+/*
+ * Parameters for foreign inode detection, see wbc_detach_inode() for how
+ * they're used.
+ *
+ * These parameters are inherently heuristic as the detection target
+ * itself is fuzzy.  All we want to do is detach an inode from its current
+ * owner if other cgroups are writing to it too much.
+ *
+ * The current cgroup writeback is built on the assumption that multiple
+ * cgroups writing to the same inode concurrently is very rare and a mode
+ * of operation which isn't well supported.  As such, the goal is to avoid
+ * taking too long when a different cgroup takes over an inode while also
+ * avoiding overly aggressive flip-flops from occasional foreign writes.
+ *
+ * We record, very roughly, 2s worth of IO time history and if more than
+ * half of that is foreign, trigger the switch.  The recording is quantized
+ * to 16 slots.  To keep tiny writes from swinging the decision too much,
+ * rounds shorter than 1/8 of the average are ignored.
+ */
 #define WB_FRN_TIME_SHIFT      13      /* 1s = 2^13, up to 8 secs w/ 16bit */
 #define WB_FRN_TIME_AVG_SHIFT  3       /* avg = avg * 7/8 + new * 1/8 */
-#define WB_FRN_TIME_CUT_DIV    2       /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_CUT_DIV    8       /* ignore rounds < avg / 8 */
 #define WB_FRN_TIME_PERIOD     (2 * (1 << WB_FRN_TIME_SHIFT))  /* 2s */
 
 #define WB_FRN_HIST_SLOTS      16      /* inode->i_wb_frn_history is 16bit */
@@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
                                        /* if foreign slots >= 8, switch */
 #define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
+#define WB_FRN_MAX_IN_FLIGHT   1024    /* don't queue too many concurrently */
 
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
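
Concretely: 1s of IO time is 2^13 = 8192 time units, WB_FRN_TIME_PERIOD is
16384 units, and each of the 16 history slots therefore covers 1024 units,
i.e. ~125ms of IO time.  A simplified sketch of the per-round bookkeeping
this drives in wbc_detach_inode() (WB_FRN_HIST_UNIT and
WB_FRN_HIST_THR_SLOTS are the companion defines elided from the hunk above;
variable names are illustrative):

	/* running average of round length: avg = avg * 7/8 + round * 1/8 */
	avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
		    (avg_time >> WB_FRN_TIME_AVG_SHIFT);

	/* ignore rounds shorter than 1/8 of the average */
	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		/* shift in one bit per slot of IO time, foreign rounds as 1s */
		slots = min(max_time / WB_FRN_HIST_UNIT, WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (round_was_foreign)
			history |= (1U << slots) - 1;

		/* foreign slots >= 8: the current wb isn't the winner, switch */
		if (hweight16(history) >= WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, foreign_wb_id);
	}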
@@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
        if (unlikely(inode->i_state & I_FREEING))
                goto skip_switch;
 
+       trace_inode_switch_wbs(inode, old_wb, new_wb);
+
        /*
         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
         * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
@@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        if (inode->i_state & I_WB_SWITCH)
                return;
 
-       /*
-        * Avoid starting new switches while sync_inodes_sb() is in
-        * progress.  Otherwise, if the down_write protected issue path
-        * blocks heavily, we might end up starting a large number of
-        * switches which will block on the rwsem.
-        */
-       if (!down_read_trylock(&bdi->wb_switch_rwsem))
+       /* avoid queueing a new switch if too many are already in flight */
+       if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
                return;
 
        isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
        if (!isw)
-               goto out_unlock;
+               return;
 
        /* find and pin the new wb */
        rcu_read_lock();
@@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 
        atomic_inc(&isw_nr_in_flight);
-
-       goto out_unlock;
+       return;
 
 out_free:
        if (isw->new_wb)
                wb_put(isw->new_wb);
        kfree(isw);
-out_unlock:
-       up_read(&bdi->wb_switch_rwsem);
 }
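
Replacing the rwsem with a plain counter means a burst of foreign writes can
no longer pile up a large number of switch works behind a writer holding
wb_switch_rwsem; excess switch attempts are simply dropped.  The admission
check is racy by design, since a small overshoot past WB_FRN_MAX_IN_FLIGHT
is harmless:

	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
		return;				/* shed this switch; a later
						 * foreign round can retry */
	...
	atomic_inc(&isw_nr_in_flight);		/* paired with an atomic_dec(),
						 * presumably in the switch worker,
						 * outside this diff */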
 
 /**
@@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc)
                if (wbc->wb_id != max_id)
                        history |= (1U << slots) - 1;
 
+               if (history)
+                       trace_inode_foreign_history(inode, wbc, history);
+
                /*
                 * Switch if the current wb isn't the consistent winner.
                 * If there are multiple closely competing dirtiers, the
@@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
-               DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+               DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
@@ -890,7 +887,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                last_wb = wb;
 
                rcu_read_unlock();
-               wb_wait_for_completion(bdi, &fallback_work_done);
+               wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
@@ -899,6 +896,89 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
+ * @bdi_id: target bdi id
+ * @memcg_id: target memcg css id
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
+ * @reason: reason why the writeback work was initiated
+ * @done: target wb_completion
+ *
+ * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
+ * with the specified parameters.
+ */
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+                          enum wb_reason reason, struct wb_completion *done)
+{
+       struct backing_dev_info *bdi;
+       struct cgroup_subsys_state *memcg_css;
+       struct bdi_writeback *wb;
+       struct wb_writeback_work *work;
+       int ret;
+
+       /* lookup bdi and memcg */
+       bdi = bdi_get_by_id(bdi_id);
+       if (!bdi)
+               return -ENOENT;
+
+       rcu_read_lock();
+       memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
+       if (memcg_css && !css_tryget(memcg_css))
+               memcg_css = NULL;
+       rcu_read_unlock();
+       if (!memcg_css) {
+               ret = -ENOENT;
+               goto out_bdi_put;
+       }
+
+       /*
+        * And find the associated wb.  If the wb isn't there already,
+        * there's nothing to flush; don't create one.
+        */
+       wb = wb_get_lookup(bdi, memcg_css);
+       if (!wb) {
+               ret = -ENOENT;
+               goto out_css_put;
+       }
+
+       /*
+        * If @nr is zero, the caller is attempting to write out most of
+        * the currently dirty pages.  Let's take the current dirty page
+        * count and inflate it by 25%, which should be large enough to
+        * flush out most dirty pages while avoiding getting livelocked by
+        * concurrent dirtiers.
+        */
+       if (!nr) {
+               unsigned long filepages, headroom, dirty, writeback;
+
+               mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
+                                     &writeback);
+               nr = dirty * 10 / 8;
+       }
+
+       /* issue the writeback work */
+       work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+       if (work) {
+               work->nr_pages = nr;
+               work->sync_mode = WB_SYNC_NONE;
+               work->range_cyclic = 1;
+               work->reason = reason;
+               work->done = done;
+               work->auto_free = 1;
+               wb_queue_work(wb, work);
+               ret = 0;
+       } else {
+               ret = -ENOMEM;
+       }
+
+       wb_put(wb);
+out_css_put:
+       css_put(memcg_css);
+out_bdi_put:
+       bdi_put(bdi);
+       return ret;
+}
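
A hypothetical fire-and-forget caller (the two IDs and the reason constant
are placeholders; passing a NULL @done is safe since
finish_writeback_work() checks it before waking):

	int err;

	/* nr == 0 asks for ~125% of the wb's currently dirty pages */
	err = cgroup_writeback_by_id(bdi_id, memcg_id, 0,
				     WB_REASON_FOREIGN_FLUSH, NULL);
	if (err == -ENOENT)
		return;		/* bdi, memcg css or wb is already gone */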
+
 /**
  * cgroup_writeback_umount - flush inode wb switches for umount
  *
@@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb)
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
 {
-       DEFINE_WB_COMPLETION_ONSTACK(done);
+       struct backing_dev_info *bdi = sb->s_bdi;
+       DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb                     = sb,
                .sync_mode              = WB_SYNC_NONE,
@@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                .nr_pages               = nr,
                .reason                 = reason,
        };
-       struct backing_dev_info *bdi = sb->s_bdi;
 
        if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
        bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
-       wb_wait_for_completion(bdi, &done);
+       wb_wait_for_completion(&done);
 }
 
 /**
@@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-       DEFINE_WB_COMPLETION_ONSTACK(done);
+       struct backing_dev_info *bdi = sb->s_bdi;
+       DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
@@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb)
                .reason         = WB_REASON_SYNC,
                .for_sync       = 1,
        };
-       struct backing_dev_info *bdi = sb->s_bdi;
 
        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb)
        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
-       wb_wait_for_completion(bdi, &done);
+       wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);
 
        wait_sb_inodes(sb);