* https://github.com/osandov/drgn. The output looks like the following.
*
* sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
- * active weight hweight% inflt% del_ms usages%
- * test/a * 50/ 50 33.33/ 33.33 27.65 0*041 033:033:033
- * test/b * 100/ 100 66.67/ 66.67 17.56 0*000 066:079:077
+ * active weight hweight% inflt% dbt delay usages%
+ * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
+ * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
*
* - per : Timer period
* - cur_per : Internal wall and device vtime clock
*/
atomic64_t vtime;
atomic64_t done_vtime;
+ atomic64_t abs_vdebt;
u64 last_vtime;
/*
/*
* Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
- * weight, the more expensive each IO.
+ * weight, the more expensive each IO. Must round up.
+ *
+ * Returns @abs_cost * HWEIGHT_WHOLE / @hw_inuse, rounded up to the next
+ * integer.
+ *
+ * NOTE(review): @hw_inuse is used as the divisor without a zero check and
+ * the multiplication is done in u64 without an overflow check; presumably
+ * callers guarantee a non-zero weight and a bounded @abs_cost - confirm.
*/
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
}
+/*
+ * The inverse of abs_cost_to_cost(). Must round up.
+ *
+ * Returns @cost * @hw_inuse / HWEIGHT_WHOLE, rounded up to the next integer.
+ * Because both directions round up, converting a cost back may yield a
+ * value larger than the absolute amount it was derived from; the debt
+ * pay-off path clamps the result with min() against the outstanding
+ * abs_vdebt for that reason.
+ *
+ * NOTE(review): the multiplication is done in u64 without an overflow
+ * check; presumably @cost is bounded by the caller - confirm.
+ */
+static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
+{
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+}
+
static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
{
bio->bi_iocost_cost = cost;
struct iocg_wake_ctx ctx = { .iocg = iocg };
u64 margin_ns = (u64)(ioc->period_us *
WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
- u64 vshortage, expires, oexpires;
+ u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
+ s64 vbudget;
+ u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock);
+ current_hweight(iocg, NULL, &hw_inuse);
+ vbudget = now->vnow - atomic64_read(&iocg->vtime);
+
+ /* pay off debt */
+ abs_vdebt = atomic64_read(&iocg->abs_vdebt);
+ vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
+ if (vdebt && vbudget > 0) {
+ u64 delta = min_t(u64, vbudget, vdebt);
+ u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
+ abs_vdebt);
+
+ atomic64_add(delta, &iocg->vtime);
+ atomic64_add(delta, &iocg->done_vtime);
+ atomic64_sub(abs_delta, &iocg->abs_vdebt);
+ if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
+ atomic64_set(&iocg->abs_vdebt, 0);
+ }
+
/*
* Wake up the ones which are due and see how much vtime we'll need
* for the next one.
*/
- current_hweight(iocg, NULL, &ctx.hw_inuse);
- ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime);
+ ctx.hw_inuse = hw_inuse;
+ ctx.vbudget = vbudget - vdebt;
__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
if (!waitqueue_active(&iocg->waitq))
return;
u64 vmargin = ioc->margin_us * now->vrate;
u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
u64 expires, oexpires;
+ u32 hw_inuse;
+
+ /* debt-adjust vtime */
+ current_hweight(iocg, NULL, &hw_inuse);
+ vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
/* clear or maintain depending on the overage */
if (time_before_eq64(vtime, now->vnow)) {
* should have woken up in the last period and expire idle iocgs.
*/
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
- if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg))
+ if (!waitqueue_active(&iocg->waitq) &&
+ !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
continue;
spin_lock(&iocg->waitq.lock);
- if (waitqueue_active(&iocg->waitq)) {
+ if (waitqueue_active(&iocg->waitq) ||
+ atomic64_read(&iocg->abs_vdebt)) {
/* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now);
iocg_kick_delay(iocg, &now, 0);
* in a while which is fine.
*/
if (!waitqueue_active(&iocg->waitq) &&
+ !atomic64_read(&iocg->abs_vdebt) &&
time_before_eq64(vtime + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost);
return;
}
+ /*
+ * We're over budget. If @bio has to be issued regardless,
+ * remember the abs_cost instead of advancing vtime.
+ * iocg_kick_waitq() will pay off the debt before waking more IOs.
+ * This way, the debt is continuously paid off each period with the
+ * actual budget available to the cgroup. If we just wound vtime,
+ * we would incorrectly use the current hw_inuse for the entire
+ * amount which, for example, can lead to the cgroup staying
+ * blocked for a long time even with substantially raised hw_inuse.
+ */
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
- iocg_commit_bio(iocg, bio, cost);
+ atomic64_add(abs_cost, &iocg->abs_vdebt);
iocg_kick_delay(iocg, &now, cost);
return;
}
struct bio *bio)
{
struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
+ struct ioc *ioc = iocg->ioc;
sector_t bio_end = bio_end_sector(bio);
+ struct ioc_now now;
u32 hw_inuse;
u64 abs_cost, cost;
- /* add iff the existing request has cost assigned */
- if (!rq->bio || !rq->bio->bi_iocost_cost)
+ /* bypass if disabled or for root cgroup */
+ if (!ioc->enabled || !iocg->level)
return;
abs_cost = calc_vtime_cost(bio, iocg, true);
if (!abs_cost)
return;
+ ioc_now(ioc, &now);
+ current_hweight(iocg, NULL, &hw_inuse);
+ cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
/* update cursor if backmerging into the request at the cursor */
if (blk_rq_pos(rq) < bio_end &&
blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
iocg->cursor = bio_end;
- current_hweight(iocg, NULL, &hw_inuse);
- cost = div64_u64(abs_cost * HWEIGHT_WHOLE, hw_inuse);
- bio->bi_iocost_cost = cost;
-
- atomic64_add(cost, &iocg->vtime);
+ /*
+ * Charge if there's enough vtime budget and the existing request
+ * has cost assigned. Otherwise, account it as debt. See debt
+ * handling in ioc_rqos_throttle() for details.
+ */
+ if (rq->bio && rq->bio->bi_iocost_cost &&
+ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
+ iocg_commit_bio(iocg, bio, cost);
+ else
+ atomic64_add(abs_cost, &iocg->abs_vdebt);
}
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
iocg->ioc = ioc;
atomic64_set(&iocg->vtime, now.vnow);
atomic64_set(&iocg->done_vtime, now.vnow);
+ atomic64_set(&iocg->abs_vdebt, 0);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list);
iocg->hweight_active = HWEIGHT_WHOLE;
struct ioc *ioc = iocg->ioc;
if (ioc) {
- hrtimer_cancel(&iocg->waitq_timer);
- hrtimer_cancel(&iocg->delay_timer);
-
spin_lock(&ioc->lock);
if (!list_empty(&iocg->active_list)) {
propagate_active_weight(iocg, 0, 0);
list_del_init(&iocg->active_list);
}
spin_unlock(&ioc->lock);
+
+ hrtimer_cancel(&iocg->waitq_timer);
+ hrtimer_cancel(&iocg->delay_timer);
}
kfree(iocg);
}