/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* A workqueue to queue throttle related work */
+static struct workqueue_struct *kthrotld_workqueue;
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+ unsigned long delay);
+
struct throtl_rb_root {
struct rb_root rb;
struct rb_node *left;
* tree of blkg (instead of traversing through hash list all
* the time.
*/
- tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+ /*
+ * This is the common case when there are no blkio cgroups.
+ * Avoid lookup in this case
+ */
+ if (blkcg == &blkio_root_cgroup)
+ tg = &td->root_tg;
+ else
+ tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
/* Fill in device details for root group */
if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
update_min_dispatch_time(st);
if (time_before_eq(st->min_disptime, jiffies))
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
else
- throtl_schedule_delayed_work(td->queue,
- (st->min_disptime - jiffies));
+ throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}
static inline void
tg->slice_end[rw], jiffies);
}
+/*
+ * Set the end of the current slice of @tg for direction @rw to @jiffy_end,
+ * rounded up with roundup() to a multiple of throtl_slice. @td is not used
+ * here.
+ */
+static inline void throtl_set_slice_end(struct throtl_data *td,
+ struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+ tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+}
+
static inline void throtl_extend_slice(struct throtl_data *td,
struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
if (throtl_slice_used(td, tg, rw))
return;
+ /*
+ * A bio has been dispatched, so also adjust slice_end. The cgroup
+ * limit may initially have been very low, resulting in a high
+ * slice_end; if the limit was later bumped up and the bio was
+ * dispatched sooner, slice_end must be reduced. A bogus high
+ * slice_end is bad because it does not allow a new slice to start.
+ */
+
+ throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+
time_elapsed = jiffies - tg->slice_start[rw];
nr_slices = time_elapsed / throtl_slice;
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+ u64 tmp;
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
- io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;
+ /*
+ * jiffy_elapsed_rnd should not be a big value: the minimum iops is 1,
+ * so at most the elapsed jiffies should be equivalent to 1 second,
+ * since we will allow dispatch after 1 second and the slice should
+ * have been trimmed after that.
+ */
+
+ tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+ do_div(tmp, HZ);
+
+ if (tmp > UINT_MAX)
+ io_allowed = UINT_MAX;
+ else
+ io_allowed = tmp;
if (tg->io_disp[rw] + 1 <= io_allowed) {
if (wait)
{
unsigned int nr_reads = 0, nr_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4;
- unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+ unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
struct bio *bio;
/* Try to dispatch 75% READS and 25% WRITES */
struct throtl_grp *tg;
struct hlist_node *pos, *n;
- /*
- * Make sure atomic_inc() effects from
- * throtl_update_blkio_group_read_bps(), group of functions are
- * visible.
- * Is this required or smp_mb__after_atomic_inc() was suffcient
- * after the atomic_inc().
- */
- smp_rmb();
if (!atomic_read(&td->limits_changed))
return;
throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
- hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
- /*
- * Do I need an smp_rmb() here to make sure tg->limits_changed
- * update is visible. I am relying on smp_rmb() at the
- * beginning of function and not putting a new one here.
- */
+ /*
+ * Make sure updates to tg->limits_changed from the
+ * throtl_update_blkio_group_read_bps() group of functions are
+ * visible. We do not want the update to td->limits_changed to be
+ * visible while the update to tg->limits_changed is not yet visible
+ * on this cpu. Hence the read barrier.
+ */
+ smp_rmb();
+ hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
if (throtl_tg_on_rr(tg) && tg->limits_changed) {
throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
" riops=%u wiops=%u", tg->bps[READ],
}
/* Call with queue lock held */
-void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+static void
+throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
- struct throtl_data *td = q->td;
struct delayed_work *dwork = &td->throtl_work;
if (total_nr_queued(td) > 0) {
* Cancel that and schedule a new one.
*/
__cancel_delayed_work(dwork);
- kblockd_schedule_delayed_work(q, dwork, delay);
+ queue_delayed_work(kthrotld_workqueue, dwork, delay);
throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
delay, jiffies);
}
}
-EXPORT_SYMBOL(throtl_schedule_delayed_work);
static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
smp_mb__after_atomic_inc();
/* Schedule a work now to process the limit change */
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
}
static void throtl_update_blkio_group_write_bps(void *key,
smp_mb__before_atomic_inc();
atomic_inc(&td->limits_changed);
smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
}
static void throtl_update_blkio_group_read_iops(void *key,
smp_mb__before_atomic_inc();
atomic_inc(&td->limits_changed);
smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
}
static void throtl_update_blkio_group_write_iops(void *key,
smp_mb__before_atomic_inc();
atomic_inc(&td->limits_changed);
smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
}
void throtl_shutdown_timer_wq(struct request_queue *q)
static int __init throtl_init(void)
{
+ /*
+ * Create the dedicated "kthrotld" workqueue that
+ * throtl_schedule_delayed_work() queues throttle work on. Failing to
+ * allocate it is treated as fatal (panic). Once it exists, register
+ * the throttling policy with blkio; this function always returns 0.
+ */
+ kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
+ if (!kthrotld_workqueue)
+ panic("Failed to create kthrotld\n");
+
blkio_policy_register(&blkio_policy_throtl);
return 0;
}