blkio: Add io_merged stat
block/blk-cgroup.c
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include "blk-cgroup.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

void blkio_group_init(struct blkio_group *blkg)
{
	spin_lock_init(&blkg->stats_lock);
}
EXPORT_SYMBOL_GPL(blkio_group_init);
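/*
 * Illustrative usage (a sketch; my_group/mg are hypothetical names): an I/O
 * policy embeds a struct blkio_group in its own per-cgroup data and must
 * initialise it before publishing it to the cgroup:
 *
 *	struct my_group {
 *		struct blkio_group blkg;
 *		...
 *	};
 *
 *	blkio_group_init(&mg->blkg);
 *	blkiocg_add_blkio_group(blkcg, &mg->blkg, key, dev);
 */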

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}
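/*
 * Example (illustrative): accounting a single synchronous write adds to both
 * stat[BLKIO_STAT_WRITE] and stat[BLKIO_STAT_SYNC], so for every stat array
 * Read + Write always equals Sync + Async.
 */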

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
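/*
 * Sketch of a caller (illustrative, based on how a CFQ-like policy would use
 * this hook): when a request is handed to the driver, the policy accounts it
 * against the owning group, e.g.
 *
 *	blkiocg_update_dispatch_stats(&cfqg->blkg, blk_rq_bytes(rq),
 *					rq_data_dir(rq), rq_is_sync(rq));
 *
 * Note that bytes >> 9 above converts bytes to 512-byte sectors.
 */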

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
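/*
 * In the above, start_time is when the request entered the scheduler queue
 * and io_start_time is when it was dispatched to the device, so
 * io_wait_time accumulates (io_start_time - start_time) and io_service_time
 * accumulates (completion - io_start_time); both are sched_clock() deltas in
 * nanoseconds.
 */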

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
					bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
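/*
 * This is the hook added by this patch: the io_merged counter tracks how
 * many bios/requests were merged into an already-queued request instead of
 * being dispatched on their own. A policy is expected to call it from its
 * merge paths (for CFQ, roughly where a bio or request is merged into an
 * existing request).
 */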

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (!css)
		goto out;

	blkcg = container_of(css, struct blkio_cgroup, css);
	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
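/*
 * The key is opaque to this layer: the policy passes in whatever pointer it
 * uses to identify the queue a group belongs to (CFQ, for instance, keys its
 * groups by the per-queue cfq_data), and lookups simply compare pointers.
 */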

#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				       struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
					blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
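/*
 * Writing to the blkio.weight cgroup file lands here: the value must lie in
 * [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX] (100 and 1000 at the time of this
 * patch, see blk-cgroup.h) and is pushed immediately to every existing group
 * of the cgroup through each registered policy's
 * blkio_update_group_weight_fn callback.
 */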

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_group_stats *stats;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
		memset(stats, 0, sizeof(struct blkio_group_stats));
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
				int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format\n");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
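/*
 * For the array stats (io_serviced, io_service_bytes, io_merged, ...) the
 * emitted per-device rows look like (illustrative values):
 *
 *	8:16 Read 4096
 *	8:16 Write 12288
 *	8:16 Sync 8192
 *	8:16 Async 8192
 *	8:16 Total 16384
 *
 * i.e. "major:minor <sub-stat>", with the per-device "Total" row summing
 * Read + Write. Scalar stats (time, sectors, dequeue) emit just "major:minor".
 */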

#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
		struct cftype *cftype, struct cgroup_map_cb *cb)	\
{									\
	struct blkio_cgroup *blkcg;					\
	struct blkio_group *blkg;					\
	struct hlist_node *n;						\
	uint64_t cgroup_total = 0;					\
									\
	if (!cgroup_lock_live_group(cgroup))				\
		return -ENODEV;						\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	rcu_read_lock();						\
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
		if (blkg->dev) {					\
			spin_lock_irq(&blkg->stats_lock);		\
			cgroup_total += blkio_get_stat(blkg, cb,	\
						blkg->dev, type);	\
			spin_unlock_irq(&blkg->stats_lock);		\
		}							\
	}								\
	if (show_total)							\
		cb->fill(cb, "Total", cgroup_total);			\
	rcu_read_unlock();						\
	cgroup_unlock();						\
	return 0;							\
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP
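/*
 * Each expansion above generates one blkiocg_<name>_read() handler that walks
 * the cgroup's groups and emits per-device rows plus an optional cgroup-wide
 * "Total"; the SHOW_FUNCTION_PER_GROUP(io_merged, ...) line is the one added
 * by this patch and backs the new blkio.io_merged file declared below.
 */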

#ifdef CONFIG_DEBUG_BLK_CGROUP
void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
			unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#endif

struct cftype blkio_files[] = {
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;

	rcu_read_lock();
remove_entry:
	spin_lock_irqsave(&blkcg->lock, flags);

	if (hlist_empty(&blkcg->blkg_list)) {
		spin_unlock_irqrestore(&blkcg->lock, flags);
		goto done;
	}

	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				blkcg_node);
	key = rcu_dereference(blkg->key);
	__blkiocg_del_blkio_group(blkg);

	spin_unlock_irqrestore(&blkcg->lock, flags);

	/*
	 * This blkio_group is being unlinked as the associated cgroup is
	 * going away. Let all the IO controlling policies know about this
	 * event.
	 *
	 * Currently this is a static call to one IO controlling policy. Once
	 * we have more policies in place, we need some dynamic registration
	 * of callback functions.
	 */
	spin_lock(&blkio_list_lock);
	list_for_each_entry(blkiop, &blkio_list, list)
		blkiop->ops.blkio_unlink_group_fn(key, blkg);
	spin_unlock(&blkio_list_lock);
	goto remove_entry;
done:
	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg, *parent_blkcg;

	if (!cgroup->parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support hierarchy deeper than two level (0,1) */
	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
	if (css_depth(&parent_blkcg->css) > 0)
		return ERR_PTR(-EINVAL);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}
/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
				struct cgroup *cgroup, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
				struct cgroup *prev, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}
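/*
 * Setting ioc->cgroup_changed here is deliberately lazy: nothing is migrated
 * at attach time. The I/O scheduler is expected to notice the flag the next
 * time the task submits I/O and re-associate the task's queues with the new
 * group (CFQ, for example, checks it on its io-context lookup path).
 */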

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
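/*
 * Sketch of how a policy plugs in (illustrative; the "foo" names are
 * hypothetical, CFQ being the in-tree user at this point). The policy fills
 * in a struct blkio_policy_type with its callbacks and registers it from its
 * init path, unregistering again on exit:
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_unlink_group_fn = foo_unlink_blkio_group,
 *			.blkio_update_group_weight_fn = foo_update_group_weight,
 *		},
 *	};
 *
 *	blkio_policy_register(&blkio_policy_foo);
 *	...
 *	blkio_policy_unregister(&blkio_policy_foo);
 */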

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");