v3.5.6
  1/*
  2 * Common Block IO controller cgroup interface
  3 *
  4 * Based on ideas and code from CFQ, CFS and BFQ:
  5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  6 *
  7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  8 *		      Paolo Valente <paolo.valente@unimore.it>
  9 *
 10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 11 * 	              Nauman Rafique <nauman@google.com>
 12 */
 13#include <linux/ioprio.h>
 
 14#include <linux/kdev_t.h>
 15#include <linux/module.h>
 16#include <linux/err.h>
 17#include <linux/blkdev.h>
 18#include <linux/slab.h>
 19#include <linux/genhd.h>
 20#include <linux/delay.h>
 21#include <linux/atomic.h>
 22#include "blk-cgroup.h"
 23#include "blk.h"
 24
 25#define MAX_KEY_LEN 100
 26
 27static DEFINE_MUTEX(blkcg_pol_mutex);
 
 28
 29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
 30EXPORT_SYMBOL_GPL(blkcg_root);
 31
 32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 33
 34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
 
 35{
 36	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
 37			    struct blkcg, css);
 38}
 39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
 40
 41static struct blkcg *task_blkcg(struct task_struct *tsk)
 
 42{
 43	return container_of(task_subsys_state(tsk, blkio_subsys_id),
 44			    struct blkcg, css);
 45}
 46
 47struct blkcg *bio_blkcg(struct bio *bio)
 
 
 48{
 49	if (bio && bio->bi_css)
 50		return container_of(bio->bi_css, struct blkcg, css);
 51	return task_blkcg(current);
 
 52}
 53EXPORT_SYMBOL_GPL(bio_blkcg);
 54
 55static bool blkcg_policy_enabled(struct request_queue *q,
 56				 const struct blkcg_policy *pol)
 57{
 58	return pol && test_bit(pol->plid, q->blkcg_pols);
 59}
 60
 61/**
 62 * blkg_free - free a blkg
 63 * @blkg: blkg to free
 64 *
 65 * Free @blkg which may be partially allocated.
 66 */
 67static void blkg_free(struct blkcg_gq *blkg)
 68{
 69	int i;
 70
 71	if (!blkg)
 72		return;
 
 
 73
 74	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 75		struct blkcg_policy *pol = blkcg_policy[i];
 76		struct blkg_policy_data *pd = blkg->pd[i];
 77
 78		if (!pd)
 79			continue;
 80
 81		if (pol && pol->pd_exit_fn)
 82			pol->pd_exit_fn(blkg);
 83
 84		kfree(pd);
 85	}
 
 
 86
 87	kfree(blkg);
 88}
 89
 90/**
 91 * blkg_alloc - allocate a blkg
 92 * @blkcg: block cgroup the new blkg is associated with
 93 * @q: request_queue the new blkg is associated with
 94 *
 95 * Allocate a new blkg associating @blkcg and @q.
 96 */
 97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
 98{
 99	struct blkcg_gq *blkg;
100	int i;
101
102	/* alloc and init base part */
103	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
104	if (!blkg)
105		return NULL;
106
107	blkg->q = q;
108	INIT_LIST_HEAD(&blkg->q_node);
109	blkg->blkcg = blkcg;
110	blkg->refcnt = 1;
111
112	for (i = 0; i < BLKCG_MAX_POLS; i++) {
113		struct blkcg_policy *pol = blkcg_policy[i];
114		struct blkg_policy_data *pd;
115
116		if (!blkcg_policy_enabled(q, pol))
 
117			continue;
118
119		/* alloc per-policy data and attach it to blkg */
120		pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
121		if (!pd) {
122			blkg_free(blkg);
123			return NULL;
124		}
125
126		blkg->pd[i] = pd;
127		pd->blkg = blkg;
128
129		/* invoke per-policy init */
130		if (blkcg_policy_enabled(blkg->q, pol))
131			pol->pd_init_fn(blkg);
132	}
 
133
134	return blkg;
135}
136
137static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
138				      struct request_queue *q)
139{
140	struct blkcg_gq *blkg;
141
142	blkg = rcu_dereference(blkcg->blkg_hint);
143	if (blkg && blkg->q == q)
144		return blkg;
 
145
146	/*
147	 * Hint didn't match.  Look up from the radix tree.  Note that we
148	 * may not be holding queue_lock and thus are not sure whether
149	 * @blkg from blkg_tree has already been removed or not, so we
150	 * can't update hint to the lookup result.  Leave it to the caller.
151	 */
152	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
153	if (blkg && blkg->q == q)
154		return blkg;
155
156	return NULL;
 
 
 
157}
158
159/**
160 * blkg_lookup - lookup blkg for the specified blkcg - q pair
161 * @blkcg: blkcg of interest
162 * @q: request_queue of interest
163 *
164 * Lookup blkg for the @blkcg - @q pair.  This function should be called
165 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
166 * - see blk_queue_bypass_start() for details.
167 */
168struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
169{
170	WARN_ON_ONCE(!rcu_read_lock_held());
 
 
 
171
172	if (unlikely(blk_queue_bypass(q)))
173		return NULL;
174	return __blkg_lookup(blkcg, q);
 
175}
176EXPORT_SYMBOL_GPL(blkg_lookup);
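/*
 * Example (illustrative sketch, not part of the original file): a caller
 * that only needs to know whether a blkg exists for a blkcg - q pair.
 * The helper name is hypothetical; blkg_lookup() must run under the RCU
 * read lock and returns NULL if no blkg exists or @q is bypassing.
 */
static bool example_blkg_exists(struct blkcg *blkcg, struct request_queue *q)
{
	bool found;

	rcu_read_lock();
	found = blkg_lookup(blkcg, q) != NULL;
	rcu_read_unlock();

	return found;
}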
177
178static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
179					     struct request_queue *q)
180	__releases(q->queue_lock) __acquires(q->queue_lock)
181{
182	struct blkcg_gq *blkg;
183	int ret;
184
185	WARN_ON_ONCE(!rcu_read_lock_held());
186	lockdep_assert_held(q->queue_lock);
187
188	/* lookup and update hint on success, see __blkg_lookup() for details */
189	blkg = __blkg_lookup(blkcg, q);
190	if (blkg) {
191		rcu_assign_pointer(blkcg->blkg_hint, blkg);
192		return blkg;
193	}
194
195	/* blkg holds a reference to blkcg */
196	if (!css_tryget(&blkcg->css))
197		return ERR_PTR(-EINVAL);
198
199	/* allocate */
200	ret = -ENOMEM;
201	blkg = blkg_alloc(blkcg, q);
202	if (unlikely(!blkg))
203		goto err_put;
204
205	/* insert */
206	ret = radix_tree_preload(GFP_ATOMIC);
207	if (ret)
208		goto err_free;
 
209
210	spin_lock(&blkcg->lock);
211	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
212	if (likely(!ret)) {
213		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
214		list_add(&blkg->q_node, &q->blkg_list);
 
 
215	}
216	spin_unlock(&blkcg->lock);
 
 
217
218	radix_tree_preload_end();
 
 
 
219
220	if (!ret)
221		return blkg;
222err_free:
223	blkg_free(blkg);
224err_put:
225	css_put(&blkcg->css);
226	return ERR_PTR(ret);
 
227}
 
228
229struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
230				    struct request_queue *q)
231{
232	/*
233	 * This could be the first entry point of blkcg implementation and
234	 * we shouldn't allow anything to go through for a bypassing queue.
 
235	 */
236	if (unlikely(blk_queue_bypass(q)))
237		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
238	return __blkg_lookup_create(blkcg, q);
239}
240EXPORT_SYMBOL_GPL(blkg_lookup_create);
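/*
 * Example (illustrative sketch, not part of the original file): looking
 * up or creating the blkg of the current task on the request path, as a
 * policy might do.  The helper name is hypothetical; the caller must
 * hold @q->queue_lock, and blkg_lookup_create() can return an ERR_PTR
 * (e.g. -EBUSY while @q is bypassing).
 */
static struct blkcg_gq *example_get_current_blkg(struct request_queue *q)
{
	struct blkcg_gq *blkg;

	lockdep_assert_held(q->queue_lock);

	rcu_read_lock();
	/* bio_blkcg(NULL) resolves to the current task's blkcg, see above */
	blkg = blkg_lookup_create(bio_blkcg(NULL), q);
	rcu_read_unlock();

	return IS_ERR(blkg) ? NULL : blkg;
}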
241
242static void blkg_destroy(struct blkcg_gq *blkg)
 
243{
244	struct blkcg *blkcg = blkg->blkcg;
245
246	lockdep_assert_held(blkg->q->queue_lock);
247	lockdep_assert_held(&blkcg->lock);
 
 
 
248
249	/* Something wrong if we are trying to remove same group twice */
250	WARN_ON_ONCE(list_empty(&blkg->q_node));
251	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
252
253	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
254	list_del_init(&blkg->q_node);
255	hlist_del_init_rcu(&blkg->blkcg_node);
256
257	/*
258	 * Both setting lookup hint to and clearing it from @blkg are done
259	 * under queue_lock.  If it's not pointing to @blkg now, it never
260	 * will.  Hint assignment itself can race safely.
261	 */
262	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
263		rcu_assign_pointer(blkcg->blkg_hint, NULL);
264
265	/*
266	 * Put the reference taken at the time of creation so that when all
267	 * queues are gone, group can be destroyed.
 
268	 */
269	blkg_put(blkg);
270}
 
271
272/**
273 * blkg_destroy_all - destroy all blkgs associated with a request_queue
274 * @q: request_queue of interest
275 *
276 * Destroy all blkgs associated with @q.
277 */
278static void blkg_destroy_all(struct request_queue *q)
279{
280	struct blkcg_gq *blkg, *n;
281
282	lockdep_assert_held(q->queue_lock);
283
284	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
285		struct blkcg *blkcg = blkg->blkcg;
 
 
 
286
287		spin_lock(&blkcg->lock);
288		blkg_destroy(blkg);
289		spin_unlock(&blkcg->lock);
290	}
 
 
 
291}
 
292
293static void blkg_rcu_free(struct rcu_head *rcu_head)
 
294{
295	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
296}
 
297
298void __blkg_release(struct blkcg_gq *blkg)
299{
300	/* release the extra blkcg reference this blkg has been holding */
301	css_put(&blkg->blkcg->css);
302
303	/*
 304	 * A group is freed in an RCU manner.  But holding an RCU lock does
 305	 * not mean that one can access all the fields of blkg and assume
 306	 * they are valid.  For example, don't try to follow throtl_data and
 307	 * request queue links.
 308	 *
 309	 * Having a reference to blkg under an RCU read lock allows access
 310	 * only to values local to groups, like group stats and rate limits.
311	 */
312	call_rcu(&blkg->rcu_head, blkg_rcu_free);
313}
314EXPORT_SYMBOL_GPL(__blkg_release);
315
316static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
317			     u64 val)
318{
319	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
320	struct blkcg_gq *blkg;
 
321	struct hlist_node *n;
 
322	int i;
323
324	mutex_lock(&blkcg_pol_mutex);
325	spin_lock_irq(&blkcg->lock);
326
327	/*
328	 * Note that stat reset is racy - it doesn't synchronize against
329	 * stat updates.  This is a debug feature which shouldn't exist
330	 * anyway.  If you get hit by a race, retry.
331	 */
332	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
333		for (i = 0; i < BLKCG_MAX_POLS; i++) {
334			struct blkcg_policy *pol = blkcg_policy[i];
335
336			if (blkcg_policy_enabled(blkg->q, pol) &&
337			    pol->pd_reset_stats_fn)
338				pol->pd_reset_stats_fn(blkg);
339		}
340	}
341
342	spin_unlock_irq(&blkcg->lock);
343	mutex_unlock(&blkcg_pol_mutex);
344	return 0;
345}
346
347static const char *blkg_dev_name(struct blkcg_gq *blkg)
 
348{
349	/* some drivers (floppy) instantiate a queue w/o disk registered */
350	if (blkg->q->backing_dev_info.dev)
351		return dev_name(blkg->q->backing_dev_info.dev);
352	return NULL;
353}
354
355/**
356 * blkcg_print_blkgs - helper for printing per-blkg data
357 * @sf: seq_file to print to
358 * @blkcg: blkcg of interest
359 * @prfill: fill function to print out a blkg
360 * @pol: policy in question
361 * @data: data to be passed to @prfill
362 * @show_total: to print out sum of prfill return values or not
363 *
364 * This function invokes @prfill on each blkg of @blkcg if pd for the
365 * policy specified by @pol exists.  @prfill is invoked with @sf, the
366 * policy data and @data.  If @show_total is %true, the sum of the return
367 * values from @prfill is printed with "Total" label at the end.
368 *
369 * This is to be used to construct print functions for
370 * cftype->read_seq_string method.
371 */
372void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
373		       u64 (*prfill)(struct seq_file *,
374				     struct blkg_policy_data *, int),
375		       const struct blkcg_policy *pol, int data,
376		       bool show_total)
377{
378	struct blkcg_gq *blkg;
379	struct hlist_node *n;
380	u64 total = 0;
381
382	spin_lock_irq(&blkcg->lock);
383	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
384		if (blkcg_policy_enabled(blkg->q, pol))
385			total += prfill(sf, blkg->pd[pol->plid], data);
386	spin_unlock_irq(&blkcg->lock);
387
388	if (show_total)
389		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
390}
391EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
392
393/**
394 * __blkg_prfill_u64 - prfill helper for a single u64 value
395 * @sf: seq_file to print to
396 * @pd: policy private data of interest
397 * @v: value to print
398 *
399 * Print @v to @sf for the device associated with @pd.
400 */
401u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
402{
403	const char *dname = blkg_dev_name(pd->blkg);
404
405	if (!dname)
406		return 0;
407
408	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
409	return v;
 
 
410}
411EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
412
413/**
414 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
415 * @sf: seq_file to print to
416 * @pd: policy private data of interest
417 * @rwstat: rwstat to print
418 *
419 * Print @rwstat to @sf for the device associated with @pd.
420 */
421u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
422			 const struct blkg_rwstat *rwstat)
423{
424	static const char *rwstr[] = {
425		[BLKG_RWSTAT_READ]	= "Read",
426		[BLKG_RWSTAT_WRITE]	= "Write",
427		[BLKG_RWSTAT_SYNC]	= "Sync",
428		[BLKG_RWSTAT_ASYNC]	= "Async",
429	};
430	const char *dname = blkg_dev_name(pd->blkg);
431	u64 v;
432	int i;
433
434	if (!dname)
435		return 0;
 
 
436
437	for (i = 0; i < BLKG_RWSTAT_NR; i++)
438		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
439			   (unsigned long long)rwstat->cnt[i]);
440
441	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
442	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
443	return v;
444}
445
446/**
447 * blkg_prfill_stat - prfill callback for blkg_stat
448 * @sf: seq_file to print to
449 * @pd: policy private data of interest
450 * @off: offset to the blkg_stat in @pd
451 *
452 * prfill callback for printing a blkg_stat.
453 */
454u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
455{
456	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
457}
458EXPORT_SYMBOL_GPL(blkg_prfill_stat);
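/*
 * Example (illustrative sketch, not part of the original file): how a
 * policy could build a cftype read_seq_string handler out of
 * blkcg_print_blkgs() and blkg_prfill_stat().  struct example_pd,
 * example_policy and the cftype table below are hypothetical.
 */
struct example_pd {
	struct blkg_policy_data	pd;		/* must be the first member */
	struct blkg_stat	serviced;
};

static struct blkcg_policy example_policy;	/* registered further below */

static int example_print_serviced(struct cgroup *cgrp, struct cftype *cft,
				  struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), blkg_prfill_stat,
			  &example_policy, cft->private, false);
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "example.serviced",
		.private = offsetof(struct example_pd, serviced),
		.read_seq_string = example_print_serviced,
	},
	{ }	/* terminate */
};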
459
460/**
461 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
462 * @sf: seq_file to print to
463 * @pd: policy private data of interest
464 * @off: offset to the blkg_rwstat in @pd
465 *
466 * prfill callback for printing a blkg_rwstat.
467 */
468u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
469		       int off)
470{
471	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 
472
473	return __blkg_prfill_rwstat(sf, pd, &rwstat);
474}
475EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
476
477/**
478 * blkg_conf_prep - parse and prepare for per-blkg config update
479 * @blkcg: target block cgroup
480 * @pol: target policy
481 * @input: input string
482 * @ctx: blkg_conf_ctx to be filled
483 *
484 * Parse per-blkg config update from @input and initialize @ctx with the
485 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
486 * value.  This function returns with RCU read lock and queue lock held and
487 * must be paired with blkg_conf_finish().
488 */
489int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
490		   const char *input, struct blkg_conf_ctx *ctx)
491	__acquires(rcu) __acquires(disk->queue->queue_lock)
492{
493	struct gendisk *disk;
494	struct blkcg_gq *blkg;
495	unsigned int major, minor;
496	unsigned long long v;
497	int part, ret;
 
498
499	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
500		return -EINVAL;
501
502	disk = get_gendisk(MKDEV(major, minor), &part);
503	if (!disk || part)
 
 
504		return -EINVAL;
505
506	rcu_read_lock();
507	spin_lock_irq(disk->queue->queue_lock);
 
508
509	if (blkcg_policy_enabled(disk->queue, pol))
510		blkg = blkg_lookup_create(blkcg, disk->queue);
511	else
512		blkg = ERR_PTR(-EINVAL);
513
514	if (IS_ERR(blkg)) {
515		ret = PTR_ERR(blkg);
516		rcu_read_unlock();
517		spin_unlock_irq(disk->queue->queue_lock);
518		put_disk(disk);
519		/*
520		 * If queue was bypassing, we should retry.  Do so after a
521		 * short msleep().  It isn't strictly necessary but queue
522		 * can be bypassing for some time and it's always nice to
523		 * avoid busy looping.
524		 */
525		if (ret == -EBUSY) {
526			msleep(10);
527			ret = restart_syscall();
528		}
529		return ret;
 
 
530	}
531
532	ctx->disk = disk;
533	ctx->blkg = blkg;
534	ctx->v = v;
535	return 0;
536}
537EXPORT_SYMBOL_GPL(blkg_conf_prep);
538
539/**
540 * blkg_conf_finish - finish up per-blkg config update
541 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
542 *
543 * Finish up after per-blkg config update.  This function must be paired
544 * with blkg_conf_prep().
545 */
546void blkg_conf_finish(struct blkg_conf_ctx *ctx)
547	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
548{
549	spin_unlock_irq(ctx->disk->queue->queue_lock);
550	rcu_read_unlock();
551	put_disk(ctx->disk);
552}
553EXPORT_SYMBOL_GPL(blkg_conf_finish);
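/*
 * Example (illustrative sketch, not part of the original file): a cftype
 * write_string handler pairing blkg_conf_prep() with blkg_conf_finish().
 * example_policy is the hypothetical policy from the sketch above.
 */
static int example_set_limit(struct cgroup *cgrp, struct cftype *cft,
			     const char *buf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	int ret;

	/* parses "MAJ:MIN VAL"; on success RCU and queue_lock are held */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	/*
	 * ctx.blkg is the blkg to update and ctx.v the new value; a real
	 * policy would update its data in ctx.blkg->pd[example_policy.plid]
	 * here, under the locks taken by blkg_conf_prep().
	 */

	blkg_conf_finish(&ctx);		/* drops queue_lock and RCU read lock */
	return 0;
}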
554
555struct cftype blkcg_files[] = {
556	{
557		.name = "reset_stats",
558		.write_u64 = blkcg_reset_stats,
559	},
560	{ }	/* terminate */
561};
 
562
563/**
564 * blkcg_pre_destroy - cgroup pre_destroy callback
565 * @cgroup: cgroup of interest
566 *
567 * This function is called when @cgroup is about to go away and responsible
568 * for shooting down all blkgs associated with @cgroup.  blkgs should be
569 * removed while holding both q and blkcg locks.  As blkcg lock is nested
570 * inside q lock, this function performs reverse double lock dancing.
571 *
572 * This is the blkcg counterpart of ioc_release_fn().
573 */
574static int blkcg_pre_destroy(struct cgroup *cgroup)
575{
576	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
577
578	spin_lock_irq(&blkcg->lock);
579
580	while (!hlist_empty(&blkcg->blkg_list)) {
581		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
582						struct blkcg_gq, blkcg_node);
583		struct request_queue *q = blkg->q;
584
585		if (spin_trylock(q->queue_lock)) {
586			blkg_destroy(blkg);
587			spin_unlock(q->queue_lock);
588		} else {
589			spin_unlock_irq(&blkcg->lock);
590			cpu_relax();
591			spin_lock_irq(&blkcg->lock);
592		}
593	}
594
595	spin_unlock_irq(&blkcg->lock);
596	return 0;
597}
598
599static void blkcg_destroy(struct cgroup *cgroup)
600{
601	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
602
603	if (blkcg != &blkcg_root)
604		kfree(blkcg);
605}
606
607static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
608{
609	static atomic64_t id_seq = ATOMIC64_INIT(0);
610	struct blkcg *blkcg;
611	struct cgroup *parent = cgroup->parent;
612
613	if (!parent) {
614		blkcg = &blkcg_root;
615		goto done;
616	}
617
618	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
619	if (!blkcg)
620		return ERR_PTR(-ENOMEM);
621
622	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
623	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
624done:
625	spin_lock_init(&blkcg->lock);
626	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
627	INIT_HLIST_HEAD(&blkcg->blkg_list);
628
629	return &blkcg->css;
630}
631
632/**
633 * blkcg_init_queue - initialize blkcg part of request queue
634 * @q: request_queue to initialize
635 *
636 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
637 * part of new request_queue @q.
638 *
639 * RETURNS:
640 * 0 on success, -errno on failure.
641 */
642int blkcg_init_queue(struct request_queue *q)
643{
644	might_sleep();
645
646	return blk_throtl_init(q);
647}
648
649/**
650 * blkcg_drain_queue - drain blkcg part of request_queue
651 * @q: request_queue to drain
652 *
653 * Called from blk_drain_queue().  Responsible for draining blkcg part.
654 */
655void blkcg_drain_queue(struct request_queue *q)
 
656{
657	lockdep_assert_held(q->queue_lock);
 
658
659	blk_throtl_drain(q);
660}
661
662/**
663 * blkcg_exit_queue - exit and release blkcg part of request_queue
664 * @q: request_queue being released
665 *
666 * Called from blk_release_queue().  Responsible for exiting blkcg part.
667 */
668void blkcg_exit_queue(struct request_queue *q)
 
669{
670	spin_lock_irq(q->queue_lock);
671	blkg_destroy_all(q);
672	spin_unlock_irq(q->queue_lock);
673
674	blk_throtl_exit(q);
675}
676
677/*
678 * We cannot support shared io contexts, as we have no mean to support
679 * two tasks with the same ioc in two different groups without major rework
680 * of the main cic data structures.  For now we allow a task to change
681 * its cgroup only if it's the only owner of its ioc.
682 */
683static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
684{
685	struct task_struct *task;
686	struct io_context *ioc;
687	int ret = 0;
688
689	/* task_lock() is needed to avoid races with exit_io_context() */
690	cgroup_taskset_for_each(task, cgrp, tset) {
691		task_lock(task);
692		ioc = task->io_context;
693		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
694			ret = -EINVAL;
695		task_unlock(task);
696		if (ret)
697			break;
698	}
699	return ret;
700}
701
702struct cgroup_subsys blkio_subsys = {
703	.name = "blkio",
704	.create = blkcg_create,
705	.can_attach = blkcg_can_attach,
706	.pre_destroy = blkcg_pre_destroy,
707	.destroy = blkcg_destroy,
708	.subsys_id = blkio_subsys_id,
709	.base_cftypes = blkcg_files,
710	.module = THIS_MODULE,
711};
712EXPORT_SYMBOL_GPL(blkio_subsys);
713
714/**
715 * blkcg_activate_policy - activate a blkcg policy on a request_queue
716 * @q: request_queue of interest
717 * @pol: blkcg policy to activate
718 *
719 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
720 * bypass mode to populate its blkgs with policy_data for @pol.
721 *
722 * Activation happens with @q bypassed, so nobody would be accessing blkgs
723 * from IO path.  Update of each blkg is protected by both queue and blkcg
724 * locks so that holding either lock and testing blkcg_policy_enabled() is
725 * always enough for dereferencing policy data.
726 *
727 * The caller is responsible for synchronizing [de]activations and policy
728 * [un]registrations.  Returns 0 on success, -errno on failure.
729 */
730int blkcg_activate_policy(struct request_queue *q,
731			  const struct blkcg_policy *pol)
732{
733	LIST_HEAD(pds);
734	struct blkcg_gq *blkg;
735	struct blkg_policy_data *pd, *n;
736	int cnt = 0, ret;
737
738	if (blkcg_policy_enabled(q, pol))
739		return 0;
740
741	blk_queue_bypass_start(q);
742
743	/* make sure the root blkg exists and count the existing blkgs */
744	spin_lock_irq(q->queue_lock);
745
746	rcu_read_lock();
747	blkg = __blkg_lookup_create(&blkcg_root, q);
748	rcu_read_unlock();
 
 
749
750	if (IS_ERR(blkg)) {
751		ret = PTR_ERR(blkg);
752		goto out_unlock;
753	}
754	q->root_blkg = blkg;
755
756	list_for_each_entry(blkg, &q->blkg_list, q_node)
757		cnt++;
758
759	spin_unlock_irq(q->queue_lock);
760
761	/* allocate policy_data for all existing blkgs */
762	while (cnt--) {
763		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
764		if (!pd) {
765			ret = -ENOMEM;
766			goto out_free;
767		}
768		list_add_tail(&pd->alloc_node, &pds);
 
 
769	}
 
 
770
771	/*
772	 * Install the allocated pds.  With @q bypassing, no new blkg
773	 * should have been created while the queue lock was dropped.
774	 */
775	spin_lock_irq(q->queue_lock);
776
777	list_for_each_entry(blkg, &q->blkg_list, q_node) {
778		if (WARN_ON(list_empty(&pds))) {
779			/* umm... this shouldn't happen, just abort */
780			ret = -ENOMEM;
781			goto out_unlock;
782		}
783		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
784		list_del_init(&pd->alloc_node);
785
786		/* grab blkcg lock too while installing @pd on @blkg */
787		spin_lock(&blkg->blkcg->lock);
788
789		blkg->pd[pol->plid] = pd;
790		pd->blkg = blkg;
791		pol->pd_init_fn(blkg);
792
793		spin_unlock(&blkg->blkcg->lock);
794	}
795
796	__set_bit(pol->plid, q->blkcg_pols);
797	ret = 0;
798out_unlock:
799	spin_unlock_irq(q->queue_lock);
800out_free:
801	blk_queue_bypass_end(q);
802	list_for_each_entry_safe(pd, n, &pds, alloc_node)
803		kfree(pd);
804	return ret;
805}
806EXPORT_SYMBOL_GPL(blkcg_activate_policy);
807
808/**
809 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
810 * @q: request_queue of interest
811 * @pol: blkcg policy to deactivate
812 *
813 * Deactivate @pol on @q.  Follows the same synchronization rules as
814 * blkcg_activate_policy().
815 */
816void blkcg_deactivate_policy(struct request_queue *q,
817			     const struct blkcg_policy *pol)
818{
819	struct blkcg_gq *blkg;
 
 
820
821	if (!blkcg_policy_enabled(q, pol))
822		return;
823
824	blk_queue_bypass_start(q);
825	spin_lock_irq(q->queue_lock);
 
826
827	__clear_bit(pol->plid, q->blkcg_pols);
 
 
 
828
829	/* if no policy is left, no need for blkgs - shoot them down */
830	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
831		blkg_destroy_all(q);
 
832
833	list_for_each_entry(blkg, &q->blkg_list, q_node) {
834		/* grab blkcg lock too while removing @pd from @blkg */
835		spin_lock(&blkg->blkcg->lock);
836
837		if (pol->pd_exit_fn)
838			pol->pd_exit_fn(blkg);
839
840		kfree(blkg->pd[pol->plid]);
841		blkg->pd[pol->plid] = NULL;
 
 
842
843		spin_unlock(&blkg->blkcg->lock);
844	}
845
846	spin_unlock_irq(q->queue_lock);
847	blk_queue_bypass_end(q);
848}
849EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
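/*
 * Example (illustrative sketch, not part of the original file): a policy
 * would typically activate itself on a queue from its own init path and
 * deactivate on teardown.  The function names are hypothetical and
 * example_policy is the policy from the sketches above.
 */
static int example_init_queue(struct request_queue *q)
{
	/* needs %GFP_KERNEL context; @q is bypassed during activation */
	return blkcg_activate_policy(q, &example_policy);
}

static void example_exit_queue(struct request_queue *q)
{
	blkcg_deactivate_policy(q, &example_policy);
}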
850
851/**
852 * blkcg_policy_register - register a blkcg policy
853 * @pol: blkcg policy to register
854 *
855 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
856 * successful registration.  Returns 0 on success and -errno on failure.
857 */
858int blkcg_policy_register(struct blkcg_policy *pol)
859{
860	int i, ret;
 
861
862	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
863		return -EINVAL;
864
865	mutex_lock(&blkcg_pol_mutex);
 
866
867	/* find an empty slot */
868	ret = -ENOSPC;
869	for (i = 0; i < BLKCG_MAX_POLS; i++)
870		if (!blkcg_policy[i])
871			break;
872	if (i >= BLKCG_MAX_POLS)
873		goto out_unlock;
874
875	/* register and update blkgs */
876	pol->plid = i;
877	blkcg_policy[i] = pol;
878
879	/* everything is in place, add intf files for the new policy */
880	if (pol->cftypes)
881		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
882	ret = 0;
883out_unlock:
884	mutex_unlock(&blkcg_pol_mutex);
885	return ret;
886}
887EXPORT_SYMBOL_GPL(blkcg_policy_register);
888
889/**
890 * blkcg_policy_unregister - unregister a blkcg policy
891 * @pol: blkcg policy to unregister
892 *
893 * Undo blkcg_policy_register(@pol).  Might sleep.
894 */
895void blkcg_policy_unregister(struct blkcg_policy *pol)
896{
897	mutex_lock(&blkcg_pol_mutex);
898
899	if (WARN_ON(blkcg_policy[pol->plid] != pol))
900		goto out_unlock;
901
902	/* kill the intf files first */
903	if (pol->cftypes)
904		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
 
905
906	/* unregister and update blkgs */
907	blkcg_policy[pol->plid] = NULL;
908out_unlock:
909	mutex_unlock(&blkcg_pol_mutex);
910}
911EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
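/*
 * Example (illustrative sketch, not part of the original file): minimal
 * module registration for the hypothetical example_policy, reusing
 * struct example_pd and example_files from the sketches above.
 * pd_init_fn is the per-blkg constructor invoked by blkg_alloc() and
 * blkcg_activate_policy().
 */
static void example_pd_init(struct blkcg_gq *blkg)
{
	/* per-blkg policy data is zeroed on allocation; set defaults here */
}

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct example_pd),
	.cftypes	= example_files,
	.pd_init_fn	= example_pd_init,
};

static int __init example_init(void)
{
	return blkcg_policy_register(&example_policy);
}

static void __exit example_exit(void)
{
	blkcg_policy_unregister(&example_policy);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");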
 
 
 
v3.1
   1/*
   2 * Common Block IO controller cgroup interface
   3 *
   4 * Based on ideas and code from CFQ, CFS and BFQ:
   5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   6 *
   7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
   8 *		      Paolo Valente <paolo.valente@unimore.it>
   9 *
  10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  11 * 	              Nauman Rafique <nauman@google.com>
  12 */
  13#include <linux/ioprio.h>
  14#include <linux/seq_file.h>
  15#include <linux/kdev_t.h>
  16#include <linux/module.h>
  17#include <linux/err.h>
  18#include <linux/blkdev.h>
  19#include <linux/slab.h>
 
 
 
  20#include "blk-cgroup.h"
  21#include <linux/genhd.h>
  22
  23#define MAX_KEY_LEN 100
  24
  25static DEFINE_SPINLOCK(blkio_list_lock);
  26static LIST_HEAD(blkio_list);
  27
  28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
  29EXPORT_SYMBOL_GPL(blkio_root_cgroup);
  30
  31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
  32						  struct cgroup *);
  33static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
  34static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
  35static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
  36static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
  37
  38/* for encoding cft->private value on file */
  39#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
  40/* What policy owns the file, proportional or throttle */
  41#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
  42#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
  43
  44struct cgroup_subsys blkio_subsys = {
  45	.name = "blkio",
  46	.create = blkiocg_create,
  47	.can_attach_task = blkiocg_can_attach_task,
  48	.attach_task = blkiocg_attach_task,
  49	.destroy = blkiocg_destroy,
  50	.populate = blkiocg_populate,
  51#ifdef CONFIG_BLK_CGROUP
  52	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
  53	.subsys_id = blkio_subsys_id,
  54#endif
  55	.use_id = 1,
  56	.module = THIS_MODULE,
  57};
  58EXPORT_SYMBOL_GPL(blkio_subsys);
  59
  60static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
  61					    struct blkio_policy_node *pn)
  62{
  63	list_add(&pn->node, &blkcg->policy_list);
 
  64}
 
  65
  66static inline bool cftype_blkg_same_policy(struct cftype *cft,
  67			struct blkio_group *blkg)
  68{
  69	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  70
  71	if (blkg->plid == plid)
  72		return 1;
  73
  74	return 0;
  75}
  76
  77/* Determines if policy node matches cgroup file being accessed */
  78static inline bool pn_matches_cftype(struct cftype *cft,
  79			struct blkio_policy_node *pn)
  80{
  81	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  82	int fileid = BLKIOFILE_ATTR(cft->private);
  83
  84	return (plid == pn->plid && fileid == pn->fileid);
  85}
 
  86
  87/* Must be called with blkcg->lock held */
  88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
  89{
  90	list_del(&pn->node);
  91}
  92
  93/* Must be called with blkcg->lock held */
  94static struct blkio_policy_node *
  95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
  96		enum blkio_policy_id plid, int fileid)
 
 
 
  97{
  98	struct blkio_policy_node *pn;
  99
 100	list_for_each_entry(pn, &blkcg->policy_list, node) {
 101		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
 102			return pn;
 103	}
 104
 105	return NULL;
 106}
 
 107
 108struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 109{
 110	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
 111			    struct blkio_cgroup, css);
 112}
 113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 114
 115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
 116{
 117	return container_of(task_subsys_state(tsk, blkio_subsys_id),
 118			    struct blkio_cgroup, css);
 119}
 120EXPORT_SYMBOL_GPL(task_blkio_cgroup);
 121
 122static inline void
 123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 124{
 125	struct blkio_policy_type *blkiop;
 126
 127	list_for_each_entry(blkiop, &blkio_list, list) {
 128		/* If this policy does not own the blkg, do not send updates */
 129		if (blkiop->plid != blkg->plid)
 130			continue;
 131		if (blkiop->ops.blkio_update_group_weight_fn)
 132			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
 133							blkg, weight);
 134	}
 135}
 136
 137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
 138				int fileid)
 139{
 140	struct blkio_policy_type *blkiop;
 
 141
 142	list_for_each_entry(blkiop, &blkio_list, list) {
 143
 144		/* If this policy does not own the blkg, do not send updates */
 145		if (blkiop->plid != blkg->plid)
 146			continue;
 147
 148		if (fileid == BLKIO_THROTL_read_bps_device
 149		    && blkiop->ops.blkio_update_group_read_bps_fn)
 150			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
 151								blkg, bps);
 152
 153		if (fileid == BLKIO_THROTL_write_bps_device
 154		    && blkiop->ops.blkio_update_group_write_bps_fn)
 155			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
 156								blkg, bps);
 157	}
 158}
 159
 160static inline void blkio_update_group_iops(struct blkio_group *blkg,
 161			unsigned int iops, int fileid)
 162{
 163	struct blkio_policy_type *blkiop;
 164
 165	list_for_each_entry(blkiop, &blkio_list, list) {
 
 166
 167		/* If this policy does not own the blkg, do not send updates */
 168		if (blkiop->plid != blkg->plid)
 169			continue;
 170
 171		if (fileid == BLKIO_THROTL_read_iops_device
 172		    && blkiop->ops.blkio_update_group_read_iops_fn)
 173			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
 174								blkg, iops);
 175
 176		if (fileid == BLKIO_THROTL_write_iops_device
 177		    && blkiop->ops.blkio_update_group_write_iops_fn)
 178			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
 179								blkg,iops);
 180	}
 181}
 182
 183/*
 184 * Add to the appropriate stat variable depending on the request type.
 185 * This should be called with the blkg->stats_lock held.
 186 */
 187static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
 188				bool sync)
 189{
 190	if (direction)
 191		stat[BLKIO_STAT_WRITE] += add;
 192	else
 193		stat[BLKIO_STAT_READ] += add;
 194	if (sync)
 195		stat[BLKIO_STAT_SYNC] += add;
 196	else
 197		stat[BLKIO_STAT_ASYNC] += add;
 198}
 199
 200/*
 201 * Decrements the appropriate stat variable if non-zero depending on the
 202 * request type. Panics on value being zero.
 203 * This should be called with the blkg->stats_lock held.
 204 */
 205static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 206{
 207	if (direction) {
 208		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
 209		stat[BLKIO_STAT_WRITE]--;
 210	} else {
 211		BUG_ON(stat[BLKIO_STAT_READ] == 0);
 212		stat[BLKIO_STAT_READ]--;
 213	}
 214	if (sync) {
 215		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
 216		stat[BLKIO_STAT_SYNC]--;
 217	} else {
 218		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
 219		stat[BLKIO_STAT_ASYNC]--;
 220	}
 221}
 222
 223#ifdef CONFIG_DEBUG_BLK_CGROUP
 224/* This should be called with the blkg->stats_lock held. */
 225static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 226						struct blkio_group *curr_blkg)
 227{
 228	if (blkio_blkg_waiting(&blkg->stats))
 229		return;
 230	if (blkg == curr_blkg)
 231		return;
 232	blkg->stats.start_group_wait_time = sched_clock();
 233	blkio_mark_blkg_waiting(&blkg->stats);
 234}
 235
 236/* This should be called with the blkg->stats_lock held. */
 237static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
 238{
 239	unsigned long long now;
 240
 241	if (!blkio_blkg_waiting(stats))
 242		return;
 243
 244	now = sched_clock();
 245	if (time_after64(now, stats->start_group_wait_time))
 246		stats->group_wait_time += now - stats->start_group_wait_time;
 247	blkio_clear_blkg_waiting(stats);
 248}
 249
 250/* This should be called with the blkg->stats_lock held. */
 251static void blkio_end_empty_time(struct blkio_group_stats *stats)
 252{
 253	unsigned long long now;
 254
 255	if (!blkio_blkg_empty(stats))
 256		return;
 257
 258	now = sched_clock();
 259	if (time_after64(now, stats->start_empty_time))
 260		stats->empty_time += now - stats->start_empty_time;
 261	blkio_clear_blkg_empty(stats);
 262}
 
 263
 264void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
 
 
 265{
 266	unsigned long flags;
 
 267
 268	spin_lock_irqsave(&blkg->stats_lock, flags);
 269	BUG_ON(blkio_blkg_idling(&blkg->stats));
 270	blkg->stats.start_idle_time = sched_clock();
 271	blkio_mark_blkg_idling(&blkg->stats);
 272	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 273}
 274EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
 275
 276void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
 277{
 278	unsigned long flags;
 279	unsigned long long now;
 280	struct blkio_group_stats *stats;
 281
 282	spin_lock_irqsave(&blkg->stats_lock, flags);
 283	stats = &blkg->stats;
 284	if (blkio_blkg_idling(stats)) {
 285		now = sched_clock();
 286		if (time_after64(now, stats->start_idle_time))
 287			stats->idle_time += now - stats->start_idle_time;
 288		blkio_clear_blkg_idling(stats);
 289	}
 290	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 291}
 292EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 293
 294void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
 295{
 296	unsigned long flags;
 297	struct blkio_group_stats *stats;
 298
 299	spin_lock_irqsave(&blkg->stats_lock, flags);
 300	stats = &blkg->stats;
 301	stats->avg_queue_size_sum +=
 302			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 303			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
 304	stats->avg_queue_size_samples++;
 305	blkio_update_group_wait_time(stats);
 306	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 307}
 308EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
 309
 310void blkiocg_set_start_empty_time(struct blkio_group *blkg)
 
 311{
 312	unsigned long flags;
 313	struct blkio_group_stats *stats;
 314
 315	spin_lock_irqsave(&blkg->stats_lock, flags);
 316	stats = &blkg->stats;
 317
 318	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
 319			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
 320		spin_unlock_irqrestore(&blkg->stats_lock, flags);
 321		return;
 322	}
 323
 324	/*
 325	 * group is already marked empty. This can happen if cfqq got new
 326	 * request in parent group and moved to this group while being added
 327	 * to service tree. Just ignore the event and move on.
 328	 */
 329	if(blkio_blkg_empty(stats)) {
 330		spin_unlock_irqrestore(&blkg->stats_lock, flags);
 331		return;
 332	}
 333
 334	stats->start_empty_time = sched_clock();
 335	blkio_mark_blkg_empty(stats);
 336	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 337}
 338EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 339
 340void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 341			unsigned long dequeue)
 342{
 343	blkg->stats.dequeue += dequeue;
 344}
 345EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
 346#else
 347static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
 348					struct blkio_group *curr_blkg) {}
 349static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
 350#endif
 351
 352void blkiocg_update_io_add_stats(struct blkio_group *blkg,
 353			struct blkio_group *curr_blkg, bool direction,
 354			bool sync)
 355{
 356	unsigned long flags;
 357
 358	spin_lock_irqsave(&blkg->stats_lock, flags);
 359	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 360			sync);
 361	blkio_end_empty_time(&blkg->stats);
 362	blkio_set_start_group_wait_time(blkg, curr_blkg);
 363	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 364}
 365EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
 366
 367void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 368						bool direction, bool sync)
 369{
 370	unsigned long flags;
 371
 372	spin_lock_irqsave(&blkg->stats_lock, flags);
 373	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
 374					direction, sync);
 375	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 376}
 377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 378
 379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
 380				unsigned long unaccounted_time)
 381{
 382	unsigned long flags;
 383
 384	spin_lock_irqsave(&blkg->stats_lock, flags);
 385	blkg->stats.time += time;
 386#ifdef CONFIG_DEBUG_BLK_CGROUP
 387	blkg->stats.unaccounted_time += unaccounted_time;
 388#endif
 389	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 390}
 391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 392
 393/*
 394 * should be called under rcu read lock or queue lock to make sure blkg pointer
 395 * is valid.
 396 */
 397void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 398				uint64_t bytes, bool direction, bool sync)
 399{
 400	struct blkio_group_stats_cpu *stats_cpu;
 401	unsigned long flags;
 402
 403	/*
 404	 * Disabling interrupts to provide mutual exclusion between two
 405	 * writes on same cpu. It probably is not needed for 64bit. Not
 406	 * optimizing that case yet.
 407	 */
 408	local_irq_save(flags);
 409
 410	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
 411
 412	u64_stats_update_begin(&stats_cpu->syncp);
 413	stats_cpu->sectors += bytes >> 9;
 414	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
 415			1, direction, sync);
 416	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
 417			bytes, direction, sync);
 418	u64_stats_update_end(&stats_cpu->syncp);
 419	local_irq_restore(flags);
 420}
 421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
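/*
 * Example (illustrative sketch, not part of the original file): how a
 * policy's dispatch path might account a request with this helper.  The
 * function name is hypothetical; per the comment above, the caller must
 * hold the RCU read lock or the queue lock so that @blkg stays valid.
 */
static void example_account_dispatch(struct blkio_group *blkg,
				     struct request *rq)
{
	blkiocg_update_dispatch_stats(blkg, blk_rq_bytes(rq),
				      rq_data_dir(rq) == WRITE,
				      rq_is_sync(rq));
}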
 422
 423void blkiocg_update_completion_stats(struct blkio_group *blkg,
 424	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
 425{
 426	struct blkio_group_stats *stats;
 427	unsigned long flags;
 428	unsigned long long now = sched_clock();
 429
 430	spin_lock_irqsave(&blkg->stats_lock, flags);
 431	stats = &blkg->stats;
 432	if (time_after64(now, io_start_time))
 433		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
 434				now - io_start_time, direction, sync);
 435	if (time_after64(io_start_time, start_time))
 436		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
 437				io_start_time - start_time, direction, sync);
 438	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 439}
 440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 441
 442/*  Merged stats are per cpu.  */
 443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 444					bool sync)
 445{
 446	struct blkio_group_stats_cpu *stats_cpu;
 447	unsigned long flags;
 448
 449	/*
 450	 * Disabling interrupts to provide mutual exclusion between two
 451	 * writes on same cpu. It probably is not needed for 64bit. Not
 452	 * optimizing that case yet.
 453	 */
 454	local_irq_save(flags);
 455
 456	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
 457
 458	u64_stats_update_begin(&stats_cpu->syncp);
 459	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
 460				direction, sync);
 461	u64_stats_update_end(&stats_cpu->syncp);
 462	local_irq_restore(flags);
 463}
 464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 465
 466/*
 467 * This function allocates the per-cpu stats for a blkio_group.  It should be
 468 * called from a sleepable context, as alloc_percpu() requires that.
 
 
 469 */
 470int blkio_alloc_blkg_stats(struct blkio_group *blkg)
 471{
 472	/* Allocate memory for per cpu stats */
 473	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
 474	if (!blkg->stats_cpu)
 475		return -ENOMEM;
 476	return 0;
 477}
 478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
 479
 480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 481		struct blkio_group *blkg, void *key, dev_t dev,
 482		enum blkio_policy_id plid)
 483{
 484	unsigned long flags;
 485
 486	spin_lock_irqsave(&blkcg->lock, flags);
 487	spin_lock_init(&blkg->stats_lock);
 488	rcu_assign_pointer(blkg->key, key);
 489	blkg->blkcg_id = css_id(&blkcg->css);
 490	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 491	blkg->plid = plid;
 492	spin_unlock_irqrestore(&blkcg->lock, flags);
 493	/* Need to take css reference ? */
 494	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 495	blkg->dev = dev;
 496}
 497EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
 498
 499static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 500{
 501	hlist_del_init_rcu(&blkg->blkcg_node);
 502	blkg->blkcg_id = 0;
 503}
 504
 505/*
 506 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
 507 * indicating that the blkio_group was unhashed by the time we got to it.
 508 */
 509int blkiocg_del_blkio_group(struct blkio_group *blkg)
 510{
 511	struct blkio_cgroup *blkcg;
 512	unsigned long flags;
 513	struct cgroup_subsys_state *css;
 514	int ret = 1;
 515
 516	rcu_read_lock();
 517	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
 518	if (css) {
 519		blkcg = container_of(css, struct blkio_cgroup, css);
 520		spin_lock_irqsave(&blkcg->lock, flags);
 521		if (!hlist_unhashed(&blkg->blkcg_node)) {
 522			__blkiocg_del_blkio_group(blkg);
 523			ret = 0;
 524		}
 525		spin_unlock_irqrestore(&blkcg->lock, flags);
 526	}
 527
 528	rcu_read_unlock();
 529	return ret;
 530}
 531EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 532
 533/* called under rcu_read_lock(). */
 534struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
 535{
 536	struct blkio_group *blkg;
 537	struct hlist_node *n;
 538	void *__key;
 539
 540	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
 541		__key = blkg->key;
 542		if (__key == key)
 543			return blkg;
 544	}
 545
 546	return NULL;
 547}
 548EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
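/*
 * Example (illustrative sketch, not part of the original file): in this
 * v3.1 API a policy looks up its group with the same opaque @key it
 * passed to blkiocg_add_blkio_group().  The helper name is hypothetical
 * and, as noted above, the lookup must run under rcu_read_lock().
 */
static bool example_group_exists(struct blkio_cgroup *blkcg, void *key)
{
	bool found;

	rcu_read_lock();
	found = blkiocg_lookup_group(blkcg, key) != NULL;
	rcu_read_unlock();

	return found;
}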
 549
 550static void blkio_reset_stats_cpu(struct blkio_group *blkg)
 551{
 552	struct blkio_group_stats_cpu *stats_cpu;
 553	int i, j, k;
 
 554	/*
 555	 * Note: on a 64-bit arch this should not be an issue.  On a 32-bit
 556	 * arch this may return an inconsistent value, since a 64-bit update
 557	 * on 32-bit is not atomic.  Taking care of that corner case would
 558	 * make the code very complicated, e.g. sending IPIs to CPUs and
 559	 * taking care of the stats of offline CPUs.
 560	 *
 561	 * Stat reset is more of a debug feature anyway and this is a corner
 562	 * case, so the code is not being complicated until and unless this
 563	 * becomes a real issue.
 564	 */
 565	for_each_possible_cpu(i) {
 566		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
 567		stats_cpu->sectors = 0;
 568		for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
 569			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
 570				stats_cpu->stat_arr_cpu[j][k] = 0;
 571	}
 572}
 
 573
 574static int
 575blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 576{
 577	struct blkio_cgroup *blkcg;
 578	struct blkio_group *blkg;
 579	struct blkio_group_stats *stats;
 580	struct hlist_node *n;
 581	uint64_t queued[BLKIO_STAT_TOTAL];
 582	int i;
 583#ifdef CONFIG_DEBUG_BLK_CGROUP
 584	bool idling, waiting, empty;
 585	unsigned long long now = sched_clock();
 586#endif
 587
 588	blkcg = cgroup_to_blkio_cgroup(cgroup);
 589	spin_lock_irq(&blkcg->lock);
 590	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 591		spin_lock(&blkg->stats_lock);
 592		stats = &blkg->stats;
 593#ifdef CONFIG_DEBUG_BLK_CGROUP
 594		idling = blkio_blkg_idling(stats);
 595		waiting = blkio_blkg_waiting(stats);
 596		empty = blkio_blkg_empty(stats);
 597#endif
 598		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
 599			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
 600		memset(stats, 0, sizeof(struct blkio_group_stats));
 601		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
 602			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
 603#ifdef CONFIG_DEBUG_BLK_CGROUP
 604		if (idling) {
 605			blkio_mark_blkg_idling(stats);
 606			stats->start_idle_time = now;
 607		}
 608		if (waiting) {
 609			blkio_mark_blkg_waiting(stats);
 610			stats->start_group_wait_time = now;
 611		}
 612		if (empty) {
 613			blkio_mark_blkg_empty(stats);
 614			stats->start_empty_time = now;
 615		}
 616#endif
 617		spin_unlock(&blkg->stats_lock);
 618
 619		/* Reset Per cpu stats which don't take blkg->stats_lock */
 620		blkio_reset_stats_cpu(blkg);
 621	}
 622
 623	spin_unlock_irq(&blkcg->lock);
 
 624	return 0;
 625}
 626
 627static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
 628				int chars_left, bool diskname_only)
 629{
 630	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
 631	chars_left -= strlen(str);
 632	if (chars_left <= 0) {
 633		printk(KERN_WARNING
 634			"Possibly incorrect cgroup stat display format");
 635		return;
 636	}
 637	if (diskname_only)
 638		return;
 639	switch (type) {
 640	case BLKIO_STAT_READ:
 641		strlcat(str, " Read", chars_left);
 642		break;
 643	case BLKIO_STAT_WRITE:
 644		strlcat(str, " Write", chars_left);
 645		break;
 646	case BLKIO_STAT_SYNC:
 647		strlcat(str, " Sync", chars_left);
 648		break;
 649	case BLKIO_STAT_ASYNC:
 650		strlcat(str, " Async", chars_left);
 651		break;
 652	case BLKIO_STAT_TOTAL:
 653		strlcat(str, " Total", chars_left);
 654		break;
 655	default:
 656		strlcat(str, " Invalid", chars_left);
 657	}
 658}
 659
 660static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
 661				struct cgroup_map_cb *cb, dev_t dev)
 662{
 663	blkio_get_key_name(0, dev, str, chars_left, true);
 664	cb->fill(cb, str, val);
 665	return val;
 666}
 
 667
 668
 669static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
 670			enum stat_type_cpu type, enum stat_sub_type sub_type)
 671{
 672	int cpu;
 673	struct blkio_group_stats_cpu *stats_cpu;
 674	u64 val = 0, tval;
 675
 676	for_each_possible_cpu(cpu) {
 677		unsigned int start;
 678		stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
 679
 680		do {
 681			start = u64_stats_fetch_begin(&stats_cpu->syncp);
 682			if (type == BLKIO_STAT_CPU_SECTORS)
 683				tval = stats_cpu->sectors;
 684			else
 685				tval = stats_cpu->stat_arr_cpu[type][sub_type];
 686		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
 687
 688		val += tval;
 689	}
 690
 691	return val;
 692}
 
 693
 694static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
 695		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
 696{
 697	uint64_t disk_total, val;
 698	char key_str[MAX_KEY_LEN];
 699	enum stat_sub_type sub_type;
 700
 701	if (type == BLKIO_STAT_CPU_SECTORS) {
 702		val = blkio_read_stat_cpu(blkg, type, 0);
 703		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
 704	}
 705
 706	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 707			sub_type++) {
 708		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
 709		val = blkio_read_stat_cpu(blkg, type, sub_type);
 710		cb->fill(cb, key_str, val);
 711	}
 712
 713	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
 714			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
 715
 716	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
 717	cb->fill(cb, key_str, disk_total);
 718	return disk_total;
 719}
 720
 721/* This should be called with blkg->stats_lock held */
 722static uint64_t blkio_get_stat(struct blkio_group *blkg,
 723		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
 724{
 725	uint64_t disk_total;
 726	char key_str[MAX_KEY_LEN];
 727	enum stat_sub_type sub_type;
 728
 729	if (type == BLKIO_STAT_TIME)
 730		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 731					blkg->stats.time, cb, dev);
 732#ifdef CONFIG_DEBUG_BLK_CGROUP
 733	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
 734		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 735					blkg->stats.unaccounted_time, cb, dev);
 736	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 737		uint64_t sum = blkg->stats.avg_queue_size_sum;
 738		uint64_t samples = blkg->stats.avg_queue_size_samples;
 739		if (samples)
 740			do_div(sum, samples);
 741		else
 742			sum = 0;
 743		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
 744	}
 745	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
 746		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 747					blkg->stats.group_wait_time, cb, dev);
 748	if (type == BLKIO_STAT_IDLE_TIME)
 749		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 750					blkg->stats.idle_time, cb, dev);
 751	if (type == BLKIO_STAT_EMPTY_TIME)
 752		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 753					blkg->stats.empty_time, cb, dev);
 754	if (type == BLKIO_STAT_DEQUEUE)
 755		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 756					blkg->stats.dequeue, cb, dev);
 757#endif
 758
 759	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
 760			sub_type++) {
 761		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
 762		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
 763	}
 764	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
 765			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
 766	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
 767	cb->fill(cb, key_str, disk_total);
 768	return disk_total;
 769}
 
 770
 771static int blkio_check_dev_num(dev_t dev)
 772{
 773	int part = 0;
 774	struct gendisk *disk;
 775
 776	disk = get_gendisk(dev, &part);
 777	if (!disk || part)
 778		return -ENODEV;
 779
 780	return 0;
 781}
 
 782
 783static int blkio_policy_parse_and_set(char *buf,
 784	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 785{
 786	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 787	int ret;
 788	unsigned long major, minor;
 789	int i = 0;
 790	dev_t dev;
 791	u64 temp;
 792
 793	memset(s, 0, sizeof(s));
 794
 795	while ((p = strsep(&buf, " ")) != NULL) {
 796		if (!*p)
 797			continue;
 798
 799		s[i++] = p;
 800
 801		/* Prevent inputting too many things */
 802		if (i == 3)
 803			break;
 804	}
 805
 806	if (i != 2)
 807		return -EINVAL;
 808
 809	p = strsep(&s[0], ":");
 810	if (p != NULL)
 811		major_s = p;
 812	else
 813		return -EINVAL;
 814
 815	minor_s = s[0];
 816	if (!minor_s)
 817		return -EINVAL;
 818
 819	ret = strict_strtoul(major_s, 10, &major);
 820	if (ret)
 821		return -EINVAL;
 
 822
 823	ret = strict_strtoul(minor_s, 10, &minor);
 824	if (ret)
 825		return -EINVAL;
 826
 827	dev = MKDEV(major, minor);
 828
 829	ret = strict_strtoull(s[1], 10, &temp);
 830	if (ret)
 831		return -EINVAL;
 832
 833	/* For rule removal, do not check for device presence. */
 834	if (temp) {
 835		ret = blkio_check_dev_num(dev);
 836		if (ret)
 837			return ret;
 838	}
 839
 840	newpn->dev = dev;
 841
 842	switch (plid) {
 843	case BLKIO_POLICY_PROP:
 844		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
 845		     temp > BLKIO_WEIGHT_MAX)
 846			return -EINVAL;
 847
 848		newpn->plid = plid;
 849		newpn->fileid = fileid;
 850		newpn->val.weight = temp;
 851		break;
 852	case BLKIO_POLICY_THROTL:
 853		switch(fileid) {
 854		case BLKIO_THROTL_read_bps_device:
 855		case BLKIO_THROTL_write_bps_device:
 856			newpn->plid = plid;
 857			newpn->fileid = fileid;
 858			newpn->val.bps = temp;
 859			break;
 860		case BLKIO_THROTL_read_iops_device:
 861		case BLKIO_THROTL_write_iops_device:
 862			if (temp > THROTL_IOPS_MAX)
 863				return -EINVAL;
 864
 865			newpn->plid = plid;
 866			newpn->fileid = fileid;
 867			newpn->val.iops = (unsigned int)temp;
 868			break;
 869		}
 870		break;
 871	default:
 872		BUG();
 873	}
 874
 
 
 
 875	return 0;
 876}
 
 877
 878unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 879			      dev_t dev)
 880{
 881	struct blkio_policy_node *pn;
 882
 883	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
 884				BLKIO_PROP_weight_device);
 885	if (pn)
 886		return pn->val.weight;
 887	else
 888		return blkcg->weight;
 889}
 890EXPORT_SYMBOL_GPL(blkcg_get_weight);
 891
 892uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 893{
 894	struct blkio_policy_node *pn;
 
 
 895
 896	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 897				BLKIO_THROTL_read_bps_device);
 898	if (pn)
 899		return pn->val.bps;
 900	else
 901		return -1;
 902}
 903
 904uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
 905{
 906	struct blkio_policy_node *pn;
 907	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 908				BLKIO_THROTL_write_bps_device);
 909	if (pn)
 910		return pn->val.bps;
 911	else
 912		return -1;
 913}
 914
 915unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
 916{
 917	struct blkio_policy_node *pn;
 918
 919	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 920				BLKIO_THROTL_read_iops_device);
 921	if (pn)
 922		return pn->val.iops;
 923	else
 924		return -1;
 925}
 926
 927unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
 928{
 929	struct blkio_policy_node *pn;
 930	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
 931				BLKIO_THROTL_write_iops_device);
 932	if (pn)
 933		return pn->val.iops;
 934	else
 935		return -1;
 936}
 937
938/* Check whether the user asked to delete a policy rule */
 939static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
 940{
 941	switch(pn->plid) {
 942	case BLKIO_POLICY_PROP:
 943		if (pn->val.weight == 0)
 944			return 1;
 945		break;
 946	case BLKIO_POLICY_THROTL:
 947		switch(pn->fileid) {
 948		case BLKIO_THROTL_read_bps_device:
 949		case BLKIO_THROTL_write_bps_device:
 950			if (pn->val.bps == 0)
 951				return 1;
 952			break;
 953		case BLKIO_THROTL_read_iops_device:
 954		case BLKIO_THROTL_write_iops_device:
 955			if (pn->val.iops == 0)
 956				return 1;
 957		}
 958		break;
 959	default:
 960		BUG();
 961	}
 962
 963	return 0;
 964}
 965
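/* Copy the value carried by @newpn into the existing rule @oldpn. */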
 966static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 967					struct blkio_policy_node *newpn)
 968{
 969	switch(oldpn->plid) {
 970	case BLKIO_POLICY_PROP:
 971		oldpn->val.weight = newpn->val.weight;
 972		break;
 973	case BLKIO_POLICY_THROTL:
 974		switch(newpn->fileid) {
 975		case BLKIO_THROTL_read_bps_device:
 976		case BLKIO_THROTL_write_bps_device:
 977			oldpn->val.bps = newpn->val.bps;
 978			break;
 979		case BLKIO_THROTL_read_iops_device:
 980		case BLKIO_THROTL_write_iops_device:
 981			oldpn->val.iops = newpn->val.iops;
 982		}
 983		break;
 984	default:
 985		BUG();
 986	}
 987}
 988
 989/*
990 * Some rules/values in blkg have changed. Propagate those to the respective
 991 * policies.
 992 */
 993static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
 994		struct blkio_group *blkg, struct blkio_policy_node *pn)
 995{
 996	unsigned int weight, iops;
 997	u64 bps;
 998
 999	switch(pn->plid) {
1000	case BLKIO_POLICY_PROP:
1001		weight = pn->val.weight ? pn->val.weight :
1002				blkcg->weight;
1003		blkio_update_group_weight(blkg, weight);
1004		break;
1005	case BLKIO_POLICY_THROTL:
1006		switch(pn->fileid) {
1007		case BLKIO_THROTL_read_bps_device:
1008		case BLKIO_THROTL_write_bps_device:
1009			bps = pn->val.bps ? pn->val.bps : (-1);
1010			blkio_update_group_bps(blkg, bps, pn->fileid);
1011			break;
1012		case BLKIO_THROTL_read_iops_device:
1013		case BLKIO_THROTL_write_iops_device:
1014			iops = pn->val.iops ? pn->val.iops : (-1);
1015			blkio_update_group_iops(blkg, iops, pn->fileid);
1016			break;
1017		}
1018		break;
1019	default:
1020		BUG();
1021	}
1022}
1023
1024/*
1025 * A policy node rule has been updated. Propagate this update to all the
1026 * block groups which might be affected by it.
1027 */
1028static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1029				struct blkio_policy_node *pn)
1030{
1031	struct blkio_group *blkg;
1032	struct hlist_node *n;
1033
1034	spin_lock(&blkio_list_lock);
1035	spin_lock_irq(&blkcg->lock);
1036
1037	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1038		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
1039			continue;
1040		blkio_update_blkg_policy(blkcg, blkg, pn);
1041	}
1042
1043	spin_unlock_irq(&blkcg->lock);
1044	spin_unlock(&blkio_list_lock);
1045}
1046
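/*
 * Common write handler for the per-device rule files (weight_device and the
 * throttle.*_device files).  The buffer has the form "<major>:<minor> <value>",
 * e.g. writing "8:16 200" to blkio.weight_device sets a weight of 200 for
 * device 8:16, while a value of 0 deletes the rule.  The parsed rule is
 * inserted into, updated in or removed from the cgroup's policy list, and
 * the change is then propagated to the affected block groups.
 */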
1047static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1048 				       const char *buffer)
1049{
1050	int ret = 0;
1051	char *buf;
1052	struct blkio_policy_node *newpn, *pn;
1053	struct blkio_cgroup *blkcg;
1054	int keep_newpn = 0;
1055	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1056	int fileid = BLKIOFILE_ATTR(cft->private);
1057
1058	buf = kstrdup(buffer, GFP_KERNEL);
1059	if (!buf)
1060		return -ENOMEM;
1061
1062	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1063	if (!newpn) {
1064		ret = -ENOMEM;
1065		goto free_buf;
1066	}
1067
1068	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
1069	if (ret)
1070		goto free_newpn;
1071
1072	blkcg = cgroup_to_blkio_cgroup(cgrp);
1073
1074	spin_lock_irq(&blkcg->lock);
1075
1076	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1077	if (!pn) {
1078		if (!blkio_delete_rule_command(newpn)) {
1079			blkio_policy_insert_node(blkcg, newpn);
1080			keep_newpn = 1;
1081		}
1082		spin_unlock_irq(&blkcg->lock);
1083		goto update_io_group;
1084	}
1085
1086	if (blkio_delete_rule_command(newpn)) {
1087		blkio_policy_delete_node(pn);
1088		spin_unlock_irq(&blkcg->lock);
1089		goto update_io_group;
1090	}
1091	spin_unlock_irq(&blkcg->lock);
1092
1093	blkio_update_policy_rule(pn, newpn);
1094
1095update_io_group:
1096	blkio_update_policy_node_blkg(blkcg, newpn);
1097
1098free_newpn:
1099	if (!keep_newpn)
1100		kfree(newpn);
1101free_buf:
1102	kfree(buf);
1103	return ret;
1104}
1105
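/* Emit one "major:minor<TAB>value" line for @pn into the seq_file. */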
1106static void
1107blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
1108{
1109	switch(pn->plid) {
1110		case BLKIO_POLICY_PROP:
1111			if (pn->fileid == BLKIO_PROP_weight_device)
1112				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1113					MINOR(pn->dev), pn->val.weight);
1114			break;
1115		case BLKIO_POLICY_THROTL:
1116			switch(pn->fileid) {
1117			case BLKIO_THROTL_read_bps_device:
1118			case BLKIO_THROTL_write_bps_device:
1119				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1120					MINOR(pn->dev), pn->val.bps);
1121				break;
1122			case BLKIO_THROTL_read_iops_device:
1123			case BLKIO_THROTL_write_iops_device:
1124				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1125					MINOR(pn->dev), pn->val.iops);
1126				break;
1127			}
1128			break;
1129		default:
1130			BUG();
1131	}
1132}
1133
1134/* cgroup files which read their data from policy nodes end up here */
1135static void blkio_read_policy_node_files(struct cftype *cft,
1136			struct blkio_cgroup *blkcg, struct seq_file *m)
1137{
1138	struct blkio_policy_node *pn;
1139
1140	if (!list_empty(&blkcg->policy_list)) {
1141		spin_lock_irq(&blkcg->lock);
1142		list_for_each_entry(pn, &blkcg->policy_list, node) {
1143			if (!pn_matches_cftype(cft, pn))
1144				continue;
1145			blkio_print_policy_node(m, pn);
1146		}
1147		spin_unlock_irq(&blkcg->lock);
1148	}
1149}
1150
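/* Read handler for the per-device rule files; print every matching rule. */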
1151static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1152				struct seq_file *m)
1153{
1154	struct blkio_cgroup *blkcg;
1155	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1156	int name = BLKIOFILE_ATTR(cft->private);
1157
1158	blkcg = cgroup_to_blkio_cgroup(cgrp);
1159
1160	switch(plid) {
1161	case BLKIO_POLICY_PROP:
1162		switch(name) {
1163		case BLKIO_PROP_weight_device:
1164			blkio_read_policy_node_files(cft, blkcg, m);
1165			return 0;
1166		default:
1167			BUG();
1168		}
1169		break;
1170	case BLKIO_POLICY_THROTL:
1171		switch(name){
1172		case BLKIO_THROTL_read_bps_device:
1173		case BLKIO_THROTL_write_bps_device:
1174		case BLKIO_THROTL_read_iops_device:
1175		case BLKIO_THROTL_write_iops_device:
1176			blkio_read_policy_node_files(cft, blkcg, m);
1177			return 0;
1178		default:
1179			BUG();
1180		}
1181		break;
1182	default:
1183		BUG();
1184	}
1185
1186	return 0;
1187}
1188
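/*
 * Walk all blkio_groups of @blkcg and emit the statistic selected by @type
 * for each device.  When @pcpu is set the lockless per-cpu counters are
 * read; otherwise stats_lock is held around the read.  If @show_total is
 * set, a "Total" line summing all devices is appended.
 */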
1189static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1190		struct cftype *cft, struct cgroup_map_cb *cb,
1191		enum stat_type type, bool show_total, bool pcpu)
1192{
1193	struct blkio_group *blkg;
1194	struct hlist_node *n;
1195	uint64_t cgroup_total = 0;
1196
1197	rcu_read_lock();
1198	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1199		if (blkg->dev) {
1200			if (!cftype_blkg_same_policy(cft, blkg))
1201				continue;
1202			if (pcpu)
1203				cgroup_total += blkio_get_stat_cpu(blkg, cb,
1204						blkg->dev, type);
1205			else {
1206				spin_lock_irq(&blkg->stats_lock);
1207				cgroup_total += blkio_get_stat(blkg, cb,
1208						blkg->dev, type);
1209				spin_unlock_irq(&blkg->stats_lock);
1210			}
1211		}
1212	}
1213	if (show_total)
1214		cb->fill(cb, "Total", cgroup_total);
1215	rcu_read_unlock();
1216	return 0;
1217}
1218
1219/* All map-type cgroup files are serviced by this function */
1220static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1221				struct cgroup_map_cb *cb)
1222{
1223	struct blkio_cgroup *blkcg;
1224	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1225	int name = BLKIOFILE_ATTR(cft->private);
1226
1227	blkcg = cgroup_to_blkio_cgroup(cgrp);
1228
1229	switch(plid) {
1230	case BLKIO_POLICY_PROP:
1231		switch(name) {
1232		case BLKIO_PROP_time:
1233			return blkio_read_blkg_stats(blkcg, cft, cb,
1234						BLKIO_STAT_TIME, 0, 0);
1235		case BLKIO_PROP_sectors:
1236			return blkio_read_blkg_stats(blkcg, cft, cb,
1237						BLKIO_STAT_CPU_SECTORS, 0, 1);
1238		case BLKIO_PROP_io_service_bytes:
1239			return blkio_read_blkg_stats(blkcg, cft, cb,
1240					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1241		case BLKIO_PROP_io_serviced:
1242			return blkio_read_blkg_stats(blkcg, cft, cb,
1243						BLKIO_STAT_CPU_SERVICED, 1, 1);
1244		case BLKIO_PROP_io_service_time:
1245			return blkio_read_blkg_stats(blkcg, cft, cb,
1246						BLKIO_STAT_SERVICE_TIME, 1, 0);
1247		case BLKIO_PROP_io_wait_time:
1248			return blkio_read_blkg_stats(blkcg, cft, cb,
1249						BLKIO_STAT_WAIT_TIME, 1, 0);
1250		case BLKIO_PROP_io_merged:
1251			return blkio_read_blkg_stats(blkcg, cft, cb,
1252						BLKIO_STAT_CPU_MERGED, 1, 1);
1253		case BLKIO_PROP_io_queued:
1254			return blkio_read_blkg_stats(blkcg, cft, cb,
1255						BLKIO_STAT_QUEUED, 1, 0);
1256#ifdef CONFIG_DEBUG_BLK_CGROUP
1257		case BLKIO_PROP_unaccounted_time:
1258			return blkio_read_blkg_stats(blkcg, cft, cb,
1259					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1260		case BLKIO_PROP_dequeue:
1261			return blkio_read_blkg_stats(blkcg, cft, cb,
1262						BLKIO_STAT_DEQUEUE, 0, 0);
1263		case BLKIO_PROP_avg_queue_size:
1264			return blkio_read_blkg_stats(blkcg, cft, cb,
1265					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1266		case BLKIO_PROP_group_wait_time:
1267			return blkio_read_blkg_stats(blkcg, cft, cb,
1268					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1269		case BLKIO_PROP_idle_time:
1270			return blkio_read_blkg_stats(blkcg, cft, cb,
1271						BLKIO_STAT_IDLE_TIME, 0, 0);
1272		case BLKIO_PROP_empty_time:
1273			return blkio_read_blkg_stats(blkcg, cft, cb,
1274						BLKIO_STAT_EMPTY_TIME, 0, 0);
1275#endif
1276		default:
1277			BUG();
1278		}
1279		break;
1280	case BLKIO_POLICY_THROTL:
1281		switch(name){
1282		case BLKIO_THROTL_io_service_bytes:
1283			return blkio_read_blkg_stats(blkcg, cft, cb,
1284						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1285		case BLKIO_THROTL_io_serviced:
1286			return blkio_read_blkg_stats(blkcg, cft, cb,
1287						BLKIO_STAT_CPU_SERVICED, 1, 1);
1288		default:
1289			BUG();
1290		}
1291		break;
1292	default:
1293		BUG();
1294	}
1295
1296	return 0;
1297}
1298
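/*
 * Update the cgroup-wide default weight and apply it to every group that is
 * not overridden by a per-device weight rule.
 */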
1299static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1300{
1301	struct blkio_group *blkg;
1302	struct hlist_node *n;
1303	struct blkio_policy_node *pn;
1304
1305	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1306		return -EINVAL;
1307
1308	spin_lock(&blkio_list_lock);
1309	spin_lock_irq(&blkcg->lock);
1310	blkcg->weight = (unsigned int)val;
1311
1312	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1313		pn = blkio_policy_search_node(blkcg, blkg->dev,
1314				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1315		if (pn)
1316			continue;
1317
1318		blkio_update_group_weight(blkg, blkcg->weight);
1319	}
1320	spin_unlock_irq(&blkcg->lock);
1321	spin_unlock(&blkio_list_lock);
1322	return 0;
1323}
1324
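/* Read/write handlers for the plain u64 files (currently only "weight"). */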
1325static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
1326	struct blkio_cgroup *blkcg;
1327	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1328	int name = BLKIOFILE_ATTR(cft->private);
1329
1330	blkcg = cgroup_to_blkio_cgroup(cgrp);
1331
1332	switch(plid) {
1333	case BLKIO_POLICY_PROP:
1334		switch(name) {
1335		case BLKIO_PROP_weight:
1336			return (u64)blkcg->weight;
1337		}
1338		break;
1339	default:
1340		BUG();
1341	}
1342	return 0;
1343}
1344
1345static int
1346blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1347{
1348	struct blkio_cgroup *blkcg;
1349	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1350	int name = BLKIOFILE_ATTR(cft->private);
1351
1352	blkcg = cgroup_to_blkio_cgroup(cgrp);
1353
1354	switch(plid) {
1355	case BLKIO_POLICY_PROP:
1356		switch(name) {
1357		case BLKIO_PROP_weight:
1358			return blkio_weight_write(blkcg, val);
1359		}
1360		break;
1361	default:
1362		BUG();
1363	}
1364
1365	return 0;
1366}
1367
1368struct cftype blkio_files[] = {
1369	{
1370		.name = "weight_device",
1371		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1372				BLKIO_PROP_weight_device),
1373		.read_seq_string = blkiocg_file_read,
1374		.write_string = blkiocg_file_write,
1375		.max_write_len = 256,
1376	},
1377	{
1378		.name = "weight",
1379		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1380				BLKIO_PROP_weight),
1381		.read_u64 = blkiocg_file_read_u64,
1382		.write_u64 = blkiocg_file_write_u64,
1383	},
1384	{
1385		.name = "time",
1386		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1387				BLKIO_PROP_time),
1388		.read_map = blkiocg_file_read_map,
1389	},
1390	{
1391		.name = "sectors",
1392		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1393				BLKIO_PROP_sectors),
1394		.read_map = blkiocg_file_read_map,
1395	},
1396	{
1397		.name = "io_service_bytes",
1398		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1399				BLKIO_PROP_io_service_bytes),
1400		.read_map = blkiocg_file_read_map,
1401	},
1402	{
1403		.name = "io_serviced",
1404		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1405				BLKIO_PROP_io_serviced),
1406		.read_map = blkiocg_file_read_map,
1407	},
1408	{
1409		.name = "io_service_time",
1410		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1411				BLKIO_PROP_io_service_time),
1412		.read_map = blkiocg_file_read_map,
1413	},
1414	{
1415		.name = "io_wait_time",
1416		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417				BLKIO_PROP_io_wait_time),
1418		.read_map = blkiocg_file_read_map,
1419	},
1420	{
1421		.name = "io_merged",
1422		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1423				BLKIO_PROP_io_merged),
1424		.read_map = blkiocg_file_read_map,
1425	},
1426	{
1427		.name = "io_queued",
1428		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1429				BLKIO_PROP_io_queued),
1430		.read_map = blkiocg_file_read_map,
1431	},
1432	{
1433		.name = "reset_stats",
1434		.write_u64 = blkiocg_reset_stats,
1435	},
1436#ifdef CONFIG_BLK_DEV_THROTTLING
1437	{
1438		.name = "throttle.read_bps_device",
1439		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1440				BLKIO_THROTL_read_bps_device),
1441		.read_seq_string = blkiocg_file_read,
1442		.write_string = blkiocg_file_write,
1443		.max_write_len = 256,
1444	},
1445
1446	{
1447		.name = "throttle.write_bps_device",
1448		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1449				BLKIO_THROTL_write_bps_device),
1450		.read_seq_string = blkiocg_file_read,
1451		.write_string = blkiocg_file_write,
1452		.max_write_len = 256,
1453	},
1454
1455	{
1456		.name = "throttle.read_iops_device",
1457		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1458				BLKIO_THROTL_read_iops_device),
1459		.read_seq_string = blkiocg_file_read,
1460		.write_string = blkiocg_file_write,
1461		.max_write_len = 256,
1462	},
1463
1464	{
1465		.name = "throttle.write_iops_device",
1466		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1467				BLKIO_THROTL_write_iops_device),
1468		.read_seq_string = blkiocg_file_read,
1469		.write_string = blkiocg_file_write,
1470		.max_write_len = 256,
1471	},
1472	{
1473		.name = "throttle.io_service_bytes",
1474		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1475				BLKIO_THROTL_io_service_bytes),
1476		.read_map = blkiocg_file_read_map,
1477	},
1478	{
1479		.name = "throttle.io_serviced",
1480		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1481				BLKIO_THROTL_io_serviced),
1482		.read_map = blkiocg_file_read_map,
1483	},
1484#endif /* CONFIG_BLK_DEV_THROTTLING */
1485
1486#ifdef CONFIG_DEBUG_BLK_CGROUP
1487	{
1488		.name = "avg_queue_size",
1489		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1490				BLKIO_PROP_avg_queue_size),
1491		.read_map = blkiocg_file_read_map,
1492	},
1493	{
1494		.name = "group_wait_time",
1495		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1496				BLKIO_PROP_group_wait_time),
1497		.read_map = blkiocg_file_read_map,
1498	},
1499	{
1500		.name = "idle_time",
1501		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1502				BLKIO_PROP_idle_time),
1503		.read_map = blkiocg_file_read_map,
1504	},
1505	{
1506		.name = "empty_time",
1507		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1508				BLKIO_PROP_empty_time),
1509		.read_map = blkiocg_file_read_map,
1510	},
1511	{
1512		.name = "dequeue",
1513		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1514				BLKIO_PROP_dequeue),
1515		.read_map = blkiocg_file_read_map,
1516	},
1517	{
1518		.name = "unaccounted_time",
1519		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1520				BLKIO_PROP_unaccounted_time),
1521		.read_map = blkiocg_file_read_map,
1522	},
1523#endif
1524};
1525
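/* Add the blkio control files when a new cgroup directory is created. */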
1526static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1527{
1528	return cgroup_add_files(cgroup, subsys, blkio_files,
1529				ARRAY_SIZE(blkio_files));
1530}
1531
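/*
 * Cgroup teardown: unlink every blkio_group from the cgroup, let each
 * registered policy release its part of the group, free all policy nodes
 * and finally free the blkio_cgroup itself (unless it is the root cgroup).
 */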
1532static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1533{
1534	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1535	unsigned long flags;
1536	struct blkio_group *blkg;
1537	void *key;
1538	struct blkio_policy_type *blkiop;
1539	struct blkio_policy_node *pn, *pntmp;
1540
1541	rcu_read_lock();
1542	do {
1543		spin_lock_irqsave(&blkcg->lock, flags);
1544
1545		if (hlist_empty(&blkcg->blkg_list)) {
1546			spin_unlock_irqrestore(&blkcg->lock, flags);
1547			break;
1548		}
1549
1550		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1551					blkcg_node);
1552		key = rcu_dereference(blkg->key);
1553		__blkiocg_del_blkio_group(blkg);
1554
1555		spin_unlock_irqrestore(&blkcg->lock, flags);
1556
1557		/*
1558		 * This blkio_group is being unlinked as associated cgroup is
1559		 * going away. Let all the IO controlling policies know about
1560		 * this event.
1561		 */
1562		spin_lock(&blkio_list_lock);
1563		list_for_each_entry(blkiop, &blkio_list, list) {
1564			if (blkiop->plid != blkg->plid)
1565				continue;
1566			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1567		}
1568		spin_unlock(&blkio_list_lock);
1569	} while (1);
1570
1571	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1572		blkio_policy_delete_node(pn);
1573		kfree(pn);
1574	}
1575
1576	free_css_id(&blkio_subsys, &blkcg->css);
1577	rcu_read_unlock();
1578	if (blkcg != &blkio_root_cgroup)
1579		kfree(blkcg);
1580}
1581
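/*
 * Allocate and initialise a blkio cgroup for @cgroup.  The root cgroup uses
 * the statically allocated blkio_root_cgroup.
 */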
1582static struct cgroup_subsys_state *
1583blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1584{
1585	struct blkio_cgroup *blkcg;
1586	struct cgroup *parent = cgroup->parent;
1587
1588	if (!parent) {
1589		blkcg = &blkio_root_cgroup;
1590		goto done;
1591	}
1592
1593	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1594	if (!blkcg)
1595		return ERR_PTR(-ENOMEM);
1596
1597	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1598done:
1599	spin_lock_init(&blkcg->lock);
1600	INIT_HLIST_HEAD(&blkcg->blkg_list);
1601
1602	INIT_LIST_HEAD(&blkcg->policy_list);
1603	return &blkcg->css;
1604}
1605
1606/*
1607 * We cannot support shared io contexts, as we have no means to support
1608 * two tasks with the same ioc in two different groups without major rework
1609 * of the main cic data structures.  For now we allow a task to change
1610 * its cgroup only if it's the only owner of its ioc.
1611 */
1612static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1613{
1614	struct io_context *ioc;
1615	int ret = 0;
1616
1617	/* task_lock() is needed to avoid races with exit_io_context() */
1618	task_lock(tsk);
1619	ioc = tsk->io_context;
1620	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1621		ret = -EINVAL;
1622	task_unlock(tsk);
1623
1624	return ret;
1625}
1626
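/* Record in the task's io_context that its cgroup membership changed. */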
1627static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1628{
1629	struct io_context *ioc;
1630
1631	task_lock(tsk);
1632	ioc = tsk->io_context;
1633	if (ioc)
1634		ioc->cgroup_changed = 1;
1635	task_unlock(tsk);
1636}
1637
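/* Add an IO control policy to the global list of registered policies. */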
1638void blkio_policy_register(struct blkio_policy_type *blkiop)
1639{
1640	spin_lock(&blkio_list_lock);
1641	list_add_tail(&blkiop->list, &blkio_list);
1642	spin_unlock(&blkio_list_lock);
1643}
1644EXPORT_SYMBOL_GPL(blkio_policy_register);
1645
1646void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1647{
1648	spin_lock(&blkio_list_lock);
1649	list_del_init(&blkiop->list);
1650	spin_unlock(&blkio_list_lock);
1651}
1652EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1653
1654static int __init init_cgroup_blkio(void)
1655{
1656	return cgroup_load_subsys(&blkio_subsys);
1657}
1658
1659static void __exit exit_cgroup_blkio(void)
1660{
1661	cgroup_unload_subsys(&blkio_subsys);
1662}
1663
1664module_init(init_cgroup_blkio);
1665module_exit(exit_cgroup_blkio);
1666MODULE_LICENSE("GPL");