/* * Common Block IO controller cgroup interface * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe * * Copyright (C) 2008 Fabio Checconi * Paolo Valente * * Copyright (C) 2009 Vivek Goyal * Nauman Rafique */ #include #include #include #include #include #include #include #include #include #include "blk-cgroup.h" #include "blk.h" #define MAX_KEY_LEN 100 static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { int i; if (!blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd = blkg->pd[i]; if (!pd) continue; if (pol && pol->pd_exit_fn) pol->pd_exit_fn(blkg); kfree(pd); } blk_exit_rl(&blkg->rl); kfree(blkg); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @q: request_queue the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); if (!blkg) return NULL; blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->refcnt = 1; /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { if (blk_init_rl(&blkg->rl, q, gfp_mask)) goto err_free; blkg->rl.blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(q, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); if (!pd) goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; /* invoke per-policy init */ if (blkcg_policy_enabled(blkg->q, pol)) pol->pd_init_fn(blkg); } return blkg; err_free: blkg_free(blkg); return NULL; } static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; blkg = rcu_dereference(blkcg->blkg_hint); if (blkg && blkg->q == q) return blkg; /* * Hint didn't match. Look up from the radix tree. Note that we * may not be holding queue_lock and thus are not sure whether * @blkg from blkg_tree has already been removed or not, so we * can't update hint to the lookup result. Leave it to the caller. */ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q == q) return blkg; return NULL; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. This function should be called * under RCU read lock and is guaranteed to return %NULL if @q is bypassing * - see blk_queue_bypass_start() for details. */ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(blk_queue_bypass(q))) return NULL; return __blkg_lookup(blkcg, q); } EXPORT_SYMBOL_GPL(blkg_lookup); /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. */ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); /* lookup and update hint on success, see __blkg_lookup() for details */ blkg = __blkg_lookup(blkcg, q); if (blkg) { rcu_assign_pointer(blkcg->blkg_hint, blkg); goto out_free; } /* blkg holds a reference to blkcg */ if (!css_tryget(&blkcg->css)) { blkg = ERR_PTR(-EINVAL); goto out_free; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { blkg = ERR_PTR(-ENOMEM); goto out_put; } } blkg = new_blkg; /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &q->blkg_list); } spin_unlock(&blkcg->lock); if (!ret) return blkg; blkg = ERR_PTR(ret); out_put: css_put(&blkcg->css); out_free: blkg_free(new_blkg); return blkg; } struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { /* * This could be the first entry point of blkcg implementation and * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); return __blkg_lookup_create(blkcg, q, NULL); } EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* Something wrong if we are trying to remove same group twice */ WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ blkg_put(blkg); } /** * blkg_destroy_all - destroy all blkgs associated with a request_queue * @q: request_queue of interest * * Destroy all blkgs associated with @q. */ static void blkg_destroy_all(struct request_queue *q) { struct blkcg_gq *blkg, *n; lockdep_assert_held(q->queue_lock); list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); } /* * root blkg is destroyed. Just clear the pointer since * root_rl does not take reference on root blkg. */ q->root_blkg = NULL; q->root_rl.blkg = NULL; } static void blkg_rcu_free(struct rcu_head *rcu_head) { blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); } void __blkg_release(struct blkcg_gq *blkg) { /* release the extra blkcg reference this blkg has been holding */ css_put(&blkg->blkcg->css); /* * A group is freed in rcu manner. But having an rcu lock does not * mean that one can access all the fields of blkg and assume these * are valid. For example, don't try to follow throtl_data and * request queue links. * * Having a reference to blkg under an rcu allows acess to only * values local to groups like group stats and group rate limits */ call_rcu(&blkg->rcu_head, blkg_rcu_free); } EXPORT_SYMBOL_GPL(__blkg_release); /* * The next function used by blk_queue_for_each_rl(). It's a bit tricky * because the root blkg uses @q->root_rl instead of its own rl. */ struct request_list *__blk_queue_next_rl(struct request_list *rl, struct request_queue *q) { struct list_head *ent; struct blkcg_gq *blkg; /* * Determine the current blkg list_head. The first entry is * root_rl which is off @q->blkg_list and mapped to the head. */ if (rl == &q->root_rl) { ent = &q->blkg_list; /* There are no more block groups, hence no request lists */ if (list_empty(ent)) return NULL; } else { blkg = container_of(rl, struct blkcg_gq, rl); ent = &blkg->q_node; } /* walk to the next list_head, skip root blkcg */ ent = ent->next; if (ent == &q->root_blkg->q_node) ent = ent->next; if (ent == &q->blkg_list) return NULL; blkg = container_of(ent, struct blkcg_gq, q_node); return &blkg->rl; } static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { struct blkcg *blkcg = cgroup_to_blkcg(cgroup); struct blkcg_gq *blkg; struct hlist_node *n; int i; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkcg_policy_enabled(blkg->q, pol) && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } static const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data. If @show_total is %true, the sum of the return * values from @prfill is printed with "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; struct hlist_node *n; u64 total = 0; spin_lock_irq(&blkcg->lock); hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkcg->lock); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device assocaited with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat * @sf: seq_file to print to * @pd: policy private data of interest * @rwstat: rwstat to print * * Print @rwstat to @sf for the device assocaited with @pd. */ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat *rwstat) { static const char *rwstr[] = { [BLKG_RWSTAT_READ] = "Read", [BLKG_RWSTAT_WRITE] = "Write", [BLKG_RWSTAT_SYNC] = "Sync", [BLKG_RWSTAT_ASYNC] = "Async", }; const char *dname = blkg_dev_name(pd->blkg); u64 v; int i; if (!dname) return 0; for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], (unsigned long long)rwstat->cnt[i]); v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } /** * blkg_prfill_stat - prfill callback for blkg_stat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the blkg_stat in @pd * * prfill callback for printing a blkg_stat. */ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); } EXPORT_SYMBOL_GPL(blkg_prfill_stat); /** * blkg_prfill_rwstat - prfill callback for blkg_rwstat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the blkg_rwstat in @pd * * prfill callback for printing a blkg_rwstat. */ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);