/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data {
	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

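/*
 * Conversion helpers between the generic blkcg objects and struct
 * throtl_grp.  struct throtl_grp embeds its blkg_policy_data, so
 * pd_to_tg() is a container_of(); blkg_to_tg() looks up the policy
 * data registered for blkcg_policy_throtl inside a blkg and converts
 * that to the owning throtl_grp.
 */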
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);

#define throtl_log_tg(td, tg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
} while (0)

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

/*
 * Worker for allocating per cpu stat for tgs. This is scheduled on the
 * system_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			schedule_delayed_work(dwork, msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}

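/*
 * Initialize the throttle policy data embedded in a blkg: limits start
 * at -1 (unlimited), and because this path cannot call the percpu
 * allocator, the group is queued on tg_stats_alloc_list so that
 * tg_stats_alloc_fn() allocates its per-cpu stats later.
 */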
static void throtl_pd_init(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path. Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	schedule_delayed_work(&tg_stats_alloc_work, 0);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}

static void throtl_pd_exit(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);

	free_percpu(tg->stats_cpu);
}

static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
					   struct blkcg *blkcg)
{
	/*
	 * This is the common case when there are no blkcgs. Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}

static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkcg *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkcgs. Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root) {
		tg = td_root_tg(td);
	} else {
		struct blkcg_gq *blkg;

		blkg = blkg_lookup_create(blkcg, q);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dying(q))
			tg = td_root_tg(td);
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);