/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

struct cfq_ttime {
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	unsigned total_weight;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
			.ttime = {.last_end_request = jiffies,},}
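
/*
 * Illustration only, not part of the scheduler proper: a minimal sketch of
 * the cached-leftmost lookup described above.  The helper name
 * cfq_rb_first_example() is made up for this sketch; the file's real
 * lookup and update helpers appear further down.
 */
static inline struct rb_node *cfq_rb_first_example(struct cfq_rb_root *root)
{
	/* refresh the cache with an O(log n) walk only when it is empty */
	if (!root->left)
		root->left = rb_first(&root->rb);

	/* min extraction is then an O(1) pointer dereference */
	return root->left;
}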
/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately, so it has negative index
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total sectors transferred */
	struct blkg_stat		sectors;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_workload_slice;
	enum wl_type_t saved_workload;
	enum wl_prio_t saved_serving_prio;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;
};
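
/*
 * Note on the service_trees[2][3] layout above: the first index is the
 * workload priority class (BE_WORKLOAD or RT_WORKLOAD), the second the
 * workload type (ASYNC, SYNC_NOIDLE or SYNC).  The IDLE class is not part
 * of the array and always uses the single service_tree_idle instead; see
 * service_tree_for() below.
 */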
struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_id;	/* the current blkcg ID */
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_prio_t serving_prio;
	enum wl_type_t serving_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;
	unsigned int cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
					    enum wl_prio_t prio,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (prio == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[prio][type];
}

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
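
/*
 * For illustration: CFQ_CFQQ_FNS(on_rr) above expands to three inline
 * helpers operating on the CFQ_CFQQ_FLAG_on_rr bit of cfqq->flags:
 *
 *	cfq_mark_cfqq_on_rr(cfqq)	- set the flag
 *	cfq_clear_cfqq_on_rr(cfqq)	- clear the flag
 *	cfq_cfqq_on_rr(cfqq)		- test the flag, returns 0 or 1
 *
 * and likewise for the other flags instantiated above.
 */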
static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
}

static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return pd_to_blkg(&cfqg->pd);
}

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

/* cfqg stats flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

#define CFQG_FLAG_FNS(name)						\
static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags |= (1 << CFQG_stats_##name);			\
}									\
static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
{									\
	stats->flags &= ~(1 << CFQG_stats_##name);			\
}									\
static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
{									\
	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
}

CFQG_FLAG_FNS(waiting)
CFQG_FLAG_FNS(idling)
CFQG_FLAG_FNS(empty)
#undef CFQG_FLAG_FNS

/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	cfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
						 struct cfq_group *curr_cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_waiting(stats))
		return;
	if (cfqg == curr_cfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	cfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
	unsigned long long now;

	if (!cfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	cfqg_stats_clear_empty(stats);
}

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
	blkg_stat_add(&cfqg->stats.dequeue, 1);
}

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (blkg_rwstat_sum(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (cfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	cfqg_stats_mark_empty(stats);
}
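
/*
 * Note on the debug stat helpers above and below: they all follow the same
 * pattern.  A cfqg_stats_mark_*() call records a sched_clock() timestamp
 * (in ns) and sets the corresponding CFQG_stats_* flag; the matching
 * update/end helper adds the elapsed time to the blkg_stat and clears the
 * flag.  The time_after64() checks skip the accounting if the clock appears
 * to have gone backwards.
 */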
static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	if (cfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		cfqg_stats_clear_idling(stats);
	}
}

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	BUG_ON(cfqg_stats_idling(stats));

	stats->start_idle_time = sched_clock();
	cfqg_stats_mark_idling(stats);
}

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
	struct cfqg_stats *stats = &cfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_sum(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	cfqg_stats_update_group_wait_time(stats);
}

#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
					struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }

#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

#ifdef CONFIG_CFQ_GROUP_IOSCHED

static struct blkcg_policy blkcg_policy_cfq;

static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
{
	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
	return blkg_get(cfqg_to_blkg(cfqg));
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
	return blkg_put(cfqg_to_blkg(cfqg));
}

#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			  cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)
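
/*
 * Illustrative use of the logging macros above (the format string below is
 * just an example, not one taken from this file):
 *
 *	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 *
 * This emits a blktrace message of the form "cfq<pid><S|A> <cgroup path> ..."
 * so per-queue events can be attributed to both a process and its cgroup.
 */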
static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
					    struct cfq_group *curr_cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
	cfqg_stats_end_empty_time(&cfqg->stats);
	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time)
{
	blkg_stat_add(&cfqg->stats.time, time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
#endif
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
{
	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
}

static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
					      uint64_t bytes, int rw)
{
	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int rw)
{
	struct cfqg_stats *stats = &cfqg->stats;
	unsigned long long now = sched_clock();