/* * CFQ, or complete fairness queueing, disk scheduler. * * Based on ideas from a previously unfinished io * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. * * Copyright (C) 2003 Jens Axboe */ #include #include #include #include #include #include #include #include #include "blk.h" #include "blk-cgroup.h" /* * tunables */ /* max queue in one round of service */ static const int cfq_quantum = 8; static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* maximum backwards seek, in KiB */ static const int cfq_back_max = 16 * 1024; /* penalty of a backwards seek */ static const int cfq_back_penalty = 2; static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; static int cfq_group_idle = HZ / 125; static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; /* * offset from end of service tree */ #define CFQ_IDLE_DELAY (HZ / 5) /* * below this threshold, we consider thinktime immediate */ #define CFQ_MIN_TT (2) #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 #define CFQQ_SEEK_THR (sector_t)(8 * 100) #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) static struct kmem_cache *cfq_pool; #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) struct cfq_ttime { unsigned long last_end_request; unsigned long ttime_total; unsigned long ttime_samples; unsigned long ttime_mean; }; /* * Most of our rbtree usage is for sorting with min extraction, so * if we cache the leftmost node we don't have to walk down the tree * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should * move this into the elevator for the rq sorting as well. */ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; unsigned total_weight; u64 min_vdisktime; struct cfq_ttime ttime; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ .ttime = {.last_end_request = jiffies,},} /* * Per process-grouping structure */ struct cfq_queue { /* reference count */ int ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; /* service_tree member */ struct rb_node rb_node; /* service_tree key */ unsigned long rb_key; /* prio tree member */ struct rb_node p_node; /* prio tree root we belong to, if any */ struct rb_root *p_root; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ struct request *next_rq; /* requests queued in sort_list */ int queued[2]; /* currently allocated requests */ int allocated[2]; /* fifo list of requests in sort_list */ struct list_head fifo; /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; unsigned int allocated_slice; unsigned int slice_dispatch; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; long slice_resid; /* pending priority requests */ int prio_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; /* io prio of this group */ unsigned short ioprio, org_ioprio; unsigned short ioprio_class; pid_t pid; u32 seek_history; sector_t last_request_pos; struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; /* * First index in the service_trees. * IDLE is handled separately, so it has negative index */ enum wl_prio_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, CFQ_PRIO_NR, }; /* * Second index in the service_trees. */ enum wl_type_t { ASYNC_WORKLOAD = 0, SYNC_NOIDLE_WORKLOAD = 1, SYNC_WORKLOAD = 2 }; struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED /* total bytes transferred */ struct blkg_rwstat service_bytes; /* total IOs serviced, post merge */ struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ struct blkg_rwstat service_time; /* total time spent waiting in scheduler queue in ns */ struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; /* total sectors transferred */ struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP /* time not charged to this cgroup */ struct blkg_stat unaccounted_time; /* sum of number of ios queued across all samples */ struct blkg_stat avg_queue_size_sum; /* count of samples taken for average */ struct blkg_stat avg_queue_size_samples; /* how many times this group has been removed from service tree */ struct blkg_stat dequeue; /* total time spent waiting for it to be assigned a timeslice. */ struct blkg_stat group_wait_time; /* time spent idling for this blkcg_gq */ struct blkg_stat idle_time; /* total time with empty current active q with other requests queued */ struct blkg_stat empty_time; /* fields after this shouldn't be cleared on stat reset */ uint64_t start_group_wait_time; uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; #endif /* CONFIG_DEBUG_BLK_CGROUP */ #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ struct blkg_policy_data pd; /* group service_tree member */ struct rb_node rb_node; /* group service_tree key */ u64 vdisktime; unsigned int weight; unsigned int new_weight; unsigned int dev_weight; /* number of cfqq currently on this group */ int nr_cfqq; /* * Per group busy queues average. Useful for workload slice calc. We * create the array for each prio class but at run time it is used * only for RT and BE class and slot for IDLE class remains unused. * This is primarily done to avoid confusion and a gcc warning. */ unsigned int busy_queues_avg[CFQ_PRIO_NR]; /* * rr lists of queues with requests. We maintain service trees for * RT and BE classes. These trees are subdivided in subclasses * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE * class there is no subclassification and all the cfq queues go on * a single tree service_tree_idle. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; }; struct cfq_io_cq { struct io_cq icq; /* must be the first member */ struct cfq_queue *cfqq[2]; struct cfq_ttime ttime; int ioprio; /* the current ioprio */ #ifdef CONFIG_CFQ_GROUP_IOSCHED uint64_t blkcg_id; /* the current blkcg ID */ #endif }; /* * Per block device queue structure */ struct cfq_data { struct request_queue *queue; /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; struct cfq_group *root_group; /* * The priority currently being served */ enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; /* * Each priority tree is sorted by next_request position. These * trees are used when determining if two or more queues are * interleaving requests (see cfq_close_cooperator). */ struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; unsigned int busy_sync_queues; int rq_in_driver; int rq_in_flight[2]; /* * queue-depth detection */ int rq_queued; int hw_tag; /* * hw_tag can be * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) * 0 => no NCQ */ int hw_tag_est_depth; unsigned int hw_tag_samples; /* * idle window management */ struct timer_list idle_slice_timer; struct work_struct unplug_work; struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; /* * async queue for each priority case */ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; struct cfq_queue *async_idle_cfqq; sector_t last_position; /* * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_group_idle; unsigned int cfq_latency; unsigned int cfq_target_latency; /* * Fallback dummy cfqq for extreme OOM conditions */ struct cfq_queue oom_cfqq; unsigned long last_delayed_sync; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_prio_t prio, enum wl_type_t type) { if (!cfqg) return NULL; if (prio == IDLE_WORKLOAD) return &cfqg->service_tree_idle; return &cfqg->service_trees[prio][type]; } enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */