/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static int cfq_group_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

struct cfq_ttime {
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};
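/*
 * Illustrative sketch (hypothetical helper name, not verbatim from this
 * file): CFQ-style schedulers maintain the think-time fields above as a
 * fixed-point decaying average. The 7/8 decay and the 256 scale factor
 * below are assumptions matching that style; the scaled sample count is
 * what sample_valid() above is checked against.
 */
static inline void
example_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
{
	unsigned long elapsed = jiffies - ttime->last_end_request;

	/* clamp one outlier so a single long gap can't skew the mean */
	elapsed = min(elapsed, 2UL * slice_idle);

	/* decaying average: keep 7/8 of the history, blend in 1/8 new sample */
	ttime->ttime_samples = (7 * ttime->ttime_samples + 256) / 8;
	ttime->ttime_total = (7 * ttime->ttime_total + 256 * elapsed) / 8;
	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
}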
/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	unsigned total_weight;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
			.ttime = {.last_end_request = jiffies,},}
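/*
 * Illustrative sketch of how the cached leftmost node pays off: min
 * extraction is O(1) while the cache is warm, and rb_first() only walks
 * the tree when the cache has been invalidated (cleared on removal of
 * the leftmost node). The helper name is hypothetical; the real lookup
 * functions are defined later in the file.
 */
static inline struct rb_node *example_rb_first(struct cfq_rb_root *root)
{
	/* service tree is empty */
	if (!root->count)
		return NULL;

	/* cache miss: walk down to the leftmost node once and remember it */
	if (!root->left)
		root->left = rb_first(&root->rb);

	return root->left;
}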
/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	unsigned int slice_dispatch;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class;

	pid_t pid;

	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};
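/*
 * Illustrative sketch (hypothetical helper name): how seek_history is
 * typically fed so that CFQQ_SEEKY() above can classify the queue. Each
 * request shifts one bit into the 32-bit history, where a set bit means
 * "seeky"; CFQQ_SEEKY() then fires once more than 4 of the last 32
 * requests were seeky.
 */
static void example_update_seek_history(struct cfq_queue *cfqq,
					struct request *rq, bool nonrot)
{
	sector_t sdist = 0;

	/* absolute distance from the previous request's position */
	if (cfqq->last_request_pos) {
		if (cfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
		else
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	}

	cfqq->seek_history <<= 1;
	if (nonrot)
		/* on non-rotational media, small requests hurt, not distance */
		cfqq->seek_history |= (blk_rq_sectors(rq) < CFQQ_SECT_THR_NONROT);
	else
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
}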
/*
 * First index in the service_trees.
 * IDLE is handled separately, so it is never used to index service_trees.
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* total bytes transferred */
	struct blkg_rwstat service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat serviced;
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total sectors transferred */
	struct blkg_stat sectors;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};
/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class, but at run time it is used
	 * only for the RT and BE classes; the slot for the IDLE class stays
	 * unused. This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided into subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For the
	 * IDLE class there is no subclassification and all the cfq queues
	 * go on a single tree, service_tree_idle.
	 * Counts are embedded in the cfq_rb_root.
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_workload_slice;
	enum wl_type_t saved_workload;
	enum wl_prio_t saved_serving_prio;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;
};
struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_id;	/* the current blkcg ID */
#endif
};
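/*
 * Because icq must be the first member of cfq_io_cq, converting the
 * generic io_cq embedded in a request back into our private structure
 * is a plain container_of(). This is what the RQ_CIC() macro near the
 * top of the file relies on; shown here as a reference sketch of the
 * conversion that macro assumes.
 */
static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
	return container_of(icq, struct cfq_io_cq, icq);
}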
/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority currently being served
	 */
	enum wl_prio_t serving_prio;
	enum wl_type_t serving_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position. These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate (cfq will behave as if NCQ is present, to
	 *       allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 * See the detection sketch after this struct.
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_group_idle;
	unsigned int cfq_latency;
	unsigned int cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;
};
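/*
 * Queue-depth detection sketch referenced in the hw_tag comment above
 * (simplified, hypothetical helper; the real logic also special-cases
 * an idling active queue that can't fill the device on its own). The
 * idea: track the largest number of requests the driver has accepted
 * at once, and after enough samples decide whether the device queues
 * deeply enough to count as NCQ-capable.
 */
static void example_update_hw_tag(struct cfq_data *cfqd)
{
	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

	/* once NCQ is confirmed, never unlearn it */
	if (cfqd->hw_tag == 1)
		return;

	/* too little traffic in flight to say anything yet */
	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	if (cfqd->hw_tag_samples++ < 50)
		return;

	cfqd->hw_tag = (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) ? 1 : 0;
}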
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
					    enum wl_prio_t prio,
					    enum wl_type_t type)
{
	if (!cfqg)
		return NULL;

	if (prio == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[prio][type];
}
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */