/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-cgroup.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

DEFINE_IDA(blk_queue_ida);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void drive_stat_acct(struct request *rq, int new_io)
{
	struct hd_struct *part;
	int rw = rq_data_dir(rq);
	int cpu;

	if (!blk_do_io_stat(rq))
		return;

	cpu = part_stat_lock();

	if (!new_io) {
		part = rq->part;
		part_stat_inc(cpu, part, merges[rw]);
	} else {
		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
		if (!hd_struct_try_get(part)) {
			/*
			 * The partition is already being removed,
			 * the request will be accounted on the disk only
			 *
			 * We take a reference on disk->part0 although that
			 * partition will never be deleted, so we can treat
			 * it as any other partition.
			 */
			part = &rq->rq_disk->part0;
			hd_struct_get(part);
		}
		part_round_stats(cpu, part);
		part_inc_in_flight(part, rw);
		rq->part = part;
	}

	part_stat_unlock();
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->cmd = rq->__cmd;
	rq->cmd_len = BLK_MAX_CDB;
	rq->tag = -1;
	rq->ref_count = 1;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(nbytes > bio->bi_size)) {
		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
		       __func__, nbytes, bio->bi_size);
		nbytes = bio->bi_size;
	}

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	bio->bi_size -= nbytes;
	bio->bi_sector += (nbytes >> 9);

	if (bio_integrity(bio))
		bio_integrity_advance(bio, nbytes);

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		printk(KERN_INFO "  cdb: ");
		for (bit = 0; bit < BLK_MAX_CDB; bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *q;

	q = container_of(work, struct request_queue, delay_work.work);
	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_delay_queue - restart queueing after defined interval
 * @q:		The &struct request_queue in question
 * @msecs:	Delay in msecs
 *
 * Description:
 *   Sometimes queueing needs to be postponed for a little while, to allow
 *   resources to come back. This function will make sure that queueing is
 *   restarted around the specified time. Queue lock must be held.
 */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
	if (likely(!blk_queue_dead(q)))
		queue_delayed_work(kblockd_workqueue, &q->delay_work,
				   msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	cancel_delayed_work(&q->delay_work);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->make_request_fn will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_delayed_work_sync(&q->delay_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
 * @q:	The queue to run
 *
 * Description:
 *    Invoke request handling on a queue if there are any pending requests.
 *    May be used to restart request handling after a request has completed.
 *    This variant runs the queue whether or not the queue has been
 *    stopped. Must be called with the queue lock held and interrupts
 *    disabled. See also @blk_run_queue.
 */
inline void __blk_run_queue_uncond(struct request_queue *q)
{
	if (unlikely(blk_queue_dead(q)))
		return;

	/*
	 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
	 * the queue lock internally. As a result multiple threads may be
	 * running such a request function concurrently. Keep track of the
	 * number of active request_fn invocations such that blk_drain_queue()
	 * can wait until all these request_fn calls have finished.
	 */
	q->request_fn_active++;
	q->request_fn(q);
	q->request_fn_active--;
}

/**
 * __blk_run_queue - run a single device queue
 * @q:	The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 */
void __blk_run_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	__blk_run_queue_uncond(q);
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue_async - run a single device queue in workqueue context
 * @q:	The queue to run
 *
 * Description:
 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 *    of us. The caller must hold the queue lock.
 */
void blk_run_queue_async(struct request_queue *q)
{
	if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
		mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}
EXPORT_SYMBOL(blk_run_queue_async);

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/**
 * __blk_drain_queue - drain requests from request_queue
 * @q: queue to drain
 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
 *
 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 * If not, only ELVPRIV requests are drained.  The caller is responsible
 * for ensuring that no new requests which need to be drained are queued.
 */
static void __blk_drain_queue(struct request_queue *q, bool drain_all)
	__releases(q->queue_lock)
	__acquires(q->queue_lock)
{
	int i;

	lockdep_assert_held(q->queue_lock);

	while (true) {
		bool drain = false;

		/*
		 * The caller might be trying to drain @q before its
		 * elevator is initialized.
		 */
		if (q->elevator)
			elv_drain_elevator(q);

		blkcg_drain_queue(q);

		/*
		 * This function might be called on a queue which failed
		 * driver init after queue creation or is not yet fully
		 * active.  Some drivers (e.g. fd and loop) get unhappy
		 * in such cases.  Kick queue iff dispatch queue has
		 * something on it and @q has request_fn set.
		 */
		if (!list_empty(&q->queue_head) && q->request_fn)
			__blk_run_queue(q);

		drain |= q->nr_rqs_elvpriv;
		drain |= q->request_fn_active;

		/*
		 * Unfortunately, requests are queued at and tracked from
		 * multiple places and there's no single counter which can
		 * be drained.  Check all the queues and counters.
		 */
		if (drain_all) {
			drain |= !list_empty(&q->queue_head);
			for (i = 0; i < 2; i++) {
				drain |= q->nr_rqs[i];
				drain |= q->in_flight[i];
				drain |= !list_empty(&q->flush_queue[i]);
			}
		}

		if (!drain)
			break;

		spin_unlock_irq(q->queue_lock);

		msleep(10);

		spin_lock_irq(q->queue_lock);
	}

	/*
	 * With queue marked dead, any woken up waiter will fail the
	 * allocation path, so the wakeup chaining is lost and we're
	 * left with hung waiters. We need to wake up those waiters.
	 */
	if (q->request_fn) {
		struct request_list *rl;

		blk_queue_for_each_rl(rl, q)
			for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
				wake_up_all(&rl->wait[i]);
	}
}

/**
 * blk_queue_bypass_start - enter queue bypass mode
 * @q: queue of interest
 *
 * In bypass mode, only the dispatch FIFO queue of @q is used.  This
 * function makes @q enter bypass mode and drains all requests which were
 * throttled or issued before.  On return, it's guaranteed that no request
 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
 * inside queue or RCU read lock.
 */
void blk_queue_bypass_start(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (drain) {
		spin_lock_irq(q->queue_lock);
		__blk_drain_queue(q, false);
		spin_unlock_irq(q->queue_lock);

		/* ensure blk_queue_bypass() is %true inside RCU read lock */
		synchronize_rcu();
	}
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_start);

/**
 * blk_queue_bypass_end - leave queue bypass mode
 * @q: queue of interest
 *
 * Leave bypass mode and restore the normal queueing behavior.
 */
void blk_queue_bypass_end(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth)
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_end);

/**
 * blk_cleanup_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 * put it.  All future requests will be failed immediately with -ENODEV.
 */
void blk_cleanup_queue(struct request_queue *q)
{
	spinlock_t *lock = q->queue_lock;

	/* mark @q DYING, no new request or merges will be allowed afterwards */
	mutex_lock(&q->sysfs_lock);
	queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
	spin_lock_irq(lock);

	/*
	 * A dying queue is permanently in bypass mode till released.  Note
	 * that, unlike blk_queue_bypass_start(), we aren't performing
	 * synchronize_rcu() after entering bypass mode to avoid the delay
	 * as some drivers create and destroy a lot of queues while
	 * probing.  This is still safe because blk_release_queue() will be
	 * called only after the queue refcnt drops to zero and nothing,
	 * RCU or not, would be traversing the queue by then.
	 */
	q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);

	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
	queue_flag_set(QUEUE_FLAG_DYING, q);
	spin_unlock_irq(lock);
	mutex_unlock(&q->sysfs_lock);

	/*
	 * Drain all requests queued before DYING marking. Set DEAD flag to
	 * prevent that q->request_fn() gets invoked after draining finished.
	 */
	spin_lock_irq(lock);
	__blk_drain_queue(q, true);
	queue_flag_set(QUEUE_FLAG_DEAD, q);
	spin_unlock_irq(lock);

	/* @q won't process any more request, flush async actions */
	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
	blk_sync_queue(q);

	spin_lock_irq(lock);
	if (q->queue_lock != &q->__queue_lock)
		q->queue_lock = &q->__queue_lock;
	spin_unlock_irq(lock);

	/* @q is and will stay empty, shutdown and put */
	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

int blk_init_rl(struct request_list *rl, struct request_queue *q,
		gfp_t gfp_mask)
{
	if (unlikely(rl->rq_pool))
		return 0;

	rl->q = q;
	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
					  mempool_free_slab, request_cachep,
					  gfp_mask, q->node);
	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

void blk_exit_rl(struct request_list *rl)
{
	if (rl->rq_pool)
		mempool_destroy(rl->rq_pool);
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
	if (q->id < 0)
		goto fail_q;

	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";
	q->node = node_id;

	err = bdi_init(&q->backing_dev_info);
	if (err)
		goto fail_id;

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->queue_head);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
	INIT_LIST_HEAD(&q->blkg_list);
#endif
	INIT_LIST_HEAD(&q->flush_queue[0]);
	INIT_LIST_HEAD(&q->flush_queue[1]);
	INIT_LIST_HEAD(&q->flush_data_in_flight);