/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, bytes, bucket;

	ddir = rq_data_dir(rq);
	bytes = blk_rq_bytes(rq);

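	/*
	 * Buckets come in read/write pairs per power-of-two size class,
	 * starting at 512 bytes (2^9). For example, a 4096-byte read
	 * (ddir == 0, ilog2(4096) == 12) lands in bucket 0 + 2 * (12 - 9) = 6.
	 */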
	bucket = ddir + 2*(ilog2(bytes) - 9);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct hd_struct *part;
	unsigned int *inflight;
};

static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	/*
	 * index[0] counts the specific partition that was asked for.
	 */
	if (rq->part == mi->part)
		mi->inflight[0]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
{
	unsigned inflight[2];
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return inflight[0];
}

static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				     struct request *rq, void *priv,
				     bool reserved)
{
	struct mq_inflight *mi = priv;

	if (rq->part == mi->part)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
			 unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
}

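/*
 * Start freezing the queue: the first caller kills q_usage_counter so that
 * new blk_queue_enter() callers block, and kicks the hw queues so that
 * already-queued requests are dispatched and the counter can drain to zero.
 */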
void blk_freeze_queue_start(struct request_queue *q)
{
	int freeze_depth;

	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
	if (freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	int freeze_depth;

	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
	WARN_ON_ONCE(freeze_depth < 0);
	if (!freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent that the struct request end_io()
 * callback function is invoked. Once this function has returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	blk_mq_quiesce_queue_nowait(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(hctx->srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function restores the queue to the state it was in before it was
 * quiesced by blk_mq_quiesce_queue().
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

	/* dispatch requests which are inserted during quiescing */
	blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

/*
 * Only need start/end time stamping if we have stats enabled, or using
 * an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
}

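/*
 * Initialize a request from the tag set's static request pool. With an I/O
 * scheduler attached (BLK_MQ_REQ_INTERNAL) the tag passed in is a scheduler
 * tag and the driver tag is assigned later at dispatch time; otherwise it is
 * the driver tag itself and the request is published in hctx->tags->rqs[].
 */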
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		unsigned int tag, unsigned int op)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct request *rq = tags->static_rqs[tag];
	req_flags_t rq_flags = 0;

	if (data->flags & BLK_MQ_REQ_INTERNAL) {
		rq->tag = -1;
		rq->internal_tag = tag;
	} else {
		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
			rq_flags = RQF_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}
		rq->tag = tag;
		rq->internal_tag = -1;
		data->hctx->tags->rqs[rq->tag] = rq;
	}

	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = data->q;
	rq->mq_ctx = data->ctx;
	rq->mq_hctx = data->hctx;
	rq->rq_flags = rq_flags;
	rq->cmd_flags = op;
	if (data->flags & BLK_MQ_REQ_PREEMPT)
		rq->rq_flags |= RQF_PREEMPT;
	if (blk_queue_io_stat(data->q))
		rq->rq_flags |= RQF_IO_STAT;
	INIT_LIST_HEAD(&rq->queuelist);
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->io_start_time_ns = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->special = NULL;
	/* tag was already set */
	rq->extra_len = 0;
	WRITE_ONCE(rq->deadline, 0);

	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;
	rq->next_rq = NULL;

	data->ctx->rq_dispatched[op_is_sync(op)]++;
	refcount_set(&rq->ref, 1);
	return rq;
}

static struct request *blk_mq_get_request(struct request_queue *q,
					  struct bio *bio,
					  struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct request *rq;
	unsigned int tag;
	bool put_ctx_on_error = false;

	blk_queue_enter_live(q);
	data->q = q;
	if (likely(!data->ctx)) {
		data->ctx = blk_mq_get_ctx(q);
		put_ctx_on_error = true;
	}
	if (likely(!data->hctx))
		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
						data->ctx->cpu);
	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (e) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	} else {
		blk_mq_tag_busy(data->hctx);
	}

	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_TAG_FAIL) {
		if (put_ctx_on_error) {
			blk_mq_put_ctx(data->ctx);
			data->ctx = NULL;
		}
		blk_queue_exit(q);
		return NULL;
	}

	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
	if (!op_is_flush(data->cmd_flags)) {
		rq->elv.icq = NULL;
		if (e && e->type->ops.prepare_request) {
			if (e->type->icq_cache)
				blk_mq_sched_assign_ioc(rq);

			e->type->ops.prepare_request(rq, bio);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}
	data->hctx->queued++;
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	rq = blk_mq_get_request(q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	blk_mq_put_ctx(alloc_data.ctx);

	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
	struct request *rq;
	unsigned int cpu;
	int ret;

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
		blk_queue_exit(q);
		return ERR_PTR(-EXDEV);
	}
	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);

	rq = blk_mq_get_request(q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

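/*
 * Final teardown once the last reference to a request is dropped: release the
 * driver and scheduler tags, give the scheduler a chance to restart the hw
 * queue, and drop the queue usage reference taken at allocation time.
 */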
static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != -1)
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	if (sched_tag != -1)
		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.finish_request)
			e->type->ops.finish_request(rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	ctx->rq_completed[rq_is_sync(rq)]++;
	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->backing_dev_info);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	u64 now = 0;

	if (blk_mq_need_time_stamp(rq))
		now = ktime_get_ns();

	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq, now);
	}

	if (rq->internal_tag != -1)
		blk_mq_sched_completed_request(rq, now);

	blk_account_io_done(rq, now);

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		if (unlikely(blk_bidi_rq(rq)))
			blk_mq_free_request(rq->next_rq);
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;
	struct request_queue *q = rq->q;

	q->mq_ops->complete(rq);
}

static void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct request_queue *q = rq->q;
	bool shared = false;
	int cpu;

	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	/*
	 * For most single queue controllers there is only one irq vector
	 * for handling IO completion, and its affinity is set to span all
	 * possible CPUs. On most architectures this means the irq ends up
	 * being handled on one specific CPU.
	 *
	 * So complete the IO request in softirq context in the single queue
	 * case, to avoid degrading IO performance through irqs-off latency.
	 */
	if (q->nr_hw_queues == 1) {
		__blk_complete_request(rq);
		return;
	}

	/*
	 * For a polled request, always complete locally; it's pointless
	 * to redirect the completion.
	 */
	if ((rq->cmd_flags & REQ_HIPRI) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
		q->mq_ops->complete(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		q->mq_ops->complete(rq);
	}
	put_cpu();
}

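/*
 * Dispatch code runs under RCU for non-blocking hctxs and under SRCU when
 * BLK_MQ_F_BLOCKING is set; hctx_lock()/hctx_unlock() pick the matching
 * protection so callers don't have to care which one applies.
 */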
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
	__releases(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
		rcu_read_unlock();
	else
		srcu_read_unlock(hctx->srcu, srcu_idx);
}

static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
	__acquires(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* shut up gcc false positive */
		*srcu_idx = 0;
		rcu_read_lock();
	} else
		*srcu_idx = srcu_read_lock(hctx->srcu);
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
bool blk_mq_complete_request(struct request *rq)
{
	if (unlikely(blk_should_fake_timeout(rq->q)))
		return false;
	__blk_mq_complete_request(rq);
	return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);

int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_sched_started_request(rq);

	trace_block_rq_issue(q, rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		rq->io_start_time_ns = ktime_get_ns();
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
		rq->throtl_size = blk_rq_sectors(rq);
#endif
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears.  We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}
}
EXPORT_SYMBOL(blk_mq_start_request);

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_put_driver_tag(rq);

	trace_block_rq_requeue(q, rq);
	rq_qos_requeue(q, rq);

	if (blk_mq_request_started(rq)) {
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		rq->rq_flags &= ~RQF_TIMED_OUT;
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	BUG_ON(!list_empty(&rq->queuelist));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

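/*
 * Work item that drains q->requeue_list: requests flagged RQF_SOFTBARRIER or
 * RQF_DONTPREP are re-inserted first (at the head, or straight to the dispatch
 * list for RQF_DONTPREP), the remainder go in at the tail, and the hardware
 * queues are then run.
 */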
static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irq(&q->requeue_lock);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		/*
		 * If RQF_DONTPREP, rq has contained some driver specific
		 * data, so insert it to hctx dispatch list to avoid any
		 * merge.
		 */
		if (rq->rq_flags & RQF_DONTPREP)
			blk_mq_request_bypass_insert(rq, false);
		else
			blk_mq_sched_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       void *priv, bool reserved)
{
	/*
	 * If we find a request that is inflight and the queue matches,
	 * we know the queue is busy. Return false to stop the iteration.
	 */
	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req, reserved);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

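/*
 * Returns true if @rq is in flight and has run past its deadline; otherwise
 * *next is updated to track the earliest pending deadline so the timeout
 * timer can be re-armed accordingly.
 */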
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(jiffies, deadline))
		return true;

	if (*next == 0)
		*next = deadline;
	else if (time_after(*next, deadline))
		*next = deadline;
	return false;
}

static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/*
	 * Just do a quick check if it is expired before locking the request in
	 * so we're not unnecessarily synchronizing across CPUs.
	 */
	if (!blk_mq_req_expired(rq, next))
		return true;

	/*
	 * We have reason to believe the request may be expired. Take a
	 * reference on the request to lock this request lifetime into its
	 * currently allocated context to prevent it from being reallocated in
	 * the event the completion by-passes this timeout handler.
	 *
	 * If the reference was already released, then the driver beat the
	 * timeout handler to posting a natural completion.
	 */
	if (!refcount_inc_not_zero(&rq->ref))
		return true;

	/*
	 * The request is now locked and cannot be reallocated underneath the
	 * timeout handler's processing. Re-verify this exact request is truly
	 * expired; if it is not expired, then the request was completed and
	 * reallocated as a new request.
	 */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);

	return true;
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	unsigned long next = 0;
	struct blk_mq_hw_ctx *hctx;
	int i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting for
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);

	if (next != 0) {
		mod_timer(&q->timeout, next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ? start->index_hw[hctx->type] : 0;
	struct dispatch_rq_data data = {
		.hctx = hctx,
		.rq   = NULL,
	};

	__sbitmap_for_each_set(&hctx->ctx_map, off,
			       dispatch_rq_from_ctx, &data);

	return data.rq;
}

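/*
 * Map a dispatch batch size to its histogram slot: 0 stays 0, otherwise the
 * slot is log2 based and capped at BLK_MQ_MAX_DISPATCH_ORDER - 1.
 */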
static inline unsigned int queued_to_index(unsigned int queued)
{
	if (!queued)
		return 0;

	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}

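/*
 * Try to allocate a driver tag for a request that so far only holds a
 * scheduler tag. On success the request is published in hctx->tags->rqs[]
 * and accounted as active for shared tag sets; returns true if the request
 * ends up with a valid driver tag.
 */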
bool blk_mq_get_driver_tag(struct request *rq)
{
	struct blk_mq_alloc_data data = {
		.q = rq->q,
		.hctx = rq->mq_hctx,
		.flags = BLK_MQ_REQ_NOWAIT,
		.cmd_flags = rq->cmd_flags,
	};
	bool shared;

	if (rq->tag != -1)
		goto done;

	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
		data.flags |= BLK_MQ_REQ_RESERVED;

	shared = blk_mq_tag_busy(data.hctx);
	rq->tag = blk_mq_get_tag(&data);
	if (rq->tag >= 0) {
		if (shared) {
			rq->rq_flags |= RQF_MQ_INFLIGHT;
			atomic_inc(&data.hctx->nr_active);
		}
		data.hctx->tags->rqs[rq->tag] = rq;
	}

done:
	return rq->tag != -1;
}

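/*
 * Wait queue callback, run when a tag is freed while this hctx is waiting on
 * dispatch_wait: remove the wait entry and re-run the hardware queue.
 */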
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				int flags, void *key)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

	spin_lock(&hctx->dispatch_wait_lock);
	list_del_init(&wait->entry);
	spin_unlock(&hctx->dispatch_wait_lock);

	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
				 struct request *rq)
{
	struct wait_queue_head *wq;
	wait_queue_entry_t *wait;
	bool ret;

	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
		if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

		/*
		 * It's possible that a tag was freed in the window between the
		 * allocation failure and adding the hardware queue to the wait
		 * queue.
		 *
		 * Don't clear RESTART here, someone else could have set it.
		 * At most this will cost an extra queue run.
		 */
		return blk_mq_get_driver_tag(rq);
	}

	wait = &hctx->dispatch_wait;
	if (!list_empty_careful(&wait->entry))
		return false;

	wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;

	spin_lock_irq(&wq->lock);
	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(wq, wait);

	/*
	 * It's possible that a tag was freed in the window between the
	 * allocation failure and adding the hardware queue to the wait
	 * queue.
	 */
	ret = blk_mq_get_driver_tag(rq);
	if (!ret) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	/*
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * someone else gets the wakeup.
	 */
	list_del_init(&wait->entry);
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Update dispatch busy with the Exponential Weighted Moving Average (EWMA):
 * - EWMA is a simple way to compute a running average value
 * - a weight (7/8 and 1/8) is applied so that it can decrease exponentially
 * - a factor of 4 is used to avoid the result becoming too small (0); the
 *   exact factor doesn't matter because EWMA decreases exponentially
 */
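/*
 * For example, with dispatch_busy == 8, a busy dispatch moves the average to
 * (8 * 7 + (1 << 4)) / 8 = 9, while an idle dispatch decays it to
 * (8 * 7) / 8 = 7.
 */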
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma;

	if (hctx->queue->elevator)
		return;

	ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}

#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */

/*
 * Returns true if we did some work AND can potentially do more.
 */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *