   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Userspace block device - block device whose IO is handled from userspace
   4 *
   5 * Make full use of io_uring passthrough commands for communicating with
   6 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
   7 *
   8 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
   9 *
  10 * (part of code stolen from loop.c)
  11 */
  12#include <linux/module.h>
  13#include <linux/moduleparam.h>
  14#include <linux/sched.h>
  15#include <linux/fs.h>
  16#include <linux/pagemap.h>
  17#include <linux/file.h>
  18#include <linux/stat.h>
  19#include <linux/errno.h>
  20#include <linux/major.h>
  21#include <linux/wait.h>
  22#include <linux/blkdev.h>
  23#include <linux/init.h>
  24#include <linux/swap.h>
  25#include <linux/slab.h>
  26#include <linux/compat.h>
  27#include <linux/mutex.h>
  28#include <linux/writeback.h>
  29#include <linux/completion.h>
  30#include <linux/highmem.h>
  31#include <linux/sysfs.h>
  32#include <linux/miscdevice.h>
  33#include <linux/falloc.h>
  34#include <linux/uio.h>
  35#include <linux/ioprio.h>
  36#include <linux/sched/mm.h>
  37#include <linux/uaccess.h>
  38#include <linux/cdev.h>
  39#include <linux/io_uring.h>
  40#include <linux/blk-mq.h>
  41#include <linux/delay.h>
  42#include <linux/mm.h>
  43#include <asm/page.h>
  44#include <linux/task_work.h>
  45#include <uapi/linux/ublk_cmd.h>
  46
  47#define UBLK_MINORS		(1U << MINORBITS)
  48
  49/* All UBLK_F_* have to be included into UBLK_F_ALL */
  50#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
  51		| UBLK_F_URING_CMD_COMP_IN_TASK \
  52		| UBLK_F_NEED_GET_DATA \
  53		| UBLK_F_USER_RECOVERY \
  54		| UBLK_F_USER_RECOVERY_REISSUE)
  55
  56/* All UBLK_PARAM_TYPE_* should be included here */
  57#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
  58
  59struct ublk_rq_data {
  60	struct llist_node node;
  61	struct callback_head work;
  62};
  63
  64struct ublk_uring_cmd_pdu {
  65	struct ublk_queue *ubq;
  66};
  67
  68/*
  69 * io command is active: sqe cmd is received, and its cqe isn't done
  70 *
  71 * If the flag is set, the io command is owned by the ublk driver, and waits
  72 * for an incoming blk-mq request from the ublk block device.
  73 *
  74 * If the flag is cleared, the io command is completed and owned by the
  75 * ublk server.
  76 */
  77#define UBLK_IO_FLAG_ACTIVE	0x01
  78
  79/*
  80 * IO command is completed via cqe, and it is being handled by ublksrv, and
  81 * not committed yet
  82 *
  83 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
  84 * for cross verification
  85 */
  86#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
  87
  88/*
  89 * IO command is aborted, so this flag is set in case of
  90 * !UBLK_IO_FLAG_ACTIVE.
  91 *
  92 * After this flag is observed, any pending or new incoming request
  93 * associated with this io command will be failed immediately
  94 */
  95#define UBLK_IO_FLAG_ABORTED 0x04
  96
  97/*
  98 * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command requires the
  99 * data buffer address from ublksrv.
 100 *
 101 * Then, bio data could be copied into this data buffer for a WRITE request
 102 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
 103 */
 104#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
 105
 106struct ublk_io {
 107	/* userspace buffer address from io cmd */
 108	__u64	addr;
 109	unsigned int flags;
 110	int res;
 111
 112	struct io_uring_cmd *cmd;
 113};
 114
 115struct ublk_queue {
 116	int q_id;
 117	int q_depth;
 118
 119	unsigned long flags;
 120	struct task_struct	*ubq_daemon;
 121	char *io_cmd_buf;
 122
 123	struct llist_head	io_cmds;
 124
 125	unsigned long io_addr;	/* mapped vm address */
 126	unsigned int max_io_sz;
 127	bool force_abort;
 128	unsigned short nr_io_ready;	/* how many ios setup */
 129	struct ublk_device *dev;
 130	struct ublk_io ios[];
 131};
 132
 133#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)
 134
 135struct ublk_device {
 136	struct gendisk		*ub_disk;
 137
 138	char	*__queues;
 139
 140	unsigned int	queue_size;
 141	struct ublksrv_ctrl_dev_info	dev_info;
 142
 143	struct blk_mq_tag_set	tag_set;
 144
 145	struct cdev		cdev;
 146	struct device		cdev_dev;
 147
 148#define UB_STATE_OPEN		0
 149#define UB_STATE_USED		1
 150	unsigned long		state;
 151	int			ub_number;
 152
 153	struct mutex		mutex;
 154
 155	spinlock_t		mm_lock;
 156	struct mm_struct	*mm;
 157
 158	struct ublk_params	params;
 159
 160	struct completion	completion;
 161	unsigned int		nr_queues_ready;
 162	atomic_t		nr_aborted_queues;
 163
 164	/*
 165	 * Our ubq->ubq_daemon may be killed without any notification, so
 166	 * monitor each queue's daemon periodically
 167	 */
 168	struct delayed_work	monitor_work;
 169	struct work_struct	quiesce_work;
 170	struct work_struct	stop_work;
 171};
 172
 173/* header of ublk_params */
 174struct ublk_params_header {
 175	__u32	len;
 176	__u32	types;
 177};
 178
 179static dev_t ublk_chr_devt;
 180static struct class *ublk_chr_class;
 181
 182static DEFINE_IDR(ublk_index_idr);
 183static DEFINE_SPINLOCK(ublk_idr_lock);
 184static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
 185
 186static DEFINE_MUTEX(ublk_ctl_mutex);
 187
 188static struct miscdevice ublk_misc;
 189
 190static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 191{
 192	struct request_queue *q = ub->ub_disk->queue;
 193	const struct ublk_param_basic *p = &ub->params.basic;
 194
 195	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
 196	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
 197	blk_queue_io_min(q, 1 << p->io_min_shift);
 198	blk_queue_io_opt(q, 1 << p->io_opt_shift);
 199
 200	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
 201			p->attrs & UBLK_ATTR_FUA);
 202	if (p->attrs & UBLK_ATTR_ROTATIONAL)
 203		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
 204	else
 205		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 206
 207	blk_queue_max_hw_sectors(q, p->max_sectors);
 208	blk_queue_chunk_sectors(q, p->chunk_sectors);
 209	blk_queue_virt_boundary(q, p->virt_boundary_mask);
 210
 211	if (p->attrs & UBLK_ATTR_READ_ONLY)
 212		set_disk_ro(ub->ub_disk, true);
 213
 214	set_capacity(ub->ub_disk, p->dev_sectors);
 215}
 216
 217static void ublk_dev_param_discard_apply(struct ublk_device *ub)
 218{
 219	struct request_queue *q = ub->ub_disk->queue;
 220	const struct ublk_param_discard *p = &ub->params.discard;
 221
 222	q->limits.discard_alignment = p->discard_alignment;
 223	q->limits.discard_granularity = p->discard_granularity;
 224	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
 225	blk_queue_max_write_zeroes_sectors(q,
 226			p->max_write_zeroes_sectors);
 227	blk_queue_max_discard_segments(q, p->max_discard_segments);
 228}
 229
 230static int ublk_validate_params(const struct ublk_device *ub)
 231{
 232	/* basic param is the only one which must be set */
 233	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
 234		const struct ublk_param_basic *p = &ub->params.basic;
 235
 236		if (p->logical_bs_shift > PAGE_SHIFT)
 237			return -EINVAL;
 238
 239		if (p->logical_bs_shift > p->physical_bs_shift)
 240			return -EINVAL;
 241
 242		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
 243			return -EINVAL;
 244	} else
 245		return -EINVAL;
 246
 247	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
 248		const struct ublk_param_discard *p = &ub->params.discard;
 249
 250		/* So far, only support single segment discard */
 251		if (p->max_discard_sectors && p->max_discard_segments != 1)
 252			return -EINVAL;
 253
 254		if (!p->discard_granularity)
 255			return -EINVAL;
 256	}
 257
 258	return 0;
 259}
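As a concrete illustration of the constraints checked above, here is a minimal sketch (not part of the driver) of a parameter set that passes ublk_validate_params(), assuming a 1 GiB device with 512-byte logical blocks whose server side advertises a 64 KiB max_io_buf_bytes; the name example_params is purely illustrative:

#include <linux/ublk_cmd.h>	/* struct ublk_params, UBLK_PARAM_TYPE_BASIC */

/* Illustrative only: 1 GiB device, 512 B logical / 4 KiB physical blocks. */
static const struct ublk_params example_params = {
	.types	= UBLK_PARAM_TYPE_BASIC,
	.basic	= {
		.logical_bs_shift	= 9,	/* 512 B, must not exceed PAGE_SHIFT */
		.physical_bs_shift	= 12,	/* must be >= logical_bs_shift */
		.io_min_shift		= 9,
		.io_opt_shift		= 12,
		.max_sectors		= 128,	/* must be <= max_io_buf_bytes >> 9 (64 KiB here) */
		.dev_sectors		= 1 << 21,	/* 1 GiB worth of 512 B sectors */
	},
};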
 260
 261static int ublk_apply_params(struct ublk_device *ub)
 262{
 263	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
 264		return -EINVAL;
 265
 266	ublk_dev_param_basic_apply(ub);
 267
 268	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
 269		ublk_dev_param_discard_apply(ub);
 270
 271	return 0;
 272}
 273
 274static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
 275{
 276	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
 277			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
 278		return true;
 279	return false;
 280}
 281
 282static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
 283{
 284	if (ubq->flags & UBLK_F_NEED_GET_DATA)
 285		return true;
 286	return false;
 287}
 288
 289static struct ublk_device *ublk_get_device(struct ublk_device *ub)
 290{
 291	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
 292		return ub;
 293	return NULL;
 294}
 295
 296static void ublk_put_device(struct ublk_device *ub)
 297{
 298	put_device(&ub->cdev_dev);
 299}
 300
 301static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
 302		int qid)
 303{
 304	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
 305}
 306
 307static inline bool ublk_rq_has_data(const struct request *rq)
 308{
 309	return rq->bio && bio_has_data(rq->bio);
 310}
 311
 312static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
 313		int tag)
 314{
 315	return (struct ublksrv_io_desc *)
 316		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
 317}
 318
 319static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
 320{
 321	return ublk_get_queue(ub, q_id)->io_cmd_buf;
 322}
 323
 324static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
 325{
 326	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
 327
 328	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
 329			PAGE_SIZE);
 330}
 331
 332static inline bool ublk_queue_can_use_recovery_reissue(
 333		struct ublk_queue *ubq)
 334{
 335	if ((ubq->flags & UBLK_F_USER_RECOVERY) &&
 336			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
 337		return true;
 338	return false;
 339}
 340
 341static inline bool ublk_queue_can_use_recovery(
 342		struct ublk_queue *ubq)
 343{
 344	if (ubq->flags & UBLK_F_USER_RECOVERY)
 345		return true;
 346	return false;
 347}
 348
 349static inline bool ublk_can_use_recovery(struct ublk_device *ub)
 350{
 351	if (ub->dev_info.flags & UBLK_F_USER_RECOVERY)
 352		return true;
 353	return false;
 354}
 355
 356static void ublk_free_disk(struct gendisk *disk)
 357{
 358	struct ublk_device *ub = disk->private_data;
 359
 360	clear_bit(UB_STATE_USED, &ub->state);
 361	put_device(&ub->cdev_dev);
 362}
 363
 364static const struct block_device_operations ub_fops = {
 365	.owner =	THIS_MODULE,
 366	.free_disk =	ublk_free_disk,
 367};
 368
 369#define UBLK_MAX_PIN_PAGES	32
 370
 371struct ublk_map_data {
 372	const struct ublk_queue *ubq;
 373	const struct request *rq;
 374	const struct ublk_io *io;
 375	unsigned max_bytes;
 376};
 377
 378struct ublk_io_iter {
 379	struct page *pages[UBLK_MAX_PIN_PAGES];
 380	unsigned pg_off;	/* offset in the 1st page in pages */
 381	int nr_pages;		/* how many page pointers in pages */
 382	struct bio *bio;
 383	struct bvec_iter iter;
 384};
 385
 386static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
 387		unsigned max_bytes, bool to_vm)
 388{
 389	const unsigned total = min_t(unsigned, max_bytes,
 390			PAGE_SIZE - data->pg_off +
 391			((data->nr_pages - 1) << PAGE_SHIFT));
 392	unsigned done = 0;
 393	unsigned pg_idx = 0;
 394
 395	while (done < total) {
 396		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
 397		const unsigned int bytes = min3(bv.bv_len, total - done,
 398				(unsigned)(PAGE_SIZE - data->pg_off));
 399		void *bv_buf = bvec_kmap_local(&bv);
 400		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
 401
 402		if (to_vm)
 403			memcpy(pg_buf + data->pg_off, bv_buf, bytes);
 404		else
 405			memcpy(bv_buf, pg_buf + data->pg_off, bytes);
 406
 407		kunmap_local(pg_buf);
 408		kunmap_local(bv_buf);
 409
 410		/* advance page array */
 411		data->pg_off += bytes;
 412		if (data->pg_off == PAGE_SIZE) {
 413			pg_idx += 1;
 414			data->pg_off = 0;
 415		}
 416
 417		done += bytes;
 418
 419		/* advance bio */
 420		bio_advance_iter_single(data->bio, &data->iter, bytes);
 421		if (!data->iter.bi_size) {
 422			data->bio = data->bio->bi_next;
 423			if (data->bio == NULL)
 424				break;
 425			data->iter = data->bio->bi_iter;
 426		}
 427	}
 428
 429	return done;
 430}
 431
 432static inline int ublk_copy_user_pages(struct ublk_map_data *data,
 433		bool to_vm)
 434{
 435	const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
 436	const unsigned long start_vm = data->io->addr;
 437	unsigned int done = 0;
 438	struct ublk_io_iter iter = {
 439		.pg_off	= start_vm & (PAGE_SIZE - 1),
 440		.bio	= data->rq->bio,
 441		.iter	= data->rq->bio->bi_iter,
 442	};
 443	const unsigned int nr_pages = round_up(data->max_bytes +
 444			(start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
 445
 446	while (done < nr_pages) {
 447		const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
 448				nr_pages - done);
 449		unsigned i, len;
 450
 451		iter.nr_pages = get_user_pages_fast(start_vm +
 452				(done << PAGE_SHIFT), to_pin, gup_flags,
 453				iter.pages);
 454		if (iter.nr_pages <= 0)
 455			return done == 0 ? iter.nr_pages : done;
 456		len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
 457		for (i = 0; i < iter.nr_pages; i++) {
 458			if (to_vm)
 459				set_page_dirty(iter.pages[i]);
 460			put_page(iter.pages[i]);
 461		}
 462		data->max_bytes -= len;
 463		done += iter.nr_pages;
 464	}
 465
 466	return done;
 467}
 468
 469static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 470		struct ublk_io *io)
 471{
 472	const unsigned int rq_bytes = blk_rq_bytes(req);
 473	/*
 474	 * No zero copy: copying WRITE request data is delayed until we run in
 475	 * the ublksrv daemon context, where pinning the daemon's pages is
 476	 * pretty fast, see ublk_copy_user_pages()
 477	 */
 478	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
 479		return rq_bytes;
 480
 481	if (ublk_rq_has_data(req)) {
 482		struct ublk_map_data data = {
 483			.ubq	=	ubq,
 484			.rq	=	req,
 485			.io	=	io,
 486			.max_bytes =	rq_bytes,
 487		};
 488
 489		ublk_copy_user_pages(&data, true);
 490
 491		return rq_bytes - data.max_bytes;
 492	}
 493	return rq_bytes;
 494}
 495
 496static int ublk_unmap_io(const struct ublk_queue *ubq,
 497		const struct request *req,
 498		struct ublk_io *io)
 499{
 500	const unsigned int rq_bytes = blk_rq_bytes(req);
 501
 502	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
 503		struct ublk_map_data data = {
 504			.ubq	=	ubq,
 505			.rq	=	req,
 506			.io	=	io,
 507			.max_bytes =	io->res,
 508		};
 509
 510		WARN_ON_ONCE(io->res > rq_bytes);
 511
 512		ublk_copy_user_pages(&data, false);
 513
 514		return io->res - data.max_bytes;
 515	}
 516	return rq_bytes;
 517}
 518
 519static inline unsigned int ublk_req_build_flags(struct request *req)
 520{
 521	unsigned flags = 0;
 522
 523	if (req->cmd_flags & REQ_FAILFAST_DEV)
 524		flags |= UBLK_IO_F_FAILFAST_DEV;
 525
 526	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
 527		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
 528
 529	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
 530		flags |= UBLK_IO_F_FAILFAST_DRIVER;
 531
 532	if (req->cmd_flags & REQ_META)
 533		flags |= UBLK_IO_F_META;
 534
 535	if (req->cmd_flags & REQ_FUA)
 536		flags |= UBLK_IO_F_FUA;
 537
 538	if (req->cmd_flags & REQ_NOUNMAP)
 539		flags |= UBLK_IO_F_NOUNMAP;
 540
 541	if (req->cmd_flags & REQ_SWAP)
 542		flags |= UBLK_IO_F_SWAP;
 543
 544	return flags;
 545}
 546
 547static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 548{
 549	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
 550	struct ublk_io *io = &ubq->ios[req->tag];
 551	u32 ublk_op;
 552
 553	switch (req_op(req)) {
 554	case REQ_OP_READ:
 555		ublk_op = UBLK_IO_OP_READ;
 556		break;
 557	case REQ_OP_WRITE:
 558		ublk_op = UBLK_IO_OP_WRITE;
 559		break;
 560	case REQ_OP_FLUSH:
 561		ublk_op = UBLK_IO_OP_FLUSH;
 562		break;
 563	case REQ_OP_DISCARD:
 564		ublk_op = UBLK_IO_OP_DISCARD;
 565		break;
 566	case REQ_OP_WRITE_ZEROES:
 567		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
 568		break;
 569	default:
 570		return BLK_STS_IOERR;
 571	}
 572
 573	/* translate to ublk opcodes since kernel REQ_OP values may change */
 574	iod->op_flags = ublk_op | ublk_req_build_flags(req);
 575	iod->nr_sectors = blk_rq_sectors(req);
 576	iod->start_sector = blk_rq_pos(req);
 577	iod->addr = io->addr;
 578
 579	return BLK_STS_OK;
 580}
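On the server side, the descriptor written by ublk_setup_iod() above is read from the mmap'ed command buffer and dispatched on its opcode. Below is a minimal sketch of a loop-style daemon handler backed by a plain file; it assumes the ublksrv_get_op() helper provided by the same uapi header, and the name handle_io_desc() is illustrative only:

#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <linux/ublk_cmd.h>	/* struct ublksrv_io_desc, UBLK_IO_OP_* */

/*
 * Serve one fetched io descriptor from a plain backing file. The return
 * value (byte count or -errno) is what the daemon would feed back as
 * ublksrv_io_cmd.result in its next COMMIT_AND_FETCH_REQ.
 */
static int handle_io_desc(int backing_fd, const struct ublksrv_io_desc *iod,
			  void *buf)
{
	off_t off = (off_t)iod->start_sector << 9;
	size_t len = (size_t)iod->nr_sectors << 9;
	ssize_t ret;

	switch (ublksrv_get_op(iod)) {	/* low byte of op_flags, set by ublk_setup_iod() */
	case UBLK_IO_OP_READ:
		ret = pread(backing_fd, buf, len, off);
		break;
	case UBLK_IO_OP_WRITE:
		ret = pwrite(backing_fd, buf, len, off);
		break;
	case UBLK_IO_OP_FLUSH:
		ret = fdatasync(backing_fd);
		break;
	default:
		return -ENOTSUP;
	}
	return ret < 0 ? -errno : (int)ret;
}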
 581
 582static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
 583		struct io_uring_cmd *ioucmd)
 584{
 585	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
 586}
 587
 588static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
 589{
 590	return ubq->ubq_daemon->flags & PF_EXITING;
 591}
 592
 593/* todo: handle partial completion */
 594static void ublk_complete_rq(struct request *req)
 595{
 596	struct ublk_queue *ubq = req->mq_hctx->driver_data;
 597	struct ublk_io *io = &ubq->ios[req->tag];
 598	unsigned int unmapped_bytes;
 599
 600	/* fail the read IO if nothing was read */
 601	if (!io->res && req_op(req) == REQ_OP_READ)
 602		io->res = -EIO;
 603
 604	if (io->res < 0) {
 605		blk_mq_end_request(req, errno_to_blk_status(io->res));
 606		return;
 607	}
 608
 609	/*
 610	 * FLUSH or DISCARD usually won't return valid data bytes, so end them
 611	 * directly.
 612	 *
 613	 * Neither of them needs unmapping.
 614	 */
 615	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
 616		blk_mq_end_request(req, BLK_STS_OK);
 617		return;
 618	}
 619
 620	/* for a READ request, copy the data at iod->addr into the rq buffers */
 621	unmapped_bytes = ublk_unmap_io(ubq, req, io);
 622
 623	/*
 624	 * Extremely unlikely since the data was filled in just above.
 625	 *
 626	 * Simply re-read for this unlikely case.
 627	 */
 628	if (unlikely(unmapped_bytes < io->res))
 629		io->res = unmapped_bytes;
 630
 631	if (blk_update_request(req, BLK_STS_OK, io->res))
 632		blk_mq_requeue_request(req, true);
 633	else
 634		__blk_mq_end_request(req, BLK_STS_OK);
 635}
 636
 637/*
 638 * Since __ublk_rq_task_work always fails requests immediately during
 639 * exiting, __ublk_fail_req() is only called from abort context during
 640 * exiting. So lock is unnecessary.
 641 *
 642 * Also aborting may not be started yet, keep in mind that one failed
 643 * request may be issued by block layer again.
 644 */
 645static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
 646		struct request *req)
 647{
 648	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
 649
 650	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
 651		io->flags |= UBLK_IO_FLAG_ABORTED;
 652		if (ublk_queue_can_use_recovery_reissue(ubq))
 653			blk_mq_requeue_request(req, false);
 654		else
 655			blk_mq_end_request(req, BLK_STS_IOERR);
 656	}
 657}
 658
 659static void ubq_complete_io_cmd(struct ublk_io *io, int res)
 660{
 661	/* mark this cmd owned by ublksrv */
 662	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
 663
 664	/*
 665	 * clear ACTIVE since we are done with this sqe/cmd slot
 666	 * A new io cmd can only be accepted when it is not active.
 667	 */
 668	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
 669
 670	/* tell ublksrv one io request is coming */
 671	io_uring_cmd_done(io->cmd, res, 0);
 672}
 673
 674#define UBLK_REQUEUE_DELAY_MS	3
 675
 676static inline void __ublk_abort_rq(struct ublk_queue *ubq,
 677		struct request *rq)
 678{
 679	/* We cannot process this rq, so requeue it (recovery) or fail it. */
 680	if (ublk_queue_can_use_recovery(ubq))
 681		blk_mq_requeue_request(rq, false);
 682	else
 683		blk_mq_end_request(rq, BLK_STS_IOERR);
 684
 685	mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
 686}
 687
 688static inline void __ublk_rq_task_work(struct request *req)
 689{
 690	struct ublk_queue *ubq = req->mq_hctx->driver_data;
 691	int tag = req->tag;
 692	struct ublk_io *io = &ubq->ios[tag];
 693	unsigned int mapped_bytes;
 694
 695	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
 696			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
 697			ublk_get_iod(ubq, req->tag)->addr);
 698
 699	/*
 700	 * Task is exiting if either:
 701	 *
 702	 * (1) current != ubq_daemon.
 703	 * io_uring_cmd_complete_in_task() tries to run task_work
 704	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
 705	 *
 706	 * (2) current->flags & PF_EXITING.
 707	 */
 708	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
 709		__ublk_abort_rq(ubq, req);
 710		return;
 711	}
 712
 713	if (ublk_need_get_data(ubq) &&
 714			(req_op(req) == REQ_OP_WRITE ||
 715			req_op(req) == REQ_OP_FLUSH)) {
 716		/*
 717		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
 718		 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
 719		 * and notify it.
 720		 */
 721		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
 722			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
 723			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
 724					__func__, io->cmd->cmd_op, ubq->q_id,
 725					req->tag, io->flags);
 726			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
 727			return;
 728		}
 729		/*
 730		 * We have handled UBLK_IO_NEED_GET_DATA command,
 731		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
 732		 * do the copy work.
 733		 */
 734		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
 735		/* update iod->addr because ublksrv may have passed a new io buffer */
 736		ublk_get_iod(ubq, req->tag)->addr = io->addr;
 737		pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
 738				__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
 739				ublk_get_iod(ubq, req->tag)->addr);
 740	}
 741
 742	mapped_bytes = ublk_map_io(ubq, req, io);
 743
 744	/* partially mapped, update io descriptor */
 745	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
 746		/*
 747		 * Nothing mapped, retry until we succeed.
 748		 *
 749		 * We may never succeed in mapping any bytes here because
 750		 * of OOM. TODO: reserve one buffer with single page pinned
 751		 * for providing forward progress guarantee.
 752		 */
 753		if (unlikely(!mapped_bytes)) {
 754			blk_mq_requeue_request(req, false);
 755			blk_mq_delay_kick_requeue_list(req->q,
 756					UBLK_REQUEUE_DELAY_MS);
 757			return;
 758		}
 759
 760		ublk_get_iod(ubq, req->tag)->nr_sectors =
 761			mapped_bytes >> 9;
 762	}
 763
 764	ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
 765}
 766
 767static inline void ublk_forward_io_cmds(struct ublk_queue *ubq)
 768{
 769	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
 770	struct ublk_rq_data *data, *tmp;
 771
 772	io_cmds = llist_reverse_order(io_cmds);
 773	llist_for_each_entry_safe(data, tmp, io_cmds, node)
 774		__ublk_rq_task_work(blk_mq_rq_from_pdu(data));
 775}
 776
 777static inline void ublk_abort_io_cmds(struct ublk_queue *ubq)
 778{
 779	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
 780	struct ublk_rq_data *data, *tmp;
 781
 782	llist_for_each_entry_safe(data, tmp, io_cmds, node)
 783		__ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
 784}
 785
 786static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
 787{
 788	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 789	struct ublk_queue *ubq = pdu->ubq;
 790
 791	ublk_forward_io_cmds(ubq);
 792}
 793
 794static void ublk_rq_task_work_fn(struct callback_head *work)
 795{
 796	struct ublk_rq_data *data = container_of(work,
 797			struct ublk_rq_data, work);
 798	struct request *req = blk_mq_rq_from_pdu(data);
 799	struct ublk_queue *ubq = req->mq_hctx->driver_data;
 800
 801	ublk_forward_io_cmds(ubq);
 802}
 803
 804static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 805{
 806	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
 807	struct ublk_io *io;
 808
 809	if (!llist_add(&data->node, &ubq->io_cmds))
 810		return;
 811
 812	io = &ubq->ios[rq->tag];
 813	/*
 814	 * If the check passes, we know that this is a re-issued request aborted
 815	 * previously in monitor_work because the ubq_daemon(cmd's task) is
 816	 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
 817	 * because this ioucmd's io_uring context may be freed now if no inflight
 818	 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
 819	 *
 820	 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request (releasing
 821	 * the tag). Then the request is re-started (allocating the tag) and we are here.
 822	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
 823	 * guarantees that this is a re-issued request that was aborted previously.
 824	 */
 825	if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
 826		ublk_abort_io_cmds(ubq);
 827	} else if (ublk_can_use_task_work(ubq)) {
 828		if (task_work_add(ubq->ubq_daemon, &data->work,
 829					TWA_SIGNAL_NO_IPI))
 830			ublk_abort_io_cmds(ubq);
 831	} else {
 832		struct io_uring_cmd *cmd = io->cmd;
 833		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 834
 835		pdu->ubq = ubq;
 836		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
 837	}
 838}
 839
 840static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 841		const struct blk_mq_queue_data *bd)
 842{
 843	struct ublk_queue *ubq = hctx->driver_data;
 844	struct request *rq = bd->rq;
 845	blk_status_t res;
 846
 847	/* fill iod to slot in io cmd buffer */
 848	res = ublk_setup_iod(ubq, rq);
 849	if (unlikely(res != BLK_STS_OK))
 850		return BLK_STS_IOERR;
 851
 852	/* With recovery feature enabled, force_abort is set in
 853	 * ublk_stop_dev() before calling del_gendisk(). We have to
 854	 * abort all requeued and new rqs here to let del_gendisk()
 855	 * move on. Besides, we must not call io_uring_cmd_complete_in_task()
 856	 * to avoid UAF on io_uring ctx.
 857	 *
 858	 * Note: force_abort is guaranteed to be seen because it is set
 859	 * before the request queue is unquiesced.
 860	 */
 861	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
 862		return BLK_STS_IOERR;
 863
 864	blk_mq_start_request(bd->rq);
 865
 866	if (unlikely(ubq_daemon_is_dying(ubq))) {
 867		__ublk_abort_rq(ubq, rq);
 868		return BLK_STS_OK;
 869	}
 870
 871	ublk_queue_cmd(ubq, rq);
 872
 873	return BLK_STS_OK;
 874}
 875
 876static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 877		unsigned int hctx_idx)
 878{
 879	struct ublk_device *ub = driver_data;
 880	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
 881
 882	hctx->driver_data = ubq;
 883	return 0;
 884}
 885
 886static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
 887		unsigned int hctx_idx, unsigned int numa_node)
 888{
 889	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
 890
 891	init_task_work(&data->work, ublk_rq_task_work_fn);
 892	return 0;
 893}
 894
 895static const struct blk_mq_ops ublk_mq_ops = {
 896	.queue_rq       = ublk_queue_rq,
 897	.init_hctx	= ublk_init_hctx,
 898	.init_request   = ublk_init_rq,
 899};
 900
 901static int ublk_ch_open(struct inode *inode, struct file *filp)
 902{
 903	struct ublk_device *ub = container_of(inode->i_cdev,
 904			struct ublk_device, cdev);
 905
 906	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
 907		return -EBUSY;
 908	filp->private_data = ub;
 909	return 0;
 910}
 911
 912static int ublk_ch_release(struct inode *inode, struct file *filp)
 913{
 914	struct ublk_device *ub = filp->private_data;
 915
 916	clear_bit(UB_STATE_OPEN, &ub->state);
 917	return 0;
 918}
 919
 920/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
 921static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
 922{
 923	struct ublk_device *ub = filp->private_data;
 924	size_t sz = vma->vm_end - vma->vm_start;
 925	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
 926	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
 927	int q_id, ret = 0;
 928
 929	spin_lock(&ub->mm_lock);
 930	if (!ub->mm)
 931		ub->mm = current->mm;
 932	if (current->mm != ub->mm)
 933		ret = -EINVAL;
 934	spin_unlock(&ub->mm_lock);
 935
 936	if (ret)
 937		return ret;
 938
 939	if (vma->vm_flags & VM_WRITE)
 940		return -EPERM;
 941
 942	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
 943	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
 944		return -EINVAL;
 945
 946	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
 947	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
 948			__func__, q_id, current->pid, vma->vm_start,
 949			phys_off, (unsigned long)sz);
 950
 951	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
 952		return -EINVAL;
 953
 954	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
 955	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 956}
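The offset math above has a userspace mirror. Here is a minimal sketch, not part of the driver, of how a ublksrv-style daemon could map the read-only descriptor buffer of one queue from /dev/ublkcN; the helper name map_io_cmd_buf() is illustrative, while UBLKSRV_CMD_BUF_OFFSET, UBLK_MAX_QUEUE_DEPTH and struct ublksrv_io_desc come from the uapi header:

#include <stddef.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <linux/ublk_cmd.h>	/* UBLKSRV_CMD_BUF_OFFSET, UBLK_MAX_QUEUE_DEPTH */

/*
 * Map the io descriptor array of queue q_id. The offset must land in that
 * queue's UBLK_MAX_QUEUE_DEPTH-sized window, and the length must equal the
 * page-rounded queue_depth * sizeof(struct ublksrv_io_desc), exactly as
 * checked by ublk_ch_mmap() above. PROT_READ only: writable mappings are
 * rejected by the driver.
 */
static struct ublksrv_io_desc *map_io_cmd_buf(int char_fd, unsigned int q_id,
					      unsigned int queue_depth)
{
	size_t page_sz = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = (queue_depth * sizeof(struct ublksrv_io_desc) + page_sz - 1)
			& ~(page_sz - 1);
	off_t off = UBLKSRV_CMD_BUF_OFFSET +
		(off_t)q_id * UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
	void *p = mmap(NULL, len, PROT_READ, MAP_SHARED, char_fd, off);

	return p == MAP_FAILED ? NULL : p;
}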
 957
 958static void ublk_commit_completion(struct ublk_device *ub,
 959		struct ublksrv_io_cmd *ub_cmd)
 960{
 961	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
 962	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
 963	struct ublk_io *io = &ubq->ios[tag];
 964	struct request *req;
 965
 966	/* now this cmd slot is owned by the ublk driver */
 967	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
 968	io->res = ub_cmd->result;
 969
 970	/* find the io request and complete */
 971	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
 972
 973	if (req && likely(!blk_should_fake_timeout(req->q)))
 974		ublk_complete_rq(req);
 975}
 976
 977/*
 978 * When ->ubq_daemon is exiting, either new requests are ended immediately,
 979 * or any queued io command is drained, so it is safe to abort the queue
 980 * locklessly
 981 */
 982static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 983{
 984	int i;
 985
 986	if (!ublk_get_device(ub))
 987		return;
 988
 989	for (i = 0; i < ubq->q_depth; i++) {
 990		struct ublk_io *io = &ubq->ios[i];
 991
 992		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
 993			struct request *rq;
 994
 995			/*
 996			 * Either we fail the request or ublk_rq_task_work_fn
 997			 * will do it
 998			 */
 999			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
1000			if (rq)
1001				__ublk_fail_req(ubq, io, rq);
1002		}
1003	}
1004	ublk_put_device(ub);
1005}
1006
1007static void ublk_daemon_monitor_work(struct work_struct *work)
1008{
1009	struct ublk_device *ub =
1010		container_of(work, struct ublk_device, monitor_work.work);
1011	int i;
1012
1013	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1014		struct ublk_queue *ubq = ublk_get_queue(ub, i);
1015
1016		if (ubq_daemon_is_dying(ubq)) {
1017			if (ublk_queue_can_use_recovery(ubq))
1018				schedule_work(&ub->quiesce_work);
1019			else
1020				schedule_work(&ub->stop_work);
1021
1022			/* aborting the queue is for making forward progress */
1023			ublk_abort_queue(ub, ubq);
1024		}
1025	}
1026
1027	/*
1028	 * We can't schedule monitor work once ub's state is no longer UBLK_S_DEV_LIVE,
1029	 * i.e. after ublk_remove() or __ublk_quiesce_dev() is started.
1030	 *
1031	 * No need to hold ub->mutex: monitor work is canceled after the state is
1032	 * marked as not LIVE, so the new state is observed reliably.
1033	 */
1034	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1035		schedule_delayed_work(&ub->monitor_work,
1036				UBLK_DAEMON_MONITOR_PERIOD);
1037}
1038
1039static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1040{
1041	return ubq->nr_io_ready == ubq->q_depth;
1042}
1043
1044static void ublk_cancel_queue(struct ublk_queue *ubq)
1045{
1046	int i;
1047
1048	if (!ublk_queue_ready(ubq))
1049		return;
1050
1051	for (i = 0; i < ubq->q_depth; i++) {
1052		struct ublk_io *io = &ubq->ios[i];
1053
1054		if (io->flags & UBLK_IO_FLAG_ACTIVE)
1055			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
1056	}
1057
1058	/* all io commands are canceled */
1059	ubq->nr_io_ready = 0;
1060}
1061
1062/* Cancel all pending commands, must be called after del_gendisk() returns */
1063static void ublk_cancel_dev(struct ublk_device *ub)
1064{
1065	int i;
1066
1067	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1068		ublk_cancel_queue(ublk_get_queue(ub, i));
1069}
1070
1071static bool ublk_check_inflight_rq(struct request *rq, void *data)
1072{
1073	bool *idle = data;
1074
1075	if (blk_mq_request_started(rq)) {
1076		*idle = false;
1077		return false;
1078	}
1079	return true;
1080}
1081
1082static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1083{
1084	bool idle;
1085
1086	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1087	while (true) {
1088		idle = true;
1089		blk_mq_tagset_busy_iter(&ub->tag_set,
1090				ublk_check_inflight_rq, &idle);
1091		if (idle)
1092			break;
1093		msleep(UBLK_REQUEUE_DELAY_MS);
1094	}
1095}
1096
1097static void __ublk_quiesce_dev(struct ublk_device *ub)
1098{
1099	pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1100			__func__, ub->dev_info.dev_id,
1101			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1102			"LIVE" : "QUIESCED");
1103	blk_mq_quiesce_queue(ub->ub_disk->queue);
1104	ublk_wait_tagset_rqs_idle(ub);
1105	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1106	ublk_cancel_dev(ub);
1107	/* We are going to release the task_struct of ubq_daemon and reset
1108	 * ->ubq_daemon to NULL, so checking ubq_daemon in monitor_work would cause a UAF.
1109	 * Besides, monitor_work is not necessary in the QUIESCED state since we have
1110	 * already scheduled quiesce_work and quiesced all ubqs.
1111	 *
1112	 * Do not let monitor_work reschedule itself while the state is QUIESCED: cancel
1113	 * it here and re-schedule it in END_USER_RECOVERY to avoid the UAF.
1114	 */
1115	cancel_delayed_work_sync(&ub->monitor_work);
1116}
1117
1118static void ublk_quiesce_work_fn(struct work_struct *work)
1119{
1120	struct ublk_device *ub =
1121		container_of(work, struct ublk_device, quiesce_work);
1122
1123	mutex_lock(&ub->mutex);
1124	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
1125		goto unlock;
1126	__ublk_quiesce_dev(ub);
1127 unlock:
1128	mutex_unlock(&ub->mutex);
1129}
1130
1131static void ublk_unquiesce_dev(struct ublk_device *ub)
1132{
1133	int i;
1134
1135	pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1136			__func__, ub->dev_info.dev_id,
1137			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1138			"LIVE" : "QUIESCED");
1139	/* quiesce_work has run. We let requeued rqs be aborted
1140	 * before running fallback_wq. "force_abort" must be seen
1141	 * after the request queue is unquiesced. Then del_gendisk()
1142	 * can move on.
1143	 */
1144	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1145		ublk_get_queue(ub, i)->force_abort = true;
1146
1147	blk_mq_unquiesce_queue(ub->ub_disk->queue);
1148	/* We may have requeued some rqs while quiescing the device */
1149	blk_mq_kick_requeue_list(ub->ub_disk->queue);
1150}
1151
1152static void ublk_stop_dev(struct ublk_device *ub)
1153{
1154	mutex_lock(&ub->mutex);
1155	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1156		goto unlock;
1157	if (ublk_can_use_recovery(ub)) {
1158		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1159			__ublk_quiesce_dev(ub);
1160		ublk_unquiesce_dev(ub);
1161	}
1162	del_gendisk(ub->ub_disk);
1163	ub->dev_info.state = UBLK_S_DEV_DEAD;
1164	ub->dev_info.ublksrv_pid = -1;
1165	put_disk(ub->ub_disk);
1166	ub->ub_disk = NULL;
1167 unlock:
1168	ublk_cancel_dev(ub);
1169	mutex_unlock(&ub->mutex);
1170	cancel_delayed_work_sync(&ub->monitor_work);
1171}
1172
1173/* device can only be started after all IOs are ready */
1174static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1175{
1176	mutex_lock(&ub->mutex);
1177	ubq->nr_io_ready++;
1178	if (ublk_queue_ready(ubq)) {
1179		ubq->ubq_daemon = current;
1180		get_task_struct(ubq->ubq_daemon);
1181		ub->nr_queues_ready++;
1182	}
1183	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1184		complete_all(&ub->completion);
1185	mutex_unlock(&ub->mutex);
1186}
1187
1188static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1189		int tag)
1190{
1191	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1192	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1193
1194	ublk_queue_cmd(ubq, req);
1195}
1196
1197static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1198{
1199	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
1200	struct ublk_device *ub = cmd->file->private_data;
1201	struct ublk_queue *ubq;
1202	struct ublk_io *io;
1203	u32 cmd_op = cmd->cmd_op;
1204	unsigned tag = ub_cmd->tag;
1205	int ret = -EINVAL;
1206
1207	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1208			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
1209			ub_cmd->result);
1210
1211	if (!(issue_flags & IO_URING_F_SQE128))
1212		goto out;
1213
1214	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1215		goto out;
1216
1217	ubq = ublk_get_queue(ub, ub_cmd->q_id);
1218	if (!ubq || ub_cmd->q_id != ubq->q_id)
1219		goto out;
1220
1221	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1222		goto out;
1223
1224	if (tag >= ubq->q_depth)
1225		goto out;
1226
1227	io = &ubq->ios[tag];
1228
1229	/* there is pending io cmd, something must be wrong */
1230	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1231		ret = -EBUSY;
1232		goto out;
1233	}
1234
1235	/*
1236	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
1237	 * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
1238	 */
1239	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1240			^ (cmd_op == UBLK_IO_NEED_GET_DATA))
1241		goto out;
1242
1243	switch (cmd_op) {
1244	case UBLK_IO_FETCH_REQ:
1245		/* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
1246		if (ublk_queue_ready(ubq)) {
1247			ret = -EBUSY;
1248			goto out;
1249		}
1250		/*
1251		 * The io is being handled by server, so COMMIT_RQ is expected
1252		 * instead of FETCH_REQ
1253		 */
1254		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1255			goto out;
1256		/* FETCH_RQ has to provide IO buffer */
1257		if (!ub_cmd->addr)
1258			goto out;
1259		io->cmd = cmd;
1260		io->flags |= UBLK_IO_FLAG_ACTIVE;
1261		io->addr = ub_cmd->addr;
1262
1263		ublk_mark_io_ready(ub, ubq);
1264		break;
1265	case UBLK_IO_COMMIT_AND_FETCH_REQ:
1266		/* COMMIT_AND_FETCH_REQ has to provide the IO buffer for the next fetch */
1267		if (!ub_cmd->addr)
1268			goto out;
1269		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1270			goto out;
1271		io->addr = ub_cmd->addr;
1272		io->flags |= UBLK_IO_FLAG_ACTIVE;
1273		io->cmd = cmd;
1274		ublk_commit_completion(ub, ub_cmd);
1275		break;
1276	case UBLK_IO_NEED_GET_DATA:
1277		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1278			goto out;
1279		io->addr = ub_cmd->addr;
1280		io->cmd = cmd;
1281		io->flags |= UBLK_IO_FLAG_ACTIVE;
1282		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
1283		break;
1284	default:
1285		goto out;
1286	}
1287	return -EIOCBQUEUED;
1288
1289 out:
1290	io_uring_cmd_done(cmd, ret, 0);
1291	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1292			__func__, cmd_op, tag, ret, io->flags);
1293	return -EIOCBQUEUED;
1294}
1295
1296static const struct file_operations ublk_ch_fops = {
1297	.owner = THIS_MODULE,
1298	.open = ublk_ch_open,
1299	.release = ublk_ch_release,
1300	.llseek = no_llseek,
1301	.uring_cmd = ublk_ch_uring_cmd,
1302	.mmap = ublk_ch_mmap,
1303};
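To show how the uring_cmd path above is driven, here is a minimal userspace sketch of queueing one UBLK_IO_FETCH_REQ. It assumes a liburing ring created with IORING_SETUP_SQE128 (so each SQE slot is 128 bytes and the ublksrv_io_cmd payload fits in sqe->cmd); the helper name queue_fetch_req() and the user_data encoding are illustrative, and error handling is omitted:

#include <string.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>	/* UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd */

/* Queue one UBLK_IO_FETCH_REQ for (q_id, tag) on the /dev/ublkcN fd. */
static void queue_fetch_req(struct io_uring *ring, int char_fd,
			    __u16 q_id, __u16 tag, __u64 buf_addr)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd;

	memset(sqe, 0, 2 * sizeof(*sqe));	/* one SQE128 slot is 128 bytes */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = char_fd;
	sqe->cmd_op = UBLK_IO_FETCH_REQ;
	sqe->user_data = tag | ((__u64)q_id << 16);

	/* payload read by ublk_ch_uring_cmd() via cmd->cmd */
	cmd = (struct ublksrv_io_cmd *)sqe->cmd;
	cmd->q_id = q_id;
	cmd->tag = tag;
	cmd->addr = buf_addr;	/* per-tag data buffer in the daemon's address space */
}

The daemon prepares one such SQE per tag and then calls io_uring_submit(); each completion's cqe->res carries UBLK_IO_RES_OK or UBLK_IO_RES_NEED_GET_DATA, as produced by ubq_complete_io_cmd() above.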
1304
1305static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
1306{
1307	int size = ublk_queue_cmd_buf_size(ub, q_id);
1308	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1309
1310	if (ubq->ubq_daemon)
1311		put_task_struct(ubq->ubq_daemon);
1312	if (ubq->io_cmd_buf)
1313		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
1314}
1315
1316static int ublk_init_queue(struct ublk_device *ub, int q_id)
1317{
1318	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1319	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
1320	void *ptr;
1321	int size;
1322
1323	ubq->flags = ub->dev_info.flags;
1324	ubq->q_id = q_id;
1325	ubq->q_depth = ub->dev_info.queue_depth;
1326	size = ublk_queue_cmd_buf_size(ub, q_id);
1327
1328	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
1329	if (!ptr)
1330		return -ENOMEM;
1331
1332	ubq->io_cmd_buf = ptr;
1333	ubq->dev = ub;
1334	return 0;
1335}
1336
1337static void ublk_deinit_queues(struct ublk_device *ub)
1338{
1339	int nr_queues = ub->dev_info.nr_hw_queues;
1340	int i;
1341
1342	if (!ub->__queues)
1343		return;
1344
1345	for (i = 0; i < nr_queues; i++)
1346		ublk_deinit_queue(ub, i);
1347	kfree(ub->__queues);
1348}
1349
1350static int ublk_init_queues(struct ublk_device *ub)
1351{
1352	int nr_queues = ub->dev_info.nr_hw_queues;
1353	int depth = ub->dev_info.queue_depth;
1354	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
1355	int i, ret = -ENOMEM;
1356
1357	ub->queue_size = ubq_size;
1358	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
1359	if (!ub->__queues)
1360		return ret;
1361
1362	for (i = 0; i < nr_queues; i++) {
1363		if (ublk_init_queue(ub, i))
1364			goto fail;
1365	}
1366
1367	init_completion(&ub->completion);
1368	return 0;
1369
1370 fail:
1371	ublk_deinit_queues(ub);
1372	return ret;
1373}
1374
1375static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
1376{
1377	int i = idx;
1378	int err;
1379
1380	spin_lock(&ublk_idr_lock);
1381	/* allocate id, if @id >= 0, we're requesting that specific id */
1382	if (i >= 0) {
1383		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
1384		if (err == -ENOSPC)
1385			err = -EEXIST;
1386	} else {
1387		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
1388	}
1389	spin_unlock(&ublk_idr_lock);
1390
1391	if (err >= 0)
1392		ub->ub_number = err;
1393
1394	return err;
1395}
1396
1397static void ublk_free_dev_number(struct ublk_device *ub)
1398{
1399	spin_lock(&ublk_idr_lock);
1400	idr_remove(&ublk_index_idr, ub->ub_number);
1401	wake_up_all(&ublk_idr_wq);
1402	spin_unlock(&ublk_idr_lock);
1403}
1404
1405static void ublk_cdev_rel(struct device *dev)
1406{
1407	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
1408
1409	blk_mq_free_tag_set(&ub->tag_set);
1410	ublk_deinit_queues(ub);
1411	ublk_free_dev_number(ub);
1412	mutex_destroy(&ub->mutex);
1413	kfree(ub);
1414}
1415
1416static int ublk_add_chdev(struct ublk_device *ub)
1417{
1418	struct device *dev = &ub->cdev_dev;
1419	int minor = ub->ub_number;
1420	int ret;
1421
1422	dev->parent = ublk_misc.this_device;
1423	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
1424	dev->class = ublk_chr_class;
1425	dev->release = ublk_cdev_rel;
1426	device_initialize(dev);
1427
1428	ret = dev_set_name(dev, "ublkc%d", minor);
1429	if (ret)
1430		goto fail;
1431
1432	cdev_init(&ub->cdev, &ublk_ch_fops);
1433	ret = cdev_device_add(&ub->cdev, dev);
1434	if (ret)
1435		goto fail;
1436	return 0;
1437 fail:
1438	put_device(dev);
1439	return ret;
1440}
1441
1442static void ublk_stop_work_fn(struct work_struct *work)
1443{
1444	struct ublk_device *ub =
1445		container_of(work, struct ublk_device, stop_work);
1446
1447	ublk_stop_dev(ub);
1448}
1449
1450/* align max io buffer size with PAGE_SIZE */
1451static void ublk_align_max_io_size(struct ublk_device *ub)
1452{
1453	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
1454
1455	ub->dev_info.max_io_buf_bytes =
1456		round_down(max_io_bytes, PAGE_SIZE);
1457}
1458
1459static int ublk_add_tag_set(struct ublk_device *ub)
1460{
1461	ub->tag_set.ops = &ublk_mq_ops;
1462	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
1463	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
1464	ub->tag_set.numa_node = NUMA_NO_NODE;
1465	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
1466	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1467	ub->tag_set.driver_data = ub;
1468	return blk_mq_alloc_tag_set(&ub->tag_set);
1469}
1470
1471static void ublk_remove(struct ublk_device *ub)
1472{
1473	ublk_stop_dev(ub);
1474	cancel_work_sync(&ub->stop_work);
1475	cancel_work_sync(&ub->quiesce_work);
1476	cdev_device_del(&ub->cdev, &ub->cdev_dev);
1477	put_device(&ub->cdev_dev);
1478}
1479
1480static struct ublk_device *ublk_get_device_from_id(int idx)
1481{
1482	struct ublk_device *ub = NULL;
1483
1484	if (idx < 0)
1485		return NULL;
1486
1487	spin_lock(&ublk_idr_lock);
1488	ub = idr_find(&ublk_index_idr, idx);
1489	if (ub)
1490		ub = ublk_get_device(ub);
1491	spin_unlock(&ublk_idr_lock);
1492
1493	return ub;
1494}
1495
1496static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
1497{
1498	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1499	int ublksrv_pid = (int)header->data[0];
1500	struct ublk_device *ub;
1501	struct gendisk *disk;
1502	int ret = -EINVAL;
1503
1504	if (ublksrv_pid <= 0)
1505		return -EINVAL;
1506
1507	ub = ublk_get_device_from_id(header->dev_id);
1508	if (!ub)
1509		return -EINVAL;
1510
1511	wait_for_completion_interruptible(&ub->completion);
1512
1513	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1514
1515	mutex_lock(&ub->mutex);
1516	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
1517	    test_bit(UB_STATE_USED, &ub->state)) {
1518		ret = -EEXIST;
1519		goto out_unlock;
1520	}
1521
1522	disk = blk_mq_alloc_disk(&ub->tag_set, ub);
1523	if (IS_ERR(disk)) {
1524		ret = PTR_ERR(disk);
1525		goto out_unlock;
1526	}
1527	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
1528	disk->fops = &ub_fops;
1529	disk->private_data = ub;
1530
1531	ub->dev_info.ublksrv_pid = ublksrv_pid;
1532	ub->ub_disk = disk;
1533
1534	ret = ublk_apply_params(ub);
1535	if (ret)
1536		goto out_put_disk;
1537
1538	get_device(&ub->cdev_dev);
1539	ret = add_disk(disk);
1540	if (ret) {
1541		/*
1542		 * We have to drop the reference since ->free_disk won't be
1543		 * called in case of add_disk() failure.
1544		 */
1545		ublk_put_device(ub);
1546		goto out_put_disk;
1547	}
1548	set_bit(UB_STATE_USED, &ub->state);
1549	ub->dev_info.state = UBLK_S_DEV_LIVE;
1550out_put_disk:
1551	if (ret)
1552		put_disk(disk);
1553out_unlock:
1554	mutex_unlock(&ub->mutex);
1555	ublk_put_device(ub);
1556	return ret;
1557}
1558
1559static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
1560{
1561	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1562	void __user *argp = (void __user *)(unsigned long)header->addr;
1563	struct ublk_device *ub;
1564	cpumask_var_t cpumask;
1565	unsigned long queue;
1566	unsigned int retlen;
1567	unsigned int i;
1568	int ret = -EINVAL;
1569	
1570	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
1571		return -EINVAL;
1572	if (header->len & (sizeof(unsigned long)-1))
1573		return -EINVAL;
1574	if (!header->addr)
1575		return -EINVAL;
1576
1577	ub = ublk_get_device_from_id(header->dev_id);
1578	if (!ub)
1579		return -EINVAL;
1580
1581	queue = header->data[0];
1582	if (queue >= ub->dev_info.nr_hw_queues)
1583		goto out_put_device;
1584
1585	ret = -ENOMEM;
1586	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
1587		goto out_put_device;
1588
1589	for_each_possible_cpu(i) {
1590		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
1591			cpumask_set_cpu(i, cpumask);
1592	}
1593
1594	ret = -EFAULT;
1595	retlen = min_t(unsigned short, header->len, cpumask_size());
1596	if (copy_to_user(argp, cpumask, retlen))
1597		goto out_free_cpumask;
1598	if (retlen != header->len &&
1599	    clear_user(argp + retlen, header->len - retlen))
1600		goto out_free_cpumask;
1601
1602	ret = 0;
1603out_free_cpumask:
1604	free_cpumask_var(cpumask);
1605out_put_device:
1606	ublk_put_device(ub);
1607	return ret;
1608}
1609
1610static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
1611{
1612	pr_devel("%s: dev id %d flags %llx\n", __func__,
1613			info->dev_id, info->flags);
1614	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
1615			info->nr_hw_queues, info->queue_depth);
1616}
1617
1618static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
1619{
1620	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1621	void __user *argp = (void __user *)(unsigned long)header->addr;
1622	struct ublksrv_ctrl_dev_info info;
1623	struct ublk_device *ub;
1624	int ret = -EINVAL;
1625
1626	if (header->len < sizeof(info) || !header->addr)
1627		return -EINVAL;
1628	if (header->queue_id != (u16)-1) {
1629		pr_warn("%s: queue_id is wrong %x\n",
1630			__func__, header->queue_id);
1631		return -EINVAL;
1632	}
1633	if (copy_from_user(&info, argp, sizeof(info)))
1634		return -EFAULT;
1635	ublk_dump_dev_info(&info);
1636	if (header->dev_id != info.dev_id) {
1637		pr_warn("%s: dev id not match %u %u\n",
1638			__func__, header->dev_id, info.dev_id);
1639		return -EINVAL;
1640	}
1641
1642	ret = mutex_lock_killable(&ublk_ctl_mutex);
1643	if (ret)
1644		return ret;
1645
1646	ret = -ENOMEM;
1647	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
1648	if (!ub)
1649		goto out_unlock;
1650	mutex_init(&ub->mutex);
1651	spin_lock_init(&ub->mm_lock);
1652	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
1653	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
1654	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
1655
1656	ret = ublk_alloc_dev_number(ub, header->dev_id);
1657	if (ret < 0)
1658		goto out_free_ub;
1659
1660	memcpy(&ub->dev_info, &info, sizeof(info));
1661
1662	/* update device id */
1663	ub->dev_info.dev_id = ub->ub_number;
1664
1665	/*
1666	 * The 64-bit flags will be copied back to userspace as the feature
1667	 * negotiation result, so we have to clear flags which the driver
1668	 * doesn't support yet, so that userspace gets the correct flags
1669	 * (features) to handle.
1670	 */
1671	ub->dev_info.flags &= UBLK_F_ALL;
1672
1673	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
1674		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
1675
1676	/* We are not ready to support zero copy */
1677	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
1678
1679	ub->dev_info.nr_hw_queues = min_t(unsigned int,
1680			ub->dev_info.nr_hw_queues, nr_cpu_ids);
1681	ublk_align_max_io_size(ub);
1682
1683	ret = ublk_init_queues(ub);
1684	if (ret)
1685		goto out_free_dev_number;
1686
1687	ret = ublk_add_tag_set(ub);
1688	if (ret)
1689		goto out_deinit_queues;
1690
1691	ret = -EFAULT;
1692	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
1693		goto out_free_tag_set;
1694
1695	/*
1696	 * Add the char dev so that ublksrv daemon can be setup.
1697	 * ublk_add_chdev() will cleanup everything if it fails.
1698	 */
1699	ret = ublk_add_chdev(ub);
1700	goto out_unlock;
1701
1702out_free_tag_set:
1703	blk_mq_free_tag_set(&ub->tag_set);
1704out_deinit_queues:
1705	ublk_deinit_queues(ub);
1706out_free_dev_number:
1707	ublk_free_dev_number(ub);
1708out_free_ub:
1709	mutex_destroy(&ub->mutex);
1710	kfree(ub);
1711out_unlock:
1712	mutex_unlock(&ublk_ctl_mutex);
1713	return ret;
1714}
1715
1716static inline bool ublk_idr_freed(int id)
1717{
1718	void *ptr;
1719
1720	spin_lock(&ublk_idr_lock);
1721	ptr = idr_find(&ublk_index_idr, id);
1722	spin_unlock(&ublk_idr_lock);
1723
1724	return ptr == NULL;
1725}
1726
1727static int ublk_ctrl_del_dev(int idx)
1728{
1729	struct ublk_device *ub;
1730	int ret;
1731
1732	ret = mutex_lock_killable(&ublk_ctl_mutex);
1733	if (ret)
1734		return ret;
1735
1736	ub = ublk_get_device_from_id(idx);
1737	if (ub) {
1738		ublk_remove(ub);
1739		ublk_put_device(ub);
1740		ret = 0;
1741	} else {
1742		ret = -ENODEV;
1743	}
1744
1745	/*
1746	 * Wait until the idr entry is removed, so the index can be reused after
1747	 * the DEL_DEV command returns.
1748	 */
1749	if (!ret)
1750		wait_event(ublk_idr_wq, ublk_idr_freed(idx));
1751	mutex_unlock(&ublk_ctl_mutex);
1752
1753	return ret;
1754}
1755
1756static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
1757{
1758	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1759
1760	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
1761			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
1762			header->data[0], header->addr, header->len);
1763}
1764
1765static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
1766{
1767	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1768	struct ublk_device *ub;
1769
1770	ub = ublk_get_device_from_id(header->dev_id);
1771	if (!ub)
1772		return -EINVAL;
1773
1774	ublk_stop_dev(ub);
1775	cancel_work_sync(&ub->stop_work);
1776	cancel_work_sync(&ub->quiesce_work);
1777
1778	ublk_put_device(ub);
1779	return 0;
1780}
1781
1782static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
1783{
1784	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1785	void __user *argp = (void __user *)(unsigned long)header->addr;
1786	struct ublk_device *ub;
1787	int ret = 0;
1788
1789	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
1790		return -EINVAL;
1791
1792	ub = ublk_get_device_from_id(header->dev_id);
1793	if (!ub)
1794		return -EINVAL;
1795
1796	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
1797		ret = -EFAULT;
1798	ublk_put_device(ub);
1799
1800	return ret;
1801}
1802
1803static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
1804{
1805	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1806	void __user *argp = (void __user *)(unsigned long)header->addr;
1807	struct ublk_params_header ph;
1808	struct ublk_device *ub;
1809	int ret;
1810
1811	if (header->len <= sizeof(ph) || !header->addr)
1812		return -EINVAL;
1813
1814	if (copy_from_user(&ph, argp, sizeof(ph)))
1815		return -EFAULT;
1816
1817	if (ph.len > header->len || !ph.len)
1818		return -EINVAL;
1819
1820	if (ph.len > sizeof(struct ublk_params))
1821		ph.len = sizeof(struct ublk_params);
1822
1823	ub = ublk_get_device_from_id(header->dev_id);
1824	if (!ub)
1825		return -EINVAL;
1826
1827	mutex_lock(&ub->mutex);
1828	if (copy_to_user(argp, &ub->params, ph.len))
1829		ret = -EFAULT;
1830	else
1831		ret = 0;
1832	mutex_unlock(&ub->mutex);
1833
1834	ublk_put_device(ub);
1835	return ret;
1836}
1837
1838static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
1839{
1840	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1841	void __user *argp = (void __user *)(unsigned long)header->addr;
1842	struct ublk_params_header ph;
1843	struct ublk_device *ub;
1844	int ret = -EFAULT;
1845
1846	if (header->len <= sizeof(ph) || !header->addr)
1847		return -EINVAL;
1848
1849	if (copy_from_user(&ph, argp, sizeof(ph)))
1850		return -EFAULT;
1851
1852	if (ph.len > header->len || !ph.len || !ph.types)
1853		return -EINVAL;
1854
1855	if (ph.len > sizeof(struct ublk_params))
1856		ph.len = sizeof(struct ublk_params);
1857
1858	ub = ublk_get_device_from_id(header->dev_id);
1859	if (!ub)
1860		return -EINVAL;
1861
1862	/* parameters can only be changed when device isn't live */
1863	mutex_lock(&ub->mutex);
1864	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
1865		ret = -EACCES;
1866	} else if (copy_from_user(&ub->params, argp, ph.len)) {
1867		ret = -EFAULT;
1868	} else {
1869		/* clear all we don't support yet */
1870		ub->params.types &= UBLK_PARAM_TYPE_ALL;
1871		ret = ublk_validate_params(ub);
1872	}
1873	mutex_unlock(&ub->mutex);
1874	ublk_put_device(ub);
1875
1876	return ret;
1877}
1878
1879static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1880{
1881	int i;
1882
1883	WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
1884	/* All old ioucmds have to be completed */
1885	WARN_ON_ONCE(ubq->nr_io_ready);
1886	/* old daemon is PF_EXITING, put it now */
1887	put_task_struct(ubq->ubq_daemon);
1888	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
1889	ubq->ubq_daemon = NULL;
1890
1891	for (i = 0; i < ubq->q_depth; i++) {
1892		struct ublk_io *io = &ubq->ios[i];
1893
1894		/* forget everything now and be ready for new FETCH_REQ */
1895		io->flags = 0;
1896		io->cmd = NULL;
1897		io->addr = 0;
1898	}
1899}
1900
1901static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd)
1902{
1903	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1904	struct ublk_device *ub;
1905	int ret = -EINVAL;
1906	int i;
1907
1908	ub = ublk_get_device_from_id(header->dev_id);
1909	if (!ub)
1910		return ret;
1911
1912	mutex_lock(&ub->mutex);
1913	if (!ublk_can_use_recovery(ub))
1914		goto out_unlock;
1915	/*
1916	 * START_RECOVERY is only allowed after:
1917	 *
1918	 * (1) UB_STATE_OPEN is not set, which means the dying process has exited
1919	 *     and the related io_uring ctx is freed, so the file struct of
1920	 *     /dev/ublkcX is released.
1921	 *
1922	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
1923	 *     (a) has quiesced the request queue
1924	 *     (b) has requeued every inflight rq whose io_flags is ACTIVE
1925	 *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
1926	 *     (d) has completed/canceled all ioucmds owned by the dying process
1927	 */
1928	if (test_bit(UB_STATE_OPEN, &ub->state) ||
1929			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
1930		ret = -EBUSY;
1931		goto out_unlock;
1932	}
1933	pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
1934	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1935		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1936	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
1937	ub->mm = NULL;
1938	ub->nr_queues_ready = 0;
1939	init_completion(&ub->completion);
1940	ret = 0;
1941 out_unlock:
1942	mutex_unlock(&ub->mutex);
1943	ublk_put_device(ub);
1944	return ret;
1945}
1946
1947static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
1948{
1949	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1950	int ublksrv_pid = (int)header->data[0];
1951	struct ublk_device *ub;
1952	int ret = -EINVAL;
1953
1954	ub = ublk_get_device_from_id(header->dev_id);
1955	if (!ub)
1956		return ret;
1957
1958	pr_devel("%s: Waiting for new ubq_daemons (nr: %d) to be ready, dev id %d...\n",
1959			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
1960	/* wait until the new ubq_daemons have sent all FETCH_REQs */
1961	wait_for_completion_interruptible(&ub->completion);
1962	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
1963			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
1964
1965	mutex_lock(&ub->mutex);
1966	if (!ublk_can_use_recovery(ub))
1967		goto out_unlock;
1968
1969	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
1970		ret = -EBUSY;
1971		goto out_unlock;
1972	}
1973	ub->dev_info.ublksrv_pid = ublksrv_pid;
1974	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
1975			__func__, ublksrv_pid, header->dev_id);
1976	blk_mq_unquiesce_queue(ub->ub_disk->queue);
1977	pr_devel("%s: queue unquiesced, dev id %d.\n",
1978			__func__, header->dev_id);
1979	blk_mq_kick_requeue_list(ub->ub_disk->queue);
1980	ub->dev_info.state = UBLK_S_DEV_LIVE;
1981	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1982	ret = 0;
1983 out_unlock:
1984	mutex_unlock(&ub->mutex);
1985	ublk_put_device(ub);
1986	return ret;
1987}
1988
1989static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
1990		unsigned int issue_flags)
1991{
1992	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1993	int ret = -EINVAL;
1994
1995	if (issue_flags & IO_URING_F_NONBLOCK)
1996		return -EAGAIN;
1997
1998	ublk_ctrl_cmd_dump(cmd);
1999
2000	if (!(issue_flags & IO_URING_F_SQE128))
2001		goto out;
2002
2003	ret = -EPERM;
2004	if (!capable(CAP_SYS_ADMIN))
2005		goto out;
2006
2007	ret = -ENODEV;
2008	switch (cmd->cmd_op) {
2009	case UBLK_CMD_START_DEV:
2010		ret = ublk_ctrl_start_dev(cmd);
2011		break;
2012	case UBLK_CMD_STOP_DEV:
2013		ret = ublk_ctrl_stop_dev(cmd);
2014		break;
2015	case UBLK_CMD_GET_DEV_INFO:
2016		ret = ublk_ctrl_get_dev_info(cmd);
2017		break;
2018	case UBLK_CMD_ADD_DEV:
2019		ret = ublk_ctrl_add_dev(cmd);
2020		break;
2021	case UBLK_CMD_DEL_DEV:
2022		ret = ublk_ctrl_del_dev(header->dev_id);
2023		break;
2024	case UBLK_CMD_GET_QUEUE_AFFINITY:
2025		ret = ublk_ctrl_get_queue_affinity(cmd);
2026		break;
2027	case UBLK_CMD_GET_PARAMS:
2028		ret = ublk_ctrl_get_params(cmd);
2029		break;
2030	case UBLK_CMD_SET_PARAMS:
2031		ret = ublk_ctrl_set_params(cmd);
2032		break;
2033	case UBLK_CMD_START_USER_RECOVERY:
2034		ret = ublk_ctrl_start_recovery(cmd);
2035		break;
2036	case UBLK_CMD_END_USER_RECOVERY:
2037		ret = ublk_ctrl_end_recovery(cmd);
2038		break;
2039	default:
2040		break;
2041	}
2042 out:
2043	io_uring_cmd_done(cmd, ret, 0);
2044	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
2045			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
2046	return -EIOCBQUEUED;
2047}
2048
2049static const struct file_operations ublk_ctl_fops = {
2050	.open		= nonseekable_open,
2051	.uring_cmd      = ublk_ctrl_uring_cmd,
2052	.owner		= THIS_MODULE,
2053	.llseek		= noop_llseek,
2054};
2055
2056static struct miscdevice ublk_misc = {
2057	.minor		= MISC_DYNAMIC_MINOR,
2058	.name		= "ublk-control",
2059	.fops		= &ublk_ctl_fops,
2060};
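The ublk-control node above feeds ublk_ctrl_uring_cmd() above. As an illustration of the control flow its handlers expect (ADD_DEV, then SET_PARAMS, then START_DEV once every queue has fetched), here is a minimal sketch of issuing one control command; it assumes a liburing ring created with IORING_SETUP_SQE128, and the helper name ublk_ctrl_cmd() and its synchronous submit-and-wait style are illustrative only:

#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>	/* UBLK_CMD_*, struct ublksrv_ctrl_cmd */

/* Send one control command on the /dev/ublk-control fd and wait for its cqe. */
static int ublk_ctrl_cmd(struct io_uring *ring, int ctrl_fd, __u32 cmd_op,
			 __u32 dev_id, void *buf, __u16 len, __u64 data0)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_ctrl_cmd *hdr;
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EAGAIN;
	memset(sqe, 0, 2 * sizeof(*sqe));	/* one SQE128 slot is 128 bytes */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = ctrl_fd;
	sqe->cmd_op = cmd_op;

	hdr = (struct ublksrv_ctrl_cmd *)sqe->cmd;
	hdr->dev_id = dev_id;
	hdr->queue_id = (__u16)-1;	/* required by ublk_ctrl_add_dev() */
	hdr->addr = (__u64)(uintptr_t)buf;
	hdr->len = len;
	hdr->data[0] = data0;		/* e.g. the daemon pid for UBLK_CMD_START_DEV */

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

A daemon would call this with UBLK_CMD_ADD_DEV and a struct ublksrv_ctrl_dev_info buffer, then UBLK_CMD_SET_PARAMS with a struct ublk_params whose len/types header is filled in, start its per-queue FETCH_REQ loops on /dev/ublkcN, and finally UBLK_CMD_START_DEV with data0 set to the daemon's pid.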
2061
2062static int __init ublk_init(void)
2063{
2064	int ret;
2065
2066	init_waitqueue_head(&ublk_idr_wq);
2067
2068	ret = misc_register(&ublk_misc);
2069	if (ret)
2070		return ret;
2071
2072	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
2073	if (ret)
2074		goto unregister_mis;
2075
2076	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
2077	if (IS_ERR(ublk_chr_class)) {
2078		ret = PTR_ERR(ublk_chr_class);
2079		goto free_chrdev_region;
2080	}
2081	return 0;
2082
2083free_chrdev_region:
2084	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2085unregister_mis:
2086	misc_deregister(&ublk_misc);
2087	return ret;
2088}
2089
2090static void __exit ublk_exit(void)
2091{
2092	struct ublk_device *ub;
2093	int id;
2094
2095	idr_for_each_entry(&ublk_index_idr, ub, id)
2096		ublk_remove(ub);
2097
2098	class_destroy(ublk_chr_class);
2099	misc_deregister(&ublk_misc);
2100
2101	idr_destroy(&ublk_index_idr);
2102	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2103}
2104
2105module_init(ublk_init);
2106module_exit(ublk_exit);
2107
2108MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
2109MODULE_LICENSE("GPL");