   1/*
   2 * NVM Express device driver
   3 * Copyright (c) 2011-2014, Intel Corporation.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14
  15#include <linux/blkdev.h>
  16#include <linux/blk-mq.h>
  17#include <linux/delay.h>
  18#include <linux/errno.h>
  19#include <linux/hdreg.h>
  20#include <linux/kernel.h>
  21#include <linux/module.h>
  22#include <linux/list_sort.h>
  23#include <linux/slab.h>
  24#include <linux/types.h>
  25#include <linux/pr.h>
  26#include <linux/ptrace.h>
  27#include <linux/nvme_ioctl.h>
  28#include <linux/t10-pi.h>
  29#include <scsi/sg.h>
  30#include <asm/unaligned.h>
  31
  32#include "nvme.h"
  33
  34#define NVME_MINORS		(1U << MINORBITS)
  35
  36unsigned char admin_timeout = 60;
  37module_param(admin_timeout, byte, 0644);
  38MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  39EXPORT_SYMBOL_GPL(admin_timeout);
  40
  41unsigned char nvme_io_timeout = 30;
  42module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
  43MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  44EXPORT_SYMBOL_GPL(nvme_io_timeout);
  45
  46unsigned char shutdown_timeout = 5;
  47module_param(shutdown_timeout, byte, 0644);
  48MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  49
  50static int nvme_major;
  51module_param(nvme_major, int, 0);
  52
  53static int nvme_char_major;
  54module_param(nvme_char_major, int, 0);
  55
  56static LIST_HEAD(nvme_ctrl_list);
  57static DEFINE_SPINLOCK(dev_list_lock);
  58
  59static struct class *nvme_class;
  60
  61static void nvme_free_ns(struct kref *kref)
  62{
  63	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
  64
  65	if (ns->type == NVME_NS_LIGHTNVM)
  66		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
  67
  68	spin_lock(&dev_list_lock);
  69	ns->disk->private_data = NULL;
  70	spin_unlock(&dev_list_lock);
  71
  72	put_disk(ns->disk);
  73	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
  74	nvme_put_ctrl(ns->ctrl);
  75	kfree(ns);
  76}
  77
  78static void nvme_put_ns(struct nvme_ns *ns)
  79{
  80	kref_put(&ns->kref, nvme_free_ns);
  81}
  82
  83static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
  84{
  85	struct nvme_ns *ns;
  86
  87	spin_lock(&dev_list_lock);
  88	ns = disk->private_data;
  89	if (ns) {
  90		if (!kref_get_unless_zero(&ns->kref))
  91			goto fail;
  92		if (!try_module_get(ns->ctrl->ops->module))
  93			goto fail_put_ns;
  94	}
  95	spin_unlock(&dev_list_lock);
  96
  97	return ns;
  98
  99fail_put_ns:
 100	kref_put(&ns->kref, nvme_free_ns);
 101fail:
 102	spin_unlock(&dev_list_lock);
 103	return NULL;
 104}
 105
 106void nvme_requeue_req(struct request *req)
 107{
 108	unsigned long flags;
 109
 110	blk_mq_requeue_request(req);
 111	spin_lock_irqsave(req->q->queue_lock, flags);
 112	if (!blk_queue_stopped(req->q))
 113		blk_mq_kick_requeue_list(req->q);
 114	spin_unlock_irqrestore(req->q->queue_lock, flags);
 115}
 116EXPORT_SYMBOL_GPL(nvme_requeue_req);
 117
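/*
 * Allocate a blk-mq request that carries an NVMe command as a driver
 * private payload.  Bit 0 of the opcode indicates a host-to-controller
 * data transfer, so odd opcodes are allocated as writes.
 */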
 118struct request *nvme_alloc_request(struct request_queue *q,
 119		struct nvme_command *cmd, unsigned int flags)
 120{
 121	bool write = cmd->common.opcode & 1;
 122	struct request *req;
 123
 124	req = blk_mq_alloc_request(q, write, flags);
 125	if (IS_ERR(req))
 126		return req;
 127
 128	req->cmd_type = REQ_TYPE_DRV_PRIV;
 129	req->cmd_flags |= REQ_FAILFAST_DRIVER;
 130	req->__data_len = 0;
 131	req->__sector = (sector_t) -1;
 132	req->bio = req->biotail = NULL;
 133
 134	req->cmd = (unsigned char *)cmd;
 135	req->cmd_len = sizeof(struct nvme_command);
 136
 137	return req;
 138}
 139EXPORT_SYMBOL_GPL(nvme_alloc_request);
 140
 141/*
 142 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 143 * if the result is positive, it's an NVM Express status code.
 144 */
 145int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 146		struct nvme_completion *cqe, void *buffer, unsigned bufflen,
 147		unsigned timeout)
 148{
 149	struct request *req;
 150	int ret;
 151
 152	req = nvme_alloc_request(q, cmd, 0);
 153	if (IS_ERR(req))
 154		return PTR_ERR(req);
 155
 156	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 157	req->special = cqe;
 158
 159	if (buffer && bufflen) {
 160		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
 161		if (ret)
 162			goto out;
 163	}
 164
 165	blk_execute_rq(req->q, NULL, req, 0);
 166	ret = req->errors;
 167 out:
 168	blk_mq_free_request(req);
 169	return ret;
 170}
 171
 172int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 173		void *buffer, unsigned bufflen)
 174{
 175	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0);
 176}
 177EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 178
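/*
 * Synchronously execute a command with its data buffer mapped in from user
 * space.  An optional metadata buffer is bounced through a kernel allocation
 * and attached to the bio as an integrity payload, then copied back to user
 * space for reads once the command completes.
 */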
 179int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 180		void __user *ubuffer, unsigned bufflen,
 181		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
 182		u32 *result, unsigned timeout)
 183{
 184	bool write = cmd->common.opcode & 1;
 185	struct nvme_completion cqe;
 186	struct nvme_ns *ns = q->queuedata;
 187	struct gendisk *disk = ns ? ns->disk : NULL;
 188	struct request *req;
 189	struct bio *bio = NULL;
 190	void *meta = NULL;
 191	int ret;
 192
 193	req = nvme_alloc_request(q, cmd, 0);
 194	if (IS_ERR(req))
 195		return PTR_ERR(req);
 196
 197	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 198	req->special = &cqe;
 199
 200	if (ubuffer && bufflen) {
 201		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
 202				GFP_KERNEL);
 203		if (ret)
 204			goto out;
 205		bio = req->bio;
 206
 207		if (!disk)
 208			goto submit;
 209		bio->bi_bdev = bdget_disk(disk, 0);
 210		if (!bio->bi_bdev) {
 211			ret = -ENODEV;
 212			goto out_unmap;
 213		}
 214
 215		if (meta_buffer && meta_len) {
 216			struct bio_integrity_payload *bip;
 217
 218			meta = kmalloc(meta_len, GFP_KERNEL);
 219			if (!meta) {
 220				ret = -ENOMEM;
 221				goto out_unmap;
 222			}
 223
 224			if (write) {
 225				if (copy_from_user(meta, meta_buffer,
 226						meta_len)) {
 227					ret = -EFAULT;
 228					goto out_free_meta;
 229				}
 230			}
 231
 232			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
 233			if (IS_ERR(bip)) {
 234				ret = PTR_ERR(bip);
 235				goto out_free_meta;
 236			}
 237
 238			bip->bip_iter.bi_size = meta_len;
 239			bip->bip_iter.bi_sector = meta_seed;
 240
 241			ret = bio_integrity_add_page(bio, virt_to_page(meta),
 242					meta_len, offset_in_page(meta));
 243			if (ret != meta_len) {
 244				ret = -ENOMEM;
 245				goto out_free_meta;
 246			}
 247		}
 248	}
 249 submit:
 250	blk_execute_rq(req->q, disk, req, 0);
 251	ret = req->errors;
 252	if (result)
 253		*result = le32_to_cpu(cqe.result);
 254	if (meta && !ret && !write) {
 255		if (copy_to_user(meta_buffer, meta, meta_len))
 256			ret = -EFAULT;
 257	}
 258 out_free_meta:
 259	kfree(meta);
 260 out_unmap:
 261	if (bio) {
 262		if (disk && bio->bi_bdev)
 263			bdput(bio->bi_bdev);
 264		blk_rq_unmap_user(bio);
 265	}
 266 out:
 267	blk_mq_free_request(req);
 268	return ret;
 269}
 270
 271int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 272		void __user *ubuffer, unsigned bufflen, u32 *result,
 273		unsigned timeout)
 274{
 275	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
 276			result, timeout);
 277}
 278
 279int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 280{
 281	struct nvme_command c = { };
 282	int error;
 283
 284	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 285	c.identify.opcode = nvme_admin_identify;
 286	c.identify.cns = cpu_to_le32(1);
 287
 288	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
 289	if (!*id)
 290		return -ENOMEM;
 291
 292	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 293			sizeof(struct nvme_id_ctrl));
 294	if (error)
 295		kfree(*id);
 296	return error;
 297}
 298
 299static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
 300{
 301	struct nvme_command c = { };
 302
 303	c.identify.opcode = nvme_admin_identify;
 304	c.identify.cns = cpu_to_le32(2);
 305	c.identify.nsid = cpu_to_le32(nsid);
 306	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
 307}
 308
 309int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
 310		struct nvme_id_ns **id)
 311{
 312	struct nvme_command c = { };
 313	int error;
 314
 315	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 316	c.identify.opcode = nvme_admin_identify;
 317	c.identify.nsid = cpu_to_le32(nsid);
 318
 319	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
 320	if (!*id)
 321		return -ENOMEM;
 322
 323	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 324			sizeof(struct nvme_id_ns));
 325	if (error)
 326		kfree(*id);
 327	return error;
 328}
 329
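/*
 * Read a feature value with the Get Features admin command.  The value
 * returned in completion dword 0 is passed back through *result; dma_addr,
 * if used, addresses an additional data buffer via PRP1.
 */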
 330int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
 331					dma_addr_t dma_addr, u32 *result)
 332{
 333	struct nvme_command c;
 334	struct nvme_completion cqe;
 335	int ret;
 336
 337	memset(&c, 0, sizeof(c));
 338	c.features.opcode = nvme_admin_get_features;
 339	c.features.nsid = cpu_to_le32(nsid);
 340	c.features.prp1 = cpu_to_le64(dma_addr);
 341	c.features.fid = cpu_to_le32(fid);
 342
 343	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
 344	if (ret >= 0)
 345		*result = le32_to_cpu(cqe.result);
 346	return ret;
 347}
 348
 349int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
 350					dma_addr_t dma_addr, u32 *result)
 351{
 352	struct nvme_command c;
 353	struct nvme_completion cqe;
 354	int ret;
 355
 356	memset(&c, 0, sizeof(c));
 357	c.features.opcode = nvme_admin_set_features;
 358	c.features.prp1 = cpu_to_le64(dma_addr);
 359	c.features.fid = cpu_to_le32(fid);
 360	c.features.dword11 = cpu_to_le32(dword11);
 361
 362	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
 363	if (ret >= 0)
 364		*result = le32_to_cpu(cqe.result);
 365	return ret;
 366}
 367
 368int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
 369{
 370	struct nvme_command c = { };
 371	int error;
 372
 373	c.common.opcode = nvme_admin_get_log_page;
 374	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
 375	c.common.cdw10[0] = cpu_to_le32(
 376			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
 377			 NVME_LOG_SMART);
 378
 379	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
 380	if (!*log)
 381		return -ENOMEM;
 382
 383	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
 384			sizeof(struct nvme_smart_log));
 385	if (error)
 386		kfree(*log);
 387	return error;
 388}
 389
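/*
 * Request *count I/O queues via the Number of Queues feature.  The
 * submission and completion queue counts are packed zero-based into the
 * low and high 16 bits; on return *count is clamped to what the
 * controller actually allocated.
 */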
 390int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
 391{
 392	u32 q_count = (*count - 1) | ((*count - 1) << 16);
 393	u32 result;
 394	int status, nr_io_queues;
 395
 396	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
 397			&result);
 398	if (status)
 399		return status;
 400
 401	nr_io_queues = min(result & 0xffff, result >> 16) + 1;
 402	*count = min(*count, nr_io_queues);
 403	return 0;
 404}
 405EXPORT_SYMBOL_GPL(nvme_set_queue_count);
 406
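/*
 * Handle NVME_IOCTL_SUBMIT_IO: translate a struct nvme_user_io into a
 * read/write/compare command and submit it together with any separate
 * per-block metadata buffer.
 */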
 407static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 408{
 409	struct nvme_user_io io;
 410	struct nvme_command c;
 411	unsigned length, meta_len;
 412	void __user *metadata;
 413
 414	if (copy_from_user(&io, uio, sizeof(io)))
 415		return -EFAULT;
 416	if (io.flags)
 417		return -EINVAL;
 418
 419	switch (io.opcode) {
 420	case nvme_cmd_write:
 421	case nvme_cmd_read:
 422	case nvme_cmd_compare:
 423		break;
 424	default:
 425		return -EINVAL;
 426	}
 427
 428	length = (io.nblocks + 1) << ns->lba_shift;
 429	meta_len = (io.nblocks + 1) * ns->ms;
 430	metadata = (void __user *)(uintptr_t)io.metadata;
 431
 432	if (ns->ext) {
 433		length += meta_len;
 434		meta_len = 0;
 435	} else if (meta_len) {
 436		if ((io.metadata & 3) || !io.metadata)
 437			return -EINVAL;
 438	}
 439
 440	memset(&c, 0, sizeof(c));
 441	c.rw.opcode = io.opcode;
 442	c.rw.flags = io.flags;
 443	c.rw.nsid = cpu_to_le32(ns->ns_id);
 444	c.rw.slba = cpu_to_le64(io.slba);
 445	c.rw.length = cpu_to_le16(io.nblocks);
 446	c.rw.control = cpu_to_le16(io.control);
 447	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
 448	c.rw.reftag = cpu_to_le32(io.reftag);
 449	c.rw.apptag = cpu_to_le16(io.apptag);
 450	c.rw.appmask = cpu_to_le16(io.appmask);
 451
 452	return __nvme_submit_user_cmd(ns->queue, &c,
 453			(void __user *)(uintptr_t)io.addr, length,
 454			metadata, meta_len, io.slba, NULL, 0);
 455}
 456
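/*
 * Handle the admin and I/O passthrough ioctls: copy in a struct
 * nvme_passthru_cmd, rebuild it as a struct nvme_command and execute it on
 * either the admin queue or the namespace's I/O queue.
 */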
 457static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 458			struct nvme_passthru_cmd __user *ucmd)
 459{
 460	struct nvme_passthru_cmd cmd;
 461	struct nvme_command c;
 462	unsigned timeout = 0;
 463	int status;
 464
 465	if (!capable(CAP_SYS_ADMIN))
 466		return -EACCES;
 467	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
 468		return -EFAULT;
 469	if (cmd.flags)
 470		return -EINVAL;
 471
 472	memset(&c, 0, sizeof(c));
 473	c.common.opcode = cmd.opcode;
 474	c.common.flags = cmd.flags;
 475	c.common.nsid = cpu_to_le32(cmd.nsid);
 476	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
 477	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
 478	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
 479	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
 480	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
 481	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
 482	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
 483	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
 484
 485	if (cmd.timeout_ms)
 486		timeout = msecs_to_jiffies(cmd.timeout_ms);
 487
 488	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 489			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
 490			&cmd.result, timeout);
 491	if (status >= 0) {
 492		if (put_user(cmd.result, &ucmd->result))
 493			return -EFAULT;
 494	}
 495
 496	return status;
 497}
 498
 499static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 500		unsigned int cmd, unsigned long arg)
 501{
 502	struct nvme_ns *ns = bdev->bd_disk->private_data;
 503
 504	switch (cmd) {
 505	case NVME_IOCTL_ID:
 506		force_successful_syscall_return();
 507		return ns->ns_id;
 508	case NVME_IOCTL_ADMIN_CMD:
 509		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
 510	case NVME_IOCTL_IO_CMD:
 511		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
 512	case NVME_IOCTL_SUBMIT_IO:
 513		return nvme_submit_io(ns, (void __user *)arg);
 514#ifdef CONFIG_BLK_DEV_NVME_SCSI
 515	case SG_GET_VERSION_NUM:
 516		return nvme_sg_get_version_num((void __user *)arg);
 517	case SG_IO:
 518		return nvme_sg_io(ns, (void __user *)arg);
 519#endif
 520	default:
 521		return -ENOTTY;
 522	}
 523}
 524
 525#ifdef CONFIG_COMPAT
 526static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
 527			unsigned int cmd, unsigned long arg)
 528{
 529	switch (cmd) {
 530	case SG_IO:
 531		return -ENOIOCTLCMD;
 532	}
 533	return nvme_ioctl(bdev, mode, cmd, arg);
 534}
 535#else
 536#define nvme_compat_ioctl	NULL
 537#endif
 538
 539static int nvme_open(struct block_device *bdev, fmode_t mode)
 540{
 541	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
 542}
 543
 544static void nvme_release(struct gendisk *disk, fmode_t mode)
 545{
 546	struct nvme_ns *ns = disk->private_data;
 547
 548	module_put(ns->ctrl->ops->module);
 549	nvme_put_ns(ns);
 550}
 551
 552static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 553{
 554	/* some standard values */
 555	geo->heads = 1 << 6;
 556	geo->sectors = 1 << 5;
 557	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
 558	return 0;
 559}
 560
 561#ifdef CONFIG_BLK_DEV_INTEGRITY
 562static void nvme_init_integrity(struct nvme_ns *ns)
 563{
 564	struct blk_integrity integrity;
 565
 566	switch (ns->pi_type) {
 567	case NVME_NS_DPS_PI_TYPE3:
 568		integrity.profile = &t10_pi_type3_crc;
 569		break;
 570	case NVME_NS_DPS_PI_TYPE1:
 571	case NVME_NS_DPS_PI_TYPE2:
 572		integrity.profile = &t10_pi_type1_crc;
 573		break;
 574	default:
 575		integrity.profile = NULL;
 576		break;
 577	}
 578	integrity.tuple_size = ns->ms;
 579	blk_integrity_register(ns->disk, &integrity);
 580	blk_queue_max_integrity_segments(ns->queue, 1);
 581}
 582#else
 583static void nvme_init_integrity(struct nvme_ns *ns)
 584{
 585}
 586#endif /* CONFIG_BLK_DEV_INTEGRITY */
 587
 588static void nvme_config_discard(struct nvme_ns *ns)
 589{
 590	struct nvme_ctrl *ctrl = ns->ctrl;
 591	u32 logical_block_size = queue_logical_block_size(ns->queue);
 592
 593	if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
 594		ns->queue->limits.discard_zeroes_data = 1;
 595	else
 596		ns->queue->limits.discard_zeroes_data = 0;
 597
 598	ns->queue->limits.discard_alignment = logical_block_size;
 599	ns->queue->limits.discard_granularity = logical_block_size;
 600	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
 601	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
 602}
 603
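/*
 * Re-read the Identify Namespace data and bring the gendisk back in sync
 * with it: logical block size, metadata and protection information
 * settings, capacity and discard support.
 */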
 604static int nvme_revalidate_disk(struct gendisk *disk)
 605{
 606	struct nvme_ns *ns = disk->private_data;
 607	struct nvme_id_ns *id;
 608	u8 lbaf, pi_type;
 609	u16 old_ms;
 610	unsigned short bs;
 611
 612	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
 613		set_capacity(disk, 0);
 614		return -ENODEV;
 615	}
 616	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
 617		dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n",
 618				__func__);
 619		return -ENODEV;
 620	}
 621	if (id->ncap == 0) {
 622		kfree(id);
 623		return -ENODEV;
 624	}
 625
 626	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
 627		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
 628			dev_warn(disk_to_dev(ns->disk),
 629				"%s: LightNVM init failure\n", __func__);
 630			kfree(id);
 631			return -ENODEV;
 632		}
 633		ns->type = NVME_NS_LIGHTNVM;
 634	}
 635
 636	if (ns->ctrl->vs >= NVME_VS(1, 1))
 637		memcpy(ns->eui, id->eui64, sizeof(ns->eui));
 638	if (ns->ctrl->vs >= NVME_VS(1, 2))
 639		memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
 640
 641	old_ms = ns->ms;
 642	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
 643	ns->lba_shift = id->lbaf[lbaf].ds;
 644	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
 645	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
 646
 647	/*
 648	 * If identify reported a zero LBA data size, use a default 512 byte
 649	 * block size so the block layer can operate before I/O fails for 0 capacity.
 650	 */
 651	if (ns->lba_shift == 0)
 652		ns->lba_shift = 9;
 653	bs = 1 << ns->lba_shift;
 654	/* XXX: PI implementation requires metadata equal to the t10 pi tuple size */
 655	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
 656					id->dps & NVME_NS_DPS_PI_MASK : 0;
 657
 658	blk_mq_freeze_queue(disk->queue);
 659	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
 660				ns->ms != old_ms ||
 661				bs != queue_logical_block_size(disk->queue) ||
 662				(ns->ms && ns->ext)))
 663		blk_integrity_unregister(disk);
 664
 665	ns->pi_type = pi_type;
 666	blk_queue_logical_block_size(ns->queue, bs);
 667
 668	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
 669		nvme_init_integrity(ns);
 670	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
 671		set_capacity(disk, 0);
 672	else
 673		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
 674
 675	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
 676		nvme_config_discard(ns);
 677	blk_mq_unfreeze_queue(disk->queue);
 678
 679	kfree(id);
 680	return 0;
 681}
 682
 683static char nvme_pr_type(enum pr_type type)
 684{
 685	switch (type) {
 686	case PR_WRITE_EXCLUSIVE:
 687		return 1;
 688	case PR_EXCLUSIVE_ACCESS:
 689		return 2;
 690	case PR_WRITE_EXCLUSIVE_REG_ONLY:
 691		return 3;
 692	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
 693		return 4;
 694	case PR_WRITE_EXCLUSIVE_ALL_REGS:
 695		return 5;
 696	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
 697		return 6;
 698	default:
 699		return 0;
 700	}
 701};
 702
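/*
 * Back the block layer persistent reservation API with NVMe reservation
 * commands.  The 16-byte payload carries the current key and the service
 * action key; cdw10 encodes the reservation type and action flags.
 */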
 703static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 704				u64 key, u64 sa_key, u8 op)
 705{
 706	struct nvme_ns *ns = bdev->bd_disk->private_data;
 707	struct nvme_command c;
 708	u8 data[16] = { 0, };
 709
 710	put_unaligned_le64(key, &data[0]);
 711	put_unaligned_le64(sa_key, &data[8]);
 712
 713	memset(&c, 0, sizeof(c));
 714	c.common.opcode = op;
 715	c.common.nsid = cpu_to_le32(ns->ns_id);
 716	c.common.cdw10[0] = cpu_to_le32(cdw10);
 717
 718	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 719}
 720
 721static int nvme_pr_register(struct block_device *bdev, u64 old,
 722		u64 new, unsigned flags)
 723{
 724	u32 cdw10;
 725
 726	if (flags & ~PR_FL_IGNORE_KEY)
 727		return -EOPNOTSUPP;
 728
 729	cdw10 = old ? 2 : 0;
 730	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
 731	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
 732	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
 733}
 734
 735static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 736		enum pr_type type, unsigned flags)
 737{
 738	u32 cdw10;
 739
 740	if (flags & ~PR_FL_IGNORE_KEY)
 741		return -EOPNOTSUPP;
 742
 743	cdw10 = nvme_pr_type(type) << 8;
 744	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
 745	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
 746}
 747
 748static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 749		enum pr_type type, bool abort)
 750{
 751	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
 752	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
 753}
 754
 755static int nvme_pr_clear(struct block_device *bdev, u64 key)
 756{
 757	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
 758	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
 759}
 760
 761static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 762{
 763	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
 764	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 765}
 766
 767static const struct pr_ops nvme_pr_ops = {
 768	.pr_register	= nvme_pr_register,
 769	.pr_reserve	= nvme_pr_reserve,
 770	.pr_release	= nvme_pr_release,
 771	.pr_preempt	= nvme_pr_preempt,
 772	.pr_clear	= nvme_pr_clear,
 773};
 774
 775static const struct block_device_operations nvme_fops = {
 776	.owner		= THIS_MODULE,
 777	.ioctl		= nvme_ioctl,
 778	.compat_ioctl	= nvme_compat_ioctl,
 779	.open		= nvme_open,
 780	.release	= nvme_release,
 781	.getgeo		= nvme_getgeo,
 782	.revalidate_disk= nvme_revalidate_disk,
 783	.pr_ops		= &nvme_pr_ops,
 784};
 785
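/*
 * Poll CSTS.RDY until it matches the expected state, allowing up to the
 * timeout advertised in CAP.TO (in 500ms units) for the transition.
 */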
 786static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
 787{
 788	unsigned long timeout =
 789		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
 790	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
 791	int ret;
 792
 793	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
 794		if ((csts & NVME_CSTS_RDY) == bit)
 795			break;
 796
 797		msleep(100);
 798		if (fatal_signal_pending(current))
 799			return -EINTR;
 800		if (time_after(jiffies, timeout)) {
 801			dev_err(ctrl->device,
 802				"Device not ready; aborting %s\n", enabled ?
 803						"initialisation" : "reset");
 804			return -ENODEV;
 805		}
 806	}
 807
 808	return ret;
 809}
 810
 811/*
 812 * If the device has been passed off to us in an enabled state, just clear
 813 * the enabled bit.  The spec says we should set the 'shutdown notification
 814 * bits', but doing so may cause the device to complete commands to the
 815 * admin queue ... and we don't know what memory that might be pointing at!
 816 */
 817int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 818{
 819	int ret;
 820
 821	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
 822	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
 823
 824	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 825	if (ret)
 826		return ret;
 827	return nvme_wait_ready(ctrl, cap, false);
 828}
 829EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
 830
 831int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 832{
 833	/*
 834	 * Default to a 4K page size, with the intention to update this
 835	 * path in the future to accommodate architectures with differing
 836	 * kernel and IO page sizes.
 837	 */
 838	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
 839	int ret;
 840
 841	if (page_shift < dev_page_min) {
 842		dev_err(ctrl->device,
 843			"Minimum device page size %u too large for host (%u)\n",
 844			1 << dev_page_min, 1 << page_shift);
 845		return -ENODEV;
 846	}
 847
 848	ctrl->page_size = 1 << page_shift;
 849
 850	ctrl->ctrl_config = NVME_CC_CSS_NVM;
 851	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
 852	ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
 853	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 854	ctrl->ctrl_config |= NVME_CC_ENABLE;
 855
 856	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 857	if (ret)
 858		return ret;
 859	return nvme_wait_ready(ctrl, cap, true);
 860}
 861EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
 862
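/*
 * Request a normal shutdown via CC.SHN and poll CSTS until the controller
 * reports shutdown processing complete.
 */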
 863int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 864{
 865	unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
 866	u32 csts;
 867	int ret;
 868
 869	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
 870	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
 871
 872	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 873	if (ret)
 874		return ret;
 875
 876	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
 877		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
 878			break;
 879
 880		msleep(100);
 881		if (fatal_signal_pending(current))
 882			return -EINTR;
 883		if (time_after(jiffies, timeout)) {
 884			dev_err(ctrl->device,
 885				"Device shutdown incomplete; abort shutdown\n");
 886			return -ENODEV;
 887		}
 888	}
 889
 890	return ret;
 891}
 892EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
 893
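/*
 * Apply controller-wide limits to a request queue: maximum transfer size,
 * segment count, stripe chunking, volatile write cache flushing and the
 * PRP-imposed virtual boundary.
 */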
 894static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 895		struct request_queue *q)
 896{
 897	if (ctrl->max_hw_sectors) {
 898		u32 max_segments =
 899			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
 900
 901		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 902		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 903	}
 904	if (ctrl->stripe_size)
 905		blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
 906	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 907		blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
 908	blk_queue_virt_boundary(q, ctrl->page_size - 1);
 909}
 910
 911/*
 912 * Initialize the cached copies of the Identify data and various controller
 913 * registers in our nvme_ctrl structure.  This should be called as soon as
 914 * the admin queue is fully up and running.
 915 */
 916int nvme_init_identify(struct nvme_ctrl *ctrl)
 917{
 918	struct nvme_id_ctrl *id;
 919	u64 cap;
 920	int ret, page_shift;
 921
 922	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
 923	if (ret) {
 924		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
 925		return ret;
 926	}
 927
 928	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
 929	if (ret) {
 930		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
 931		return ret;
 932	}
 933	page_shift = NVME_CAP_MPSMIN(cap) + 12;
 934
 935	if (ctrl->vs >= NVME_VS(1, 1))
 936		ctrl->subsystem = NVME_CAP_NSSRC(cap);
 937
 938	ret = nvme_identify_ctrl(ctrl, &id);
 939	if (ret) {
 940		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
 941		return -EIO;
 942	}
 943
 944	ctrl->vid = le16_to_cpu(id->vid);
 945	ctrl->oncs = le16_to_cpup(&id->oncs);
 946	atomic_set(&ctrl->abort_limit, id->acl + 1);
 947	ctrl->vwc = id->vwc;
 948	ctrl->cntlid = le16_to_cpup(&id->cntlid);
 949	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
 950	memcpy(ctrl->model, id->mn, sizeof(id->mn));
 951	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
 952	if (id->mdts)
 953		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
 954	else
 955		ctrl->max_hw_sectors = UINT_MAX;
 956
 957	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
 958		unsigned int max_hw_sectors;
 959
 960		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
 961		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
 962		if (ctrl->max_hw_sectors) {
 963			ctrl->max_hw_sectors = min(max_hw_sectors,
 964							ctrl->max_hw_sectors);
 965		} else {
 966			ctrl->max_hw_sectors = max_hw_sectors;
 967		}
 968	}
 969
 970	nvme_set_queue_limits(ctrl, ctrl->admin_q);
 971
 972	kfree(id);
 973	return 0;
 974}
 975EXPORT_SYMBOL_GPL(nvme_init_identify);
 976
 977static int nvme_dev_open(struct inode *inode, struct file *file)
 978{
 979	struct nvme_ctrl *ctrl;
 980	int instance = iminor(inode);
 981	int ret = -ENODEV;
 982
 983	spin_lock(&dev_list_lock);
 984	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
 985		if (ctrl->instance != instance)
 986			continue;
 987
 988		if (!ctrl->admin_q) {
 989			ret = -EWOULDBLOCK;
 990			break;
 991		}
 992		if (!kref_get_unless_zero(&ctrl->kref))
 993			break;
 994		file->private_data = ctrl;
 995		ret = 0;
 996		break;
 997	}
 998	spin_unlock(&dev_list_lock);
 999
1000	return ret;
1001}
1002
1003static int nvme_dev_release(struct inode *inode, struct file *file)
1004{
1005	nvme_put_ctrl(file->private_data);
1006	return 0;
1007}
1008
1009static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
1010{
1011	struct nvme_ns *ns;
1012	int ret;
1013
1014	mutex_lock(&ctrl->namespaces_mutex);
1015	if (list_empty(&ctrl->namespaces)) {
1016		ret = -ENOTTY;
1017		goto out_unlock;
1018	}
1019
1020	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
1021	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
1022		dev_warn(ctrl->device,
1023			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
1024		ret = -EINVAL;
1025		goto out_unlock;
1026	}
1027
1028	dev_warn(ctrl->device,
1029		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
1030	kref_get(&ns->kref);
1031	mutex_unlock(&ctrl->namespaces_mutex);
1032
1033	ret = nvme_user_cmd(ctrl, ns, argp);
1034	nvme_put_ns(ns);
1035	return ret;
1036
1037out_unlock:
1038	mutex_unlock(&ctrl->namespaces_mutex);
1039	return ret;
1040}
1041
1042static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
1043		unsigned long arg)
1044{
1045	struct nvme_ctrl *ctrl = file->private_data;
1046	void __user *argp = (void __user *)arg;
1047
1048	switch (cmd) {
1049	case NVME_IOCTL_ADMIN_CMD:
1050		return nvme_user_cmd(ctrl, NULL, argp);
1051	case NVME_IOCTL_IO_CMD:
1052		return nvme_dev_user_cmd(ctrl, argp);
1053	case NVME_IOCTL_RESET:
1054		dev_warn(ctrl->device, "resetting controller\n");
1055		return ctrl->ops->reset_ctrl(ctrl);
1056	case NVME_IOCTL_SUBSYS_RESET:
1057		return nvme_reset_subsystem(ctrl);
1058	default:
1059		return -ENOTTY;
1060	}
1061}
1062
1063static const struct file_operations nvme_dev_fops = {
1064	.owner		= THIS_MODULE,
1065	.open		= nvme_dev_open,
1066	.release	= nvme_dev_release,
1067	.unlocked_ioctl	= nvme_dev_ioctl,
1068	.compat_ioctl	= nvme_dev_ioctl,
1069};
1070
1071static ssize_t nvme_sysfs_reset(struct device *dev,
1072				struct device_attribute *attr, const char *buf,
1073				size_t count)
1074{
1075	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1076	int ret;
1077
1078	ret = ctrl->ops->reset_ctrl(ctrl);
1079	if (ret < 0)
1080		return ret;
1081	return count;
1082}
1083static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
1084
1085static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
1086								char *buf)
1087{
1088	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1089	struct nvme_ctrl *ctrl = ns->ctrl;
1090	int serial_len = sizeof(ctrl->serial);
1091	int model_len = sizeof(ctrl->model);
1092
1093	if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1094		return sprintf(buf, "eui.%16phN\n", ns->uuid);
1095
1096	if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1097		return sprintf(buf, "eui.%8phN\n", ns->eui);
1098
1099	while (ctrl->serial[serial_len - 1] == ' ')
1100		serial_len--;
1101	while (ctrl->model[model_len - 1] == ' ')
1102		model_len--;
1103
1104	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
1105		serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
1106}
1107static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
1108
1109static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
1110								char *buf)
1111{
1112	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1113	return sprintf(buf, "%pU\n", ns->uuid);
1114}
1115static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
1116
1117static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
1118								char *buf)
1119{
1120	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1121	return sprintf(buf, "%8phd\n", ns->eui);
1122}
1123static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
1124
1125static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
1126								char *buf)
1127{
1128	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1129	return sprintf(buf, "%d\n", ns->ns_id);
1130}
1131static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
1132
1133static struct attribute *nvme_ns_attrs[] = {
1134	&dev_attr_wwid.attr,
1135	&dev_attr_uuid.attr,
1136	&dev_attr_eui.attr,
1137	&dev_attr_nsid.attr,
1138	NULL,
1139};
1140
1141static umode_t nvme_attrs_are_visible(struct kobject *kobj,
1142		struct attribute *a, int n)
1143{
1144	struct device *dev = container_of(kobj, struct device, kobj);
1145	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1146
1147	if (a == &dev_attr_uuid.attr) {
1148		if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1149			return 0;
1150	}
1151	if (a == &dev_attr_eui.attr) {
1152		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1153			return 0;
1154	}
1155	return a->mode;
1156}
1157
1158static const struct attribute_group nvme_ns_attr_group = {
1159	.attrs		= nvme_ns_attrs,
1160	.is_visible	= nvme_attrs_are_visible,
1161};
1162
1163#define nvme_show_str_function(field)						\
1164static ssize_t  field##_show(struct device *dev,				\
1165			    struct device_attribute *attr, char *buf)		\
1166{										\
1167        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
1168        return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);	\
1169}										\
1170static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1171
1172#define nvme_show_int_function(field)						\
1173static ssize_t  field##_show(struct device *dev,				\
1174			    struct device_attribute *attr, char *buf)		\
1175{										\
1176        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
1177        return sprintf(buf, "%d\n", ctrl->field);	\
1178}										\
1179static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1180
1181nvme_show_str_function(model);
1182nvme_show_str_function(serial);
1183nvme_show_str_function(firmware_rev);
1184nvme_show_int_function(cntlid);
1185
1186static struct attribute *nvme_dev_attrs[] = {
1187	&dev_attr_reset_controller.attr,
1188	&dev_attr_model.attr,
1189	&dev_attr_serial.attr,
1190	&dev_attr_firmware_rev.attr,
1191	&dev_attr_cntlid.attr,
1192	NULL
1193};
1194
1195static struct attribute_group nvme_dev_attrs_group = {
1196	.attrs = nvme_dev_attrs,
1197};
1198
1199static const struct attribute_group *nvme_dev_attr_groups[] = {
1200	&nvme_dev_attrs_group,
1201	NULL,
1202};
1203
1204static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
1205{
1206	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
1207	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
1208
1209	return nsa->ns_id - nsb->ns_id;
1210}
1211
1212static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1213{
1214	struct nvme_ns *ns;
1215
1216	lockdep_assert_held(&ctrl->namespaces_mutex);
1217
1218	list_for_each_entry(ns, &ctrl->namespaces, list) {
1219		if (ns->ns_id == nsid)
1220			return ns;
1221		if (ns->ns_id > nsid)
1222			break;
1223	}
1224	return NULL;
1225}
1226
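/*
 * Allocate and register a new namespace: set up its request queue and
 * gendisk, read the namespace identify data via nvme_revalidate_disk(),
 * and expose the identification attributes in sysfs.
 */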
1227static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1228{
1229	struct nvme_ns *ns;
1230	struct gendisk *disk;
1231	int node = dev_to_node(ctrl->dev);
1232
1233	lockdep_assert_held(&ctrl->namespaces_mutex);
1234
1235	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
1236	if (!ns)
1237		return;
1238
1239	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
1240	if (ns->instance < 0)
1241		goto out_free_ns;
1242
1243	ns->queue = blk_mq_init_queue(ctrl->tagset);
1244	if (IS_ERR(ns->queue))
1245		goto out_release_instance;
1246	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1247	ns->queue->queuedata = ns;
1248	ns->ctrl = ctrl;
1249
1250	disk = alloc_disk_node(0, node);
1251	if (!disk)
1252		goto out_free_queue;
1253
1254	kref_init(&ns->kref);
1255	ns->ns_id = nsid;
1256	ns->disk = disk;
1257	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
1258
1259
1260	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1261	nvme_set_queue_limits(ctrl, ns->queue);
1262
1263	disk->major = nvme_major;
1264	disk->first_minor = 0;
1265	disk->fops = &nvme_fops;
1266	disk->private_data = ns;
1267	disk->queue = ns->queue;
1268	disk->driverfs_dev = ctrl->device;
1269	disk->flags = GENHD_FL_EXT_DEVT;
1270	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
1271
1272	if (nvme_revalidate_disk(ns->disk))
1273		goto out_free_disk;
1274
1275	list_add_tail(&ns->list, &ctrl->namespaces);
1276	kref_get(&ctrl->kref);
1277	if (ns->type == NVME_NS_LIGHTNVM)
1278		return;
1279
1280	add_disk(ns->disk);
1281	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
1282					&nvme_ns_attr_group))
1283		pr_warn("%s: failed to create sysfs group for identification\n",
1284			ns->disk->disk_name);
1285	return;
1286 out_free_disk:
1287	kfree(disk);
1288 out_free_queue:
1289	blk_cleanup_queue(ns->queue);
1290 out_release_instance:
1291	ida_simple_remove(&ctrl->ns_ida, ns->instance);
1292 out_free_ns:
1293	kfree(ns);
1294}
1295
1296static void nvme_ns_remove(struct nvme_ns *ns)
1297{
1298	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
1299		return;
1300
1301	if (ns->disk->flags & GENHD_FL_UP) {
1302		if (blk_get_integrity(ns->disk))
1303			blk_integrity_unregister(ns->disk);
1304		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
1305					&nvme_ns_attr_group);
1306		del_gendisk(ns->disk);
1307		blk_mq_abort_requeue_list(ns->queue);
1308		blk_cleanup_queue(ns->queue);
1309	}
1310	mutex_lock(&ns->ctrl->namespaces_mutex);
1311	list_del_init(&ns->list);
1312	mutex_unlock(&ns->ctrl->namespaces_mutex);
1313	nvme_put_ns(ns);
1314}
1315
1316static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1317{
1318	struct nvme_ns *ns;
1319
1320	ns = nvme_find_ns(ctrl, nsid);
1321	if (ns) {
1322		if (revalidate_disk(ns->disk))
1323			nvme_ns_remove(ns);
1324	} else
1325		nvme_alloc_ns(ctrl, nsid);
1326}
1327
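/*
 * Scan namespaces using the Identify (CNS=2) active namespace ID list,
 * 1024 entries at a time, and remove any namespace whose ID falls in a gap
 * between reported IDs.
 */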
1328static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1329{
1330	struct nvme_ns *ns;
1331	__le32 *ns_list;
1332	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
1333	int ret = 0;
1334
1335	ns_list = kzalloc(0x1000, GFP_KERNEL);
1336	if (!ns_list)
1337		return -ENOMEM;
1338
1339	for (i = 0; i < num_lists; i++) {
1340		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
1341		if (ret)
1342			goto out;
1343
1344		for (j = 0; j < min(nn, 1024U); j++) {
1345			nsid = le32_to_cpu(ns_list[j]);
1346			if (!nsid)
1347				goto out;
1348
1349			nvme_validate_ns(ctrl, nsid);
1350
1351			while (++prev < nsid) {
1352				ns = nvme_find_ns(ctrl, prev);
1353				if (ns)
1354					nvme_ns_remove(ns);
1355			}
1356		}
1357		nn -= j;
1358	}
1359 out:
1360	kfree(ns_list);
1361	return ret;
1362}
1363
1364static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
1365{
1366	struct nvme_ns *ns, *next;
1367	unsigned i;
1368
1369	lockdep_assert_held(&ctrl->namespaces_mutex);
1370
1371	for (i = 1; i <= nn; i++)
1372		nvme_validate_ns(ctrl, i);
1373
1374	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1375		if (ns->ns_id > nn)
1376			nvme_ns_remove(ns);
1377	}
1378}
1379
1380void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
1381{
1382	struct nvme_id_ctrl *id;
1383	unsigned nn;
1384
1385	if (nvme_identify_ctrl(ctrl, &id))
1386		return;
1387
1388	mutex_lock(&ctrl->namespaces_mutex);
1389	nn = le32_to_cpu(id->nn);
1390	if (ctrl->vs >= NVME_VS(1, 1) &&
1391	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
1392		if (!nvme_scan_ns_list(ctrl, nn))
1393			goto done;
1394	}
1395	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
1396 done:
1397	list_sort(NULL, &ctrl->namespaces, ns_cmp);
1398	mutex_unlock(&ctrl->namespaces_mutex);
1399	kfree(id);
1400}
1401EXPORT_SYMBOL_GPL(nvme_scan_namespaces);
1402
1403void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
1404{
1405	struct nvme_ns *ns, *next;
1406
1407	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
1408		nvme_ns_remove(ns);
1409}
1410EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
1411
1412static DEFINE_IDA(nvme_instance_ida);
1413
1414static int nvme_set_instance(struct nvme_ctrl *ctrl)
1415{
1416	int instance, error;
1417
1418	do {
1419		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1420			return -ENODEV;
1421
1422		spin_lock(&dev_list_lock);
1423		error = ida_get_new(&nvme_instance_ida, &instance);
1424		spin_unlock(&dev_list_lock);
1425	} while (error == -EAGAIN);
1426
1427	if (error)
1428		return -ENODEV;
1429
1430	ctrl->instance = instance;
1431	return 0;
1432}
1433
1434static void nvme_release_instance(struct nvme_ctrl *ctrl)
1435{
1436	spin_lock(&dev_list_lock);
1437	ida_remove(&nvme_instance_ida, ctrl->instance);
1438	spin_unlock(&dev_list_lock);
1439}
1440
1441void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
1442{
1443	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1444
1445	spin_lock(&dev_list_lock);
1446	list_del(&ctrl->node);
1447	spin_unlock(&dev_list_lock);
1448}
1449EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
1450
1451static void nvme_free_ctrl(struct kref *kref)
1452{
1453	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
1454
1455	put_device(ctrl->device);
1456	nvme_release_instance(ctrl);
1457	ida_destroy(&ctrl->ns_ida);
1458
1459	ctrl->ops->free_ctrl(ctrl);
1460}
1461
1462void nvme_put_ctrl(struct nvme_ctrl *ctrl)
1463{
1464	kref_put(&ctrl->kref, nvme_free_ctrl);
1465}
1466EXPORT_SYMBOL_GPL(nvme_put_ctrl);
1467
1468/*
1469 * Initialize an NVMe controller structure.  This needs to be called during
1470 * the earliest initialization so that we have the initialized structure around
1471 * during probing.
1472 */
1473int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
1474		const struct nvme_ctrl_ops *ops, unsigned long quirks)
1475{
1476	int ret;
1477
1478	INIT_LIST_HEAD(&ctrl->namespaces);
1479	mutex_init(&ctrl->namespaces_mutex);
1480	kref_init(&ctrl->kref);
1481	ctrl->dev = dev;
1482	ctrl->ops = ops;
1483	ctrl->quirks = quirks;
1484
1485	ret = nvme_set_instance(ctrl);
1486	if (ret)
1487		goto out;
1488
1489	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
1490				MKDEV(nvme_char_major, ctrl->instance),
1491				ctrl, nvme_dev_attr_groups,
1492				"nvme%d", ctrl->instance);
1493	if (IS_ERR(ctrl->device)) {
1494		ret = PTR_ERR(ctrl->device);
1495		goto out_release_instance;
1496	}
1497	get_device(ctrl->device);
1498	ida_init(&ctrl->ns_ida);
1499
1500	spin_lock(&dev_list_lock);
1501	list_add_tail(&ctrl->node, &nvme_ctrl_list);
1502	spin_unlock(&dev_list_lock);
1503
1504	return 0;
1505out_release_instance:
1506	nvme_release_instance(ctrl);
1507out:
1508	return ret;
1509}
1510EXPORT_SYMBOL_GPL(nvme_init_ctrl);
1511
1512/**
1513 * nvme_kill_queues(): Ends all namespace queues
1514 * @ctrl: the dead controller whose namespace queues need to be ended
1515 *
1516 * Call this function when the driver determines it is unable to get the
1517 * controller in a state capable of servicing IO.
1518 */
1519void nvme_kill_queues(struct nvme_ctrl *ctrl)
1520{
1521	struct nvme_ns *ns;
1522
1523	mutex_lock(&ctrl->namespaces_mutex);
1524	list_for_each_entry(ns, &ctrl->namespaces, list) {
1525		if (!kref_get_unless_zero(&ns->kref))
1526			continue;
1527
1528		/*
1529		 * Revalidating a dead namespace sets capacity to 0. This will
1530		 * end buffered writers dirtying pages that can't be synced.
1531		 */
1532		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
1533			revalidate_disk(ns->disk);
1534
1535		blk_set_queue_dying(ns->queue);
1536		blk_mq_abort_requeue_list(ns->queue);
1537		blk_mq_start_stopped_hw_queues(ns->queue, true);
1538
1539		nvme_put_ns(ns);
1540	}
1541	mutex_unlock(&ctrl->namespaces_mutex);
1542}
1543EXPORT_SYMBOL_GPL(nvme_kill_queues);
1544
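/*
 * Quiesce I/O on all namespaces: mark each queue stopped, cancel pending
 * requeue work and stop the hardware queues.  nvme_start_queues() undoes
 * this.
 */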
1545void nvme_stop_queues(struct nvme_ctrl *ctrl)
1546{
1547	struct nvme_ns *ns;
1548
1549	mutex_lock(&ctrl->namespaces_mutex);
1550	list_for_each_entry(ns, &ctrl->namespaces, list) {
1551		spin_lock_irq(ns->queue->queue_lock);
1552		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
1553		spin_unlock_irq(ns->queue->queue_lock);
1554
1555		blk_mq_cancel_requeue_work(ns->queue);
1556		blk_mq_stop_hw_queues(ns->queue);
1557	}
1558	mutex_unlock(&ctrl->namespaces_mutex);
1559}
1560EXPORT_SYMBOL_GPL(nvme_stop_queues);
1561
1562void nvme_start_queues(struct nvme_ctrl *ctrl)
1563{
1564	struct nvme_ns *ns;
1565
1566	mutex_lock(&ctrl->namespaces_mutex);
1567	list_for_each_entry(ns, &ctrl->namespaces, list) {
1568		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
1569		blk_mq_start_stopped_hw_queues(ns->queue, true);
1570		blk_mq_kick_requeue_list(ns->queue);
1571	}
1572	mutex_unlock(&ctrl->namespaces_mutex);
1573}
1574EXPORT_SYMBOL_GPL(nvme_start_queues);
1575
1576int __init nvme_core_init(void)
1577{
1578	int result;
1579
1580	result = register_blkdev(nvme_major, "nvme");
1581	if (result < 0)
1582		return result;
1583	else if (result > 0)
1584		nvme_major = result;
1585
1586	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
1587							&nvme_dev_fops);
1588	if (result < 0)
1589		goto unregister_blkdev;
1590	else if (result > 0)
1591		nvme_char_major = result;
1592
1593	nvme_class = class_create(THIS_MODULE, "nvme");
1594	if (IS_ERR(nvme_class)) {
1595		result = PTR_ERR(nvme_class);
1596		goto unregister_chrdev;
1597	}
1598
1599	return 0;
1600
1601 unregister_chrdev:
1602	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1603 unregister_blkdev:
1604	unregister_blkdev(nvme_major, "nvme");
1605	return result;
1606}
1607
1608void nvme_core_exit(void)
1609{
1610	unregister_blkdev(nvme_major, "nvme");
1611	class_destroy(nvme_class);
1612	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1613}
1614
1615MODULE_LICENSE("GPL");
1616MODULE_VERSION("1.0");
1617module_init(nvme_core_init);
1618module_exit(nvme_core_exit);