multipath.c - drivers/nvme/host/multipath.c - Linux diff v6.9.4

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (c) 2017-2018 Christoph Hellwig.
  4 */
  5
  6#include <linux/backing-dev.h>
  7#include <linux/moduleparam.h>
  8#include <linux/vmalloc.h>
  9#include <trace/events/block.h>
 10#include "nvme.h"
 11
/*
 * "multipath" module parameter, exposed with mode 0444.  Other code in this
 * file checks it before creating a multipath node or setting up ANA.
 */
 12bool multipath = true;
 13module_param(multipath, bool, 0444);
 14MODULE_PARM_DESC(multipath,
 15	"turn on native support for multiple controllers per subsystem");
 16
 17static const char *nvme_iopolicy_names[] = {
 18	[NVME_IOPOLICY_NUMA]	= "numa",
 19	[NVME_IOPOLICY_RR]	= "round-robin",
 20};
 21
 22static int iopolicy = NVME_IOPOLICY_NUMA;
 23
 24static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 25{
 26	if (!val)
 27		return -EINVAL;
 28	if (!strncmp(val, "numa", 4))
 29		iopolicy = NVME_IOPOLICY_NUMA;
 30	else if (!strncmp(val, "round-robin", 11))
 31		iopolicy = NVME_IOPOLICY_RR;
 32	else
 33		return -EINVAL;
 34
 35	return 0;
 36}
 37
 38static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 39{
 40	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
 41}
 42
/*
 * Register "iopolicy" as a module parameter (mode 0644) backed by the custom
 * nvme_set_iopolicy()/nvme_get_iopolicy() handlers.
 */
 43module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 44	&iopolicy, 0644);
 45MODULE_PARM_DESC(iopolicy,
 46	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
 47
 48void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 49{
 50	subsys->iopolicy = iopolicy;
 51}
 52
 53void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
 54{
 55	struct nvme_ns_head *h;
 56
 57	lockdep_assert_held(&subsys->lock);
 58	list_for_each_entry(h, &subsys->nsheads, entry)
 59		if (h->disk)
 60			blk_mq_unfreeze_queue(h->disk->queue);
 61}
 62
 63void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
 64{
 65	struct nvme_ns_head *h;
 66
 67	lockdep_assert_held(&subsys->lock);
 68	list_for_each_entry(h, &subsys->nsheads, entry)
 69		if (h->disk)
 70			blk_mq_freeze_queue_wait(h->disk->queue);
 71}
 72
 73void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 74{
 75	struct nvme_ns_head *h;
 76
 77	lockdep_assert_held(&subsys->lock);
 78	list_for_each_entry(h, &subsys->nsheads, entry)
 79		if (h->disk)
 80			blk_freeze_queue_start(h->disk->queue);
 81}
 82
 83void nvme_failover_req(struct request *req)
 84{
 85	struct nvme_ns *ns = req->q->queuedata;
 86	u16 status = nvme_req(req)->status & 0x7ff;
 87	unsigned long flags;
 88	struct bio *bio;
 89
 90	nvme_mpath_clear_current_path(ns);
 91
 92	/*
 93	 * If we got back an ANA error, we know the controller is alive but not
 94	 * ready to serve this namespace.  Kick of a re-read of the ANA
 95	 * information page, and just try any other available path for now.
 96	 */
 97	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
 98		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
 99		queue_work(nvme_wq, &ns->ctrl->ana_work);
100	}
101
102	spin_lock_irqsave(&ns->head->requeue_lock, flags);
103	for (bio = req->bio; bio; bio = bio->bi_next) {
104		bio_set_dev(bio, ns->head->disk->part0);
105		if (bio->bi_opf & REQ_POLLED) {
106			bio->bi_opf &= ~REQ_POLLED;
107			bio->bi_cookie = BLK_QC_T_NONE;
108		}
109		/*
110		 * The alternate request queue that we may end up submitting
111		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
112		 * will fail the I/O immediately with EAGAIN to the issuer.
113		 * We are not in the issuer context which cannot block. Clear
114		 * the flag to avoid spurious EAGAIN I/O failures.
115		 */
116		bio->bi_opf &= ~REQ_NOWAIT;
117	}
118	blk_steal_bios(&ns->head->requeue_list, req);
119	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
120
121	nvme_req(req)->status = 0;
122	nvme_end_req(req);
123	kblockd_schedule_work(&ns->head->requeue_work);
124}
125
126void nvme_mpath_start_request(struct request *rq)
127{
128	struct nvme_ns *ns = rq->q->queuedata;
129	struct gendisk *disk = ns->head->disk;
130
131	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
132		return;
133
134	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
135	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
136						      jiffies);
 
137}
138EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
139
140void nvme_mpath_end_request(struct request *rq)
141{
142	struct nvme_ns *ns = rq->q->queuedata;
143
144	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
145		return;
146	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
147			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
148			 nvme_req(rq)->start_time);
149}
150
151void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
152{
153	struct nvme_ns *ns;
154
155	down_read(&ctrl->namespaces_rwsem);
156	list_for_each_entry(ns, &ctrl->namespaces, list) {
157		if (!ns->head->disk)
158			continue;
159		kblockd_schedule_work(&ns->head->requeue_work);
160		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
161			disk_uevent(ns->head->disk, KOBJ_CHANGE);
162	}
163	up_read(&ctrl->namespaces_rwsem);
164}
165
166static const char *nvme_ana_state_names[] = {
167	[0]				= "invalid state",
168	[NVME_ANA_OPTIMIZED]		= "optimized",
169	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
170	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
171	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
172	[NVME_ANA_CHANGE]		= "change",
173};
174
175bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
176{
177	struct nvme_ns_head *head = ns->head;
178	bool changed = false;
179	int node;
180
181	if (!head)
182		goto out;
183
184	for_each_node(node) {
185		if (ns == rcu_access_pointer(head->current_path[node])) {
186			rcu_assign_pointer(head->current_path[node], NULL);
187			changed = true;
188		}
189	}
190out:
191	return changed;
192}
193
194void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
195{
196	struct nvme_ns *ns;
197
198	down_read(&ctrl->namespaces_rwsem);
199	list_for_each_entry(ns, &ctrl->namespaces, list) {
200		nvme_mpath_clear_current_path(ns);
201		kblockd_schedule_work(&ns->head->requeue_work);
202	}
203	up_read(&ctrl->namespaces_rwsem);
204}
205
206void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
207{
208	struct nvme_ns_head *head = ns->head;
209	sector_t capacity = get_capacity(head->disk);
210	int node;
211	int srcu_idx;
212
213	srcu_idx = srcu_read_lock(&head->srcu);
214	list_for_each_entry_rcu(ns, &head->list, siblings) {
215		if (capacity != get_capacity(ns->disk))
216			clear_bit(NVME_NS_READY, &ns->flags);
217	}
218	srcu_read_unlock(&head->srcu, srcu_idx);
219
220	for_each_node(node)
221		rcu_assign_pointer(head->current_path[node], NULL);
222	kblockd_schedule_work(&head->requeue_work);
223}
224
225static bool nvme_path_is_disabled(struct nvme_ns *ns)
226{
227	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
228
229	/*
230	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
231	 * still be able to complete assuming that the controller is connected.
232	 * Otherwise it will fail immediately and return to the requeue list.
233	 */
234	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
 
235		return true;
236	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
237	    !test_bit(NVME_NS_READY, &ns->flags))
238		return true;
239	return false;
240}
241
242static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
243{
244	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
245	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
246
247	list_for_each_entry_rcu(ns, &head->list, siblings) {
248		if (nvme_path_is_disabled(ns))
249			continue;
250
251		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
252		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
253			distance = node_distance(node, ns->ctrl->numa_node);
254		else
255			distance = LOCAL_DISTANCE;
256
257		switch (ns->ana_state) {
258		case NVME_ANA_OPTIMIZED:
259			if (distance < found_distance) {
260				found_distance = distance;
261				found = ns;
262			}
263			break;
264		case NVME_ANA_NONOPTIMIZED:
265			if (distance < fallback_distance) {
266				fallback_distance = distance;
267				fallback = ns;
268			}
269			break;
270		default:
271			break;
272		}
273	}
274
275	if (!found)
276		found = fallback;
277	if (found)
278		rcu_assign_pointer(head->current_path[node], found);
279	return found;
280}
281
282static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
283		struct nvme_ns *ns)
284{
285	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
286			siblings);
287	if (ns)
288		return ns;
289	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
290}
291
/*
 * Round-robin path selection for @head on NUMA node @node.
 *
 * @old is the currently cached path.  The siblings after @old are walked
 * (wrapping around) until @old is reached again; the first enabled
 * ANA-optimized sibling is taken immediately, otherwise the last enabled
 * non-optimized sibling seen is remembered.  @old itself is only reused
 * under the conditions described in the comment below.
 *
 * A newly selected path is stored in head->current_path[node]; returning
 * @old leaves the cache untouched.  Returns NULL if no usable path exists.
 */
292static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
293		int node, struct nvme_ns *old)
294{
295	struct nvme_ns *ns, *found = NULL;
296
	/* Only one path on the list: there is nothing to rotate to. */
297	if (list_is_singular(&head->list)) {
298		if (nvme_path_is_disabled(old))
299			return NULL;
300		return old;
301	}
302
303	for (ns = nvme_next_ns(head, old);
304	     ns && ns != old;
305	     ns = nvme_next_ns(head, ns)) {
306		if (nvme_path_is_disabled(ns))
307			continue;
308
309		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
310			found = ns;
311			goto out;
312		}
		/* Keep looking: an optimized path further on still wins. */
313		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
314			found = ns;
315	}
316
317	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
322	 */
323	if (!nvme_path_is_disabled(old) &&
324	    (old->ana_state == NVME_ANA_OPTIMIZED ||
325	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
326		return old;
327
328	if (!found)
329		return NULL;
330out:
331	rcu_assign_pointer(head->current_path[node], found);
332	return found;
333}
334
335static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
336{
337	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
338		ns->ana_state == NVME_ANA_OPTIMIZED;
339}
340
341inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
342{
343	int node = numa_node_id();
344	struct nvme_ns *ns;
345
346	ns = srcu_dereference(head->current_path[node], &head->srcu);
347	if (unlikely(!ns))
348		return __nvme_find_path(head, node);
349
350	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
351		return nvme_round_robin_path(head, node, ns);
352	if (unlikely(!nvme_path_is_optimized(ns)))
353		return __nvme_find_path(head, node);
354	return ns;
355}
356
357static bool nvme_available_path(struct nvme_ns_head *head)
358{
359	struct nvme_ns *ns;
360
361	list_for_each_entry_rcu(ns, &head->list, siblings) {
362		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
363			continue;
364		switch (nvme_ctrl_state(ns->ctrl)) {
365		case NVME_CTRL_LIVE:
366		case NVME_CTRL_RESETTING:
367		case NVME_CTRL_CONNECTING:
368			/* fallthru */
369			return true;
370		default:
371			break;
372		}
373	}
374	return false;
375}
376
377static void nvme_ns_head_submit_bio(struct bio *bio)
378{
379	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
380	struct device *dev = disk_to_dev(head->disk);
381	struct nvme_ns *ns;
382	int srcu_idx;
383
384	/*
385	 * The namespace might be going away and the bio might be moved to a
386	 * different queue via blk_steal_bios(), so we need to use the bio_split
387	 * pool from the original queue to allocate the bvecs from.
388	 */
389	bio = bio_split_to_limits(bio);
390	if (!bio)
391		return;
392
393	srcu_idx = srcu_read_lock(&head->srcu);
394	ns = nvme_find_path(head);
395	if (likely(ns)) {
396		bio_set_dev(bio, ns->disk->part0);
397		bio->bi_opf |= REQ_NVME_MPATH;
398		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
399				      bio->bi_iter.bi_sector);
400		submit_bio_noacct(bio);
401	} else if (nvme_available_path(head)) {
402		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
403
404		spin_lock_irq(&head->requeue_lock);
405		bio_list_add(&head->requeue_list, bio);
406		spin_unlock_irq(&head->requeue_lock);
407	} else {
408		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
409
410		bio_io_error(bio);
411	}
412
413	srcu_read_unlock(&head->srcu, srcu_idx);
414}
415
416static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
417{
418	if (!nvme_tryget_ns_head(disk->private_data))
419		return -ENXIO;
420	return 0;
421}
422
423static void nvme_ns_head_release(struct gendisk *disk)
424{
425	nvme_put_ns_head(disk->private_data);
426}
427
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * report_zones handler of the multipath node: forward the request to the
 * path chosen by nvme_find_path().  Returns -EWOULDBLOCK when no path is
 * currently usable, otherwise the result of nvme_ns_report_zones().
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx;
	int ret;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	else
		ret = -EWOULDBLOCK;
	srcu_read_unlock(&head->srcu, srcu_idx);

	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
446
/*
 * Block device operations of the multipath node.  nvme_mpath_alloc_disk()
 * installs this table as the fops of the head's gendisk.  report_zones is
 * NULL when CONFIG_BLK_DEV_ZONED is disabled.
 */
447const struct block_device_operations nvme_ns_head_ops = {
448	.owner		= THIS_MODULE,
449	.submit_bio	= nvme_ns_head_submit_bio,
450	.open		= nvme_ns_head_open,
451	.release	= nvme_ns_head_release,
452	.ioctl		= nvme_ns_head_ioctl,
453	.compat_ioctl	= blkdev_compat_ptr_ioctl,
454	.getgeo		= nvme_getgeo,
455	.report_zones	= nvme_ns_head_report_zones,
456	.pr_ops		= &nvme_pr_ops,
457};
458
459static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
460{
461	return container_of(cdev, struct nvme_ns_head, cdev);
462}
463
464static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
465{
466	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
467		return -ENXIO;
468	return 0;
469}
470
471static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
472{
473	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
474	return 0;
475}
476
/*
 * File operations of the per-head character device; registered by
 * nvme_add_ns_head_cdev().
 */
477static const struct file_operations nvme_ns_head_chr_fops = {
478	.owner		= THIS_MODULE,
479	.open		= nvme_ns_head_chr_open,
480	.release	= nvme_ns_head_chr_release,
481	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
482	.compat_ioctl	= compat_ptr_ioctl,
483	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
484	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
485};
486
487static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
488{
489	int ret;
490
491	head->cdev_device.parent = &head->subsys->dev;
492	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
493			   head->subsys->instance, head->instance);
494	if (ret)
495		return ret;
496	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
497			    &nvme_ns_head_chr_fops, THIS_MODULE);
498	return ret;
499}
500
501static void nvme_requeue_work(struct work_struct *work)
502{
503	struct nvme_ns_head *head =
504		container_of(work, struct nvme_ns_head, requeue_work);
505	struct bio *bio, *next;
506
507	spin_lock_irq(&head->requeue_lock);
508	next = bio_list_get(&head->requeue_list);
509	spin_unlock_irq(&head->requeue_lock);
510
511	while ((bio = next) != NULL) {
512		next = bio->bi_next;
513		bio->bi_next = NULL;
514
515		submit_bio_noacct(bio);
516	}
517}
518
519int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
520{
521	struct queue_limits lim;
522	bool vwc = false;
523
524	mutex_init(&head->lock);
525	bio_list_init(&head->requeue_list);
526	spin_lock_init(&head->requeue_lock);
527	INIT_WORK(&head->requeue_work, nvme_requeue_work);
528
529	/*
530	 * Add a multipath node if the subsystems supports multiple controllers.
531	 * We also do this for private namespaces as the namespace sharing flag
532	 * could change after a rescan.
533	 */
534	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
535	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
536		return 0;
537
538	blk_set_stacking_limits(&lim);
539	lim.dma_alignment = 3;
540	if (head->ids.csi != NVME_CSI_ZNS)
541		lim.max_zone_append_sectors = 0;
542
543	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
544	if (IS_ERR(head->disk))
545		return PTR_ERR(head->disk);
546	head->disk->fops = &nvme_ns_head_ops;
547	head->disk->private_data = head;
548	sprintf(head->disk->disk_name, "nvme%dn%d",
549			ctrl->subsys->instance, head->instance);
550
551	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
552	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
553	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
554	/*
555	 * This assumes all controllers that refer to a namespace either
556	 * support poll queues or not.  That is not a strict guarantee,
557	 * but if the assumption is wrong the effect is only suboptimal
558	 * performance but not correctness problem.
559	 */
560	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
561	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
562		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
563
 
 
 
 
 
564	/* we need to propagate up the VMC settings */
565	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
566		vwc = true;
567	blk_queue_write_cache(head->disk->queue, vwc, vwc);
568	return 0;
569}
570
571static void nvme_mpath_set_live(struct nvme_ns *ns)
572{
573	struct nvme_ns_head *head = ns->head;
574	int rc;
575
576	if (!head->disk)
577		return;
578
579	/*
580	 * test_and_set_bit() is used because it is protecting against two nvme
581	 * paths simultaneously calling device_add_disk() on the same namespace
582	 * head.
583	 */
584	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
585		rc = device_add_disk(&head->subsys->dev, head->disk,
586				     nvme_ns_attr_groups);
587		if (rc) {
588			clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
589			return;
590		}
591		nvme_add_ns_head_cdev(head);
592	}
593
594	mutex_lock(&head->lock);
595	if (nvme_path_is_optimized(ns)) {
596		int node, srcu_idx;
597
598		srcu_idx = srcu_read_lock(&head->srcu);
599		for_each_node(node)
600			__nvme_find_path(head, node);
601		srcu_read_unlock(&head->srcu, srcu_idx);
602	}
603	mutex_unlock(&head->lock);
604
605	synchronize_srcu(&head->srcu);
606	kblockd_schedule_work(&head->requeue_work);
607}
608
609static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
610		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
611			void *))
612{
613	void *base = ctrl->ana_log_buf;
614	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
615	int error, i;
616
617	lockdep_assert_held(&ctrl->ana_lock);
618
619	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
620		struct nvme_ana_group_desc *desc = base + offset;
621		u32 nr_nsids;
622		size_t nsid_buf_size;
623
624		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
625			return -EINVAL;
626
627		nr_nsids = le32_to_cpu(desc->nnsids);
628		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
629
630		if (WARN_ON_ONCE(desc->grpid == 0))
631			return -EINVAL;
632		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
633			return -EINVAL;
634		if (WARN_ON_ONCE(desc->state == 0))
635			return -EINVAL;
636		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
637			return -EINVAL;
638
639		offset += sizeof(*desc);
640		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
641			return -EINVAL;
642
643		error = cb(ctrl, desc, data);
644		if (error)
645			return error;
646
647		offset += nsid_buf_size;
648	}
649
650	return 0;
651}
652
653static inline bool nvme_state_is_live(enum nvme_ana_state state)
654{
655	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
656}
657
658static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
659		struct nvme_ns *ns)
660{
661	ns->ana_grpid = le32_to_cpu(desc->grpid);
662	ns->ana_state = desc->state;
663	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
664	/*
665	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
666	 * and in turn to this path device.  However we cannot accept this I/O
667	 * if the controller is not live.  This may deadlock if called from
668	 * nvme_mpath_init_identify() and the ctrl will never complete
669	 * initialization, preventing I/O from completing.  For this case we
670	 * will reprocess the ANA log page in nvme_mpath_update() once the
671	 * controller is ready.
672	 */
673	if (nvme_state_is_live(ns->ana_state) &&
674	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
675		nvme_mpath_set_live(ns);
676}
677
/*
 * nvme_parse_ana_log() callback: apply one ANA group descriptor to the
 * namespaces of @ctrl.
 *
 * @data points to an unsigned counter that is incremented for every group
 * found in the NVME_ANA_CHANGE state.
 *
 * The controller's namespace list and the descriptor's NSID array are walked
 * in step; the comparisons below only make progress correctly if both are
 * sorted by ascending NSID (presumably guaranteed elsewhere; confirm).
 *
 * Always returns 0.
 */
678static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
679		struct nvme_ana_group_desc *desc, void *data)
680{
681	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
682	unsigned *nr_change_groups = data;
683	struct nvme_ns *ns;
684
685	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
686			le32_to_cpu(desc->grpid),
687			nvme_ana_state_names[desc->state]);
688
689	if (desc->state == NVME_ANA_CHANGE)
690		(*nr_change_groups)++;
691
	/* A group without namespaces has nothing to update. */
692	if (!nr_nsids)
693		return 0;
694
695	down_read(&ctrl->namespaces_rwsem);
696	list_for_each_entry(ns, &ctrl->namespaces, list) {
697		unsigned nsid;
698again:
699		nsid = le32_to_cpu(desc->nsids[n]);
		/* Namespace is behind the descriptor entry: next namespace. */
700		if (ns->head->ns_id < nsid)
701			continue;
702		if (ns->head->ns_id == nsid)
703			nvme_update_ns_ana_state(desc, ns);
		/* Entry n is consumed; stop once the array is exhausted. */
704		if (++n == nr_nsids)
705			break;
		/* Namespace is ahead: retry it against the next entry. */
706		if (ns->head->ns_id > nsid)
707			goto again;
708	}
709	up_read(&ctrl->namespaces_rwsem);
710	return 0;
711}
712
713static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
714{
715	u32 nr_change_groups = 0;
716	int error;
717
718	mutex_lock(&ctrl->ana_lock);
719	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
720			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
721	if (error) {
722		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
723		goto out_unlock;
724	}
725
726	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
727			nvme_update_ana_state);
728	if (error)
729		goto out_unlock;
730
731	/*
732	 * In theory we should have an ANATT timer per group as they might enter
733	 * the change state at different times.  But that is a lot of overhead
734	 * just to protect against a target that keeps entering new changes
735	 * states while never finishing previous ones.  But we'll still
736	 * eventually time out once all groups are in change state, so this
737	 * isn't a big deal.
738	 *
739	 * We also double the ANATT value to provide some slack for transports
740	 * or AEN processing overhead.
741	 */
742	if (nr_change_groups)
743		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
744	else
745		del_timer_sync(&ctrl->anatt_timer);
746out_unlock:
747	mutex_unlock(&ctrl->ana_lock);
748	return error;
749}
750
751static void nvme_ana_work(struct work_struct *work)
752{
753	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
754
755	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
756		return;
757
758	nvme_read_ana_log(ctrl);
759}
760
761void nvme_mpath_update(struct nvme_ctrl *ctrl)
762{
763	u32 nr_change_groups = 0;
764
765	if (!ctrl->ana_log_buf)
766		return;
767
768	mutex_lock(&ctrl->ana_lock);
769	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
770	mutex_unlock(&ctrl->ana_lock);
771}
772
773static void nvme_anatt_timeout(struct timer_list *t)
774{
775	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
776
777	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
778	nvme_reset_ctrl(ctrl);
779}
780
781void nvme_mpath_stop(struct nvme_ctrl *ctrl)
782{
783	if (!nvme_ctrl_use_ana(ctrl))
784		return;
785	del_timer_sync(&ctrl->anatt_timer);
786	cancel_work_sync(&ctrl->ana_work);
787}
788
/*
 * Define a device attribute named subsys_attr_<_name> with the given mode
 * and show/store handlers.
 */
789#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
790	struct device_attribute subsys_attr_##_name =	\
791		__ATTR(_name, _mode, _show, _store)
792
793static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
794		struct device_attribute *attr, char *buf)
795{
796	struct nvme_subsystem *subsys =
797		container_of(dev, struct nvme_subsystem, dev);
798
799	return sysfs_emit(buf, "%s\n",
800			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
801}
802
803static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
804		struct device_attribute *attr, const char *buf, size_t count)
805{
806	struct nvme_subsystem *subsys =
807		container_of(dev, struct nvme_subsystem, dev);
808	int i;
809
810	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
811		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
812			WRITE_ONCE(subsys->iopolicy, i);
813			return count;
814		}
815	}
816
817	return -EINVAL;
818}
819SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
820		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
821
822static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
823		char *buf)
824{
825	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
826}
827DEVICE_ATTR_RO(ana_grpid);
828
829static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
830		char *buf)
831{
832	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
833
834	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
835}
836DEVICE_ATTR_RO(ana_state);
837
838static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
839		struct nvme_ana_group_desc *desc, void *data)
840{
841	struct nvme_ana_group_desc *dst = data;
842
843	if (desc->grpid != dst->grpid)
844		return 0;
845
846	*dst = *desc;
847	return -ENXIO; /* just break out of the loop */
848}
849
850void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
851{
852	if (nvme_ctrl_use_ana(ns->ctrl)) {
853		struct nvme_ana_group_desc desc = {
854			.grpid = anagrpid,
855			.state = 0,
856		};
857
858		mutex_lock(&ns->ctrl->ana_lock);
859		ns->ana_grpid = le32_to_cpu(anagrpid);
860		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
861		mutex_unlock(&ns->ctrl->ana_lock);
862		if (desc.state) {
863			/* found the group desc: update */
864			nvme_update_ns_ana_state(&desc, ns);
865		} else {
866			/* group desc not found: trigger a re-read */
867			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
868			queue_work(nvme_wq, &ns->ctrl->ana_work);
869		}
870	} else {
871		ns->ana_state = NVME_ANA_OPTIMIZED;
872		nvme_mpath_set_live(ns);
873	}
874
875	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
876		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
877				   ns->head->disk->queue);
878#ifdef CONFIG_BLK_DEV_ZONED
879	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
880		ns->head->disk->nr_zones = ns->disk->nr_zones;
881#endif
882}
883
884void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
885{
886	if (!head->disk)
887		return;
888	kblockd_schedule_work(&head->requeue_work);
889	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
890		nvme_cdev_del(&head->cdev, &head->cdev_device);
891		del_gendisk(head->disk);
892	}
893}
894
895void nvme_mpath_remove_disk(struct nvme_ns_head *head)
896{
897	if (!head->disk)
898		return;
 
899	/* make sure all pending bios are cleaned up */
900	kblockd_schedule_work(&head->requeue_work);
901	flush_work(&head->requeue_work);
902	put_disk(head->disk);
903}
904
905void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
906{
907	mutex_init(&ctrl->ana_lock);
908	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
909	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
910}
911
912int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
913{
914	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
915	size_t ana_log_size;
916	int error = 0;
917
918	/* check if multipath is enabled and we have the capability */
919	if (!multipath || !ctrl->subsys ||
920	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
921		return 0;
922
923	if (!ctrl->max_namespaces ||
924	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
925		dev_err(ctrl->device,
926			"Invalid MNAN value %u\n", ctrl->max_namespaces);
927		return -EINVAL;
928	}
929
930	ctrl->anacap = id->anacap;
931	ctrl->anatt = id->anatt;
932	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
933	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
934
935	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
936		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
937		ctrl->max_namespaces * sizeof(__le32);
938	if (ana_log_size > max_transfer_size) {
939		dev_err(ctrl->device,
940			"ANA log page size (%zd) larger than MDTS (%zd).\n",
941			ana_log_size, max_transfer_size);
942		dev_err(ctrl->device, "disabling ANA support.\n");
943		goto out_uninit;
944	}
945	if (ana_log_size > ctrl->ana_log_size) {
946		nvme_mpath_stop(ctrl);
947		nvme_mpath_uninit(ctrl);
948		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
949		if (!ctrl->ana_log_buf)
950			return -ENOMEM;
951	}
952	ctrl->ana_log_size = ana_log_size;
953	error = nvme_read_ana_log(ctrl);
954	if (error)
955		goto out_uninit;
956	return 0;
957
958out_uninit:
959	nvme_mpath_uninit(ctrl);
960	return error;
961}
962
963void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
964{
965	kvfree(ctrl->ana_log_buf);
966	ctrl->ana_log_buf = NULL;
967	ctrl->ana_log_size = 0;
968}

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (c) 2017-2018 Christoph Hellwig.
  4 */
  5
  6#include <linux/backing-dev.h>
  7#include <linux/moduleparam.h>
  8#include <linux/vmalloc.h>
  9#include <trace/events/block.h>
 10#include "nvme.h"
 11
/* "multipath" module parameter, exposed with mode 0444. */
 12bool multipath = true;
 13module_param(multipath, bool, 0444);
 14MODULE_PARM_DESC(multipath,
 15	"turn on native support for multiple controllers per subsystem");
 16
 17static const char *nvme_iopolicy_names[] = {
 18	[NVME_IOPOLICY_NUMA]	= "numa",
 19	[NVME_IOPOLICY_RR]	= "round-robin",
 20};
 21
 22static int iopolicy = NVME_IOPOLICY_NUMA;
 23
 24static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 25{
 26	if (!val)
 27		return -EINVAL;
 28	if (!strncmp(val, "numa", 4))
 29		iopolicy = NVME_IOPOLICY_NUMA;
 30	else if (!strncmp(val, "round-robin", 11))
 31		iopolicy = NVME_IOPOLICY_RR;
 32	else
 33		return -EINVAL;
 34
 35	return 0;
 36}
 37
 38static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 39{
 40	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
 41}
 42
/*
 * Register "iopolicy" as a module parameter (mode 0644) backed by the custom
 * nvme_set_iopolicy()/nvme_get_iopolicy() handlers.
 */
 43module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 44	&iopolicy, 0644);
 45MODULE_PARM_DESC(iopolicy,
 46	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
 47
 48void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 49{
 50	subsys->iopolicy = iopolicy;
 51}
 52
 53void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
 54{
 55	struct nvme_ns_head *h;
 56
 57	lockdep_assert_held(&subsys->lock);
 58	list_for_each_entry(h, &subsys->nsheads, entry)
 59		if (h->disk)
 60			blk_mq_unfreeze_queue(h->disk->queue);
 61}
 62
 63void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
 64{
 65	struct nvme_ns_head *h;
 66
 67	lockdep_assert_held(&subsys->lock);
 68	list_for_each_entry(h, &subsys->nsheads, entry)
 69		if (h->disk)
 70			blk_mq_freeze_queue_wait(h->disk->queue);
 71}
 72
 73void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 74{
 75	struct nvme_ns_head *h;
 76
 77	lockdep_assert_held(&subsys->lock);
 78	list_for_each_entry(h, &subsys->nsheads, entry)
 79		if (h->disk)
 80			blk_freeze_queue_start(h->disk->queue);
 81}
 82
 83void nvme_failover_req(struct request *req)
 84{
 85	struct nvme_ns *ns = req->q->queuedata;
 86	u16 status = nvme_req(req)->status & 0x7ff;
 87	unsigned long flags;
 88	struct bio *bio;
 89
 90	nvme_mpath_clear_current_path(ns);
 91
 92	/*
 93	 * If we got back an ANA error, we know the controller is alive but not
 94	 * ready to serve this namespace.  Kick of a re-read of the ANA
 95	 * information page, and just try any other available path for now.
 96	 */
 97	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
 98		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
 99		queue_work(nvme_wq, &ns->ctrl->ana_work);
100	}
101
102	spin_lock_irqsave(&ns->head->requeue_lock, flags);
103	for (bio = req->bio; bio; bio = bio->bi_next) {
104		bio_set_dev(bio, ns->head->disk->part0);
105		if (bio->bi_opf & REQ_POLLED) {
106			bio->bi_opf &= ~REQ_POLLED;
107			bio->bi_cookie = BLK_QC_T_NONE;
108		}
 
 
 
 
 
 
 
 
109	}
110	blk_steal_bios(&ns->head->requeue_list, req);
111	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
112
113	blk_mq_end_request(req, 0);
 
114	kblockd_schedule_work(&ns->head->requeue_work);
115}
116
117void nvme_mpath_start_request(struct request *rq)
118{
119	struct nvme_ns *ns = rq->q->queuedata;
120	struct gendisk *disk = ns->head->disk;
121
122	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
123		return;
124
125	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
126	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
127					blk_rq_bytes(rq) >> SECTOR_SHIFT,
128					req_op(rq), jiffies);
129}
130EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
131
132void nvme_mpath_end_request(struct request *rq)
133{
134	struct nvme_ns *ns = rq->q->queuedata;
135
136	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
137		return;
138	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
139		nvme_req(rq)->start_time);
 
140}
141
142void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
143{
144	struct nvme_ns *ns;
145
146	down_read(&ctrl->namespaces_rwsem);
147	list_for_each_entry(ns, &ctrl->namespaces, list) {
148		if (!ns->head->disk)
149			continue;
150		kblockd_schedule_work(&ns->head->requeue_work);
151		if (ctrl->state == NVME_CTRL_LIVE)
152			disk_uevent(ns->head->disk, KOBJ_CHANGE);
153	}
154	up_read(&ctrl->namespaces_rwsem);
155}
156
/*
 * Human-readable names for the ANA states, indexed by the state value.
 * Index 0 is not a valid ANA state and maps to a placeholder string.
 */
static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};
165
166bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
167{
168	struct nvme_ns_head *head = ns->head;
169	bool changed = false;
170	int node;
171
172	if (!head)
173		goto out;
174
175	for_each_node(node) {
176		if (ns == rcu_access_pointer(head->current_path[node])) {
177			rcu_assign_pointer(head->current_path[node], NULL);
178			changed = true;
179		}
180	}
181out:
182	return changed;
183}
184
185void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
186{
187	struct nvme_ns *ns;
188
189	down_read(&ctrl->namespaces_rwsem);
190	list_for_each_entry(ns, &ctrl->namespaces, list) {
191		nvme_mpath_clear_current_path(ns);
192		kblockd_schedule_work(&ns->head->requeue_work);
193	}
194	up_read(&ctrl->namespaces_rwsem);
195}
196
197void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
198{
199	struct nvme_ns_head *head = ns->head;
200	sector_t capacity = get_capacity(head->disk);
201	int node;
202	int srcu_idx;
203
204	srcu_idx = srcu_read_lock(&head->srcu);
205	list_for_each_entry_rcu(ns, &head->list, siblings) {
206		if (capacity != get_capacity(ns->disk))
207			clear_bit(NVME_NS_READY, &ns->flags);
208	}
209	srcu_read_unlock(&head->srcu, srcu_idx);
210
211	for_each_node(node)
212		rcu_assign_pointer(head->current_path[node], NULL);
213	kblockd_schedule_work(&head->requeue_work);
214}
215
216static bool nvme_path_is_disabled(struct nvme_ns *ns)
217{
 
 
218	/*
219	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
220	 * still be able to complete assuming that the controller is connected.
221	 * Otherwise it will fail immediately and return to the requeue list.
222	 */
223	if (ns->ctrl->state != NVME_CTRL_LIVE &&
224	    ns->ctrl->state != NVME_CTRL_DELETING)
225		return true;
226	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
227	    !test_bit(NVME_NS_READY, &ns->flags))
228		return true;
229	return false;
230}
231
232static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
233{
234	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
235	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
236
237	list_for_each_entry_rcu(ns, &head->list, siblings) {
238		if (nvme_path_is_disabled(ns))
239			continue;
240
241		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 
242			distance = node_distance(node, ns->ctrl->numa_node);
243		else
244			distance = LOCAL_DISTANCE;
245
246		switch (ns->ana_state) {
247		case NVME_ANA_OPTIMIZED:
248			if (distance < found_distance) {
249				found_distance = distance;
250				found = ns;
251			}
252			break;
253		case NVME_ANA_NONOPTIMIZED:
254			if (distance < fallback_distance) {
255				fallback_distance = distance;
256				fallback = ns;
257			}
258			break;
259		default:
260			break;
261		}
262	}
263
264	if (!found)
265		found = fallback;
266	if (found)
267		rcu_assign_pointer(head->current_path[node], found);
268	return found;
269}
270
271static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
272		struct nvme_ns *ns)
273{
274	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
275			siblings);
276	if (ns)
277		return ns;
278	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
279}
280
/*
 * Select the path to use after @old under the round-robin policy.
 *
 * With a single path on the list, @old is returned unless it is disabled.
 * Otherwise the siblings are walked starting after @old (wrapping around)
 * until @old is reached again: the first enabled optimized path is taken
 * immediately, and failing that the last enabled non-optimized path seen is
 * remembered.  When a path other than @old is chosen it is cached in
 * head->current_path[node].
 *
 * Returns the chosen path, or NULL if no usable path exists.
 */
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}
323
/* True when @ns sits on a live controller and its ANA state is optimized. */
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}
329
330inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
331{
332	int node = numa_node_id();
333	struct nvme_ns *ns;
334
335	ns = srcu_dereference(head->current_path[node], &head->srcu);
336	if (unlikely(!ns))
337		return __nvme_find_path(head, node);
338
339	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
340		return nvme_round_robin_path(head, node, ns);
341	if (unlikely(!nvme_path_is_optimized(ns)))
342		return __nvme_find_path(head, node);
343	return ns;
344}
345
346static bool nvme_available_path(struct nvme_ns_head *head)
347{
348	struct nvme_ns *ns;
349
350	list_for_each_entry_rcu(ns, &head->list, siblings) {
351		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
352			continue;
353		switch (ns->ctrl->state) {
354		case NVME_CTRL_LIVE:
355		case NVME_CTRL_RESETTING:
356		case NVME_CTRL_CONNECTING:
357			/* fallthru */
358			return true;
359		default:
360			break;
361		}
362	}
363	return false;
364}
365
/*
 * submit_bio handler of the multipath disk.
 *
 * Under the head's SRCU read lock a path is looked up.  If one is found the
 * bio is remapped to that path's disk, flagged REQ_NVME_MPATH and submitted.
 * If no path is usable right now but one may become available, the bio is
 * parked on the head's requeue list.  Otherwise the bio is failed.
 */
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		/* Park the bio; the requeue worker resubmits it later. */
		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}
404
405static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
406{
407	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
408		return -ENXIO;
409	return 0;
410}
411
/* Block device release: drop the namespace head reference. */
static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}
416
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * Report zones for the multipath disk by forwarding the request to the
 * currently selected path.  Returns -EWOULDBLOCK when no path is available,
 * otherwise whatever nvme_ns_report_zones() returns.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *path;
	int srcu_idx, ret;

	srcu_idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	if (path)
		ret = nvme_ns_report_zones(path, sector, nr_zones, cb, data);
	else
		ret = -EWOULDBLOCK;
	srcu_read_unlock(&head->srcu, srcu_idx);

	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
435
/* Block device operations of the multipath (namespace head) disk. */
const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	/* NULL when CONFIG_BLK_DEV_ZONED is not set */
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};
447
/* Map the embedded character device back to its namespace head. */
static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}
452
453static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
454{
455	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
456		return -ENXIO;
457	return 0;
458}
459
/* Character device release: drop the namespace head reference. */
static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}
465
/* File operations of the namespace head character device. */
static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
};
475
476static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
477{
478	int ret;
479
480	head->cdev_device.parent = &head->subsys->dev;
481	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
482			   head->subsys->instance, head->instance);
483	if (ret)
484		return ret;
485	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
486			    &nvme_ns_head_chr_fops, THIS_MODULE);
487	return ret;
488}
489
490static void nvme_requeue_work(struct work_struct *work)
491{
492	struct nvme_ns_head *head =
493		container_of(work, struct nvme_ns_head, requeue_work);
494	struct bio *bio, *next;
495
496	spin_lock_irq(&head->requeue_lock);
497	next = bio_list_get(&head->requeue_list);
498	spin_unlock_irq(&head->requeue_lock);
499
500	while ((bio = next) != NULL) {
501		next = bio->bi_next;
502		bio->bi_next = NULL;
503
504		submit_bio_noacct(bio);
505	}
506}
507
/*
 * Initialise the multipath state of @head and, when applicable, allocate
 * its multipath disk.
 *
 * The lock, requeue list and requeue work are always initialised.  The disk
 * is only allocated when the subsystem reports multi-controller capability,
 * the NSID is unique and the multipath module parameter is enabled;
 * otherwise head->disk is left unset and 0 is returned.
 *
 * Returns 0 on success or -ENOMEM if the disk cannot be allocated.
 */
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystems supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance but not correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);
	blk_queue_dma_alignment(head->disk->queue, 3);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}
558
559static void nvme_mpath_set_live(struct nvme_ns *ns)
560{
561	struct nvme_ns_head *head = ns->head;
562	int rc;
563
564	if (!head->disk)
565		return;
566
567	/*
568	 * test_and_set_bit() is used because it is protecting against two nvme
569	 * paths simultaneously calling device_add_disk() on the same namespace
570	 * head.
571	 */
572	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
573		rc = device_add_disk(&head->subsys->dev, head->disk,
574				     nvme_ns_id_attr_groups);
575		if (rc) {
576			clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
577			return;
578		}
579		nvme_add_ns_head_cdev(head);
580	}
581
582	mutex_lock(&head->lock);
583	if (nvme_path_is_optimized(ns)) {
584		int node, srcu_idx;
585
586		srcu_idx = srcu_read_lock(&head->srcu);
587		for_each_node(node)
588			__nvme_find_path(head, node);
589		srcu_read_unlock(&head->srcu, srcu_idx);
590	}
591	mutex_unlock(&head->lock);
592
593	synchronize_srcu(&head->srcu);
594	kblockd_schedule_work(&head->requeue_work);
595}
596
/*
 * Walk the group descriptors in ctrl->ana_log_buf and call @cb for each.
 *
 * Every descriptor is checked before @cb sees it: it must lie within the
 * log buffer, have a non-zero group id not above ctrl->anagrpmax, a state
 * in the range 1..NVME_ANA_CHANGE, and its NSID array must fit in the
 * buffer.  The caller must hold ctrl->ana_lock.
 *
 * Returns 0 after all descriptors were visited, -EINVAL on a malformed log,
 * or the first non-zero value returned by @cb (which stops the walk).
 */
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		/* The fixed part of the descriptor must fit in the buffer. */
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		/* The trailing NSID array must fit in the buffer as well. */
		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}
640
641static inline bool nvme_state_is_live(enum nvme_ana_state state)
642{
643	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
644}
645
/*
 * Apply the ANA group id and state from @desc to @ns and clear its pending
 * ANA update flag.  If the new state is a live one and the controller is
 * live, the multipath node is made live as well.
 */
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}
665
/*
 * nvme_parse_ana_log() callback: apply one ANA group descriptor to the
 * namespaces of @ctrl that it lists.
 *
 * @data points to a counter that is incremented for every group found in
 * the change state.  Always returns 0.
 */
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	/*
	 * Walk the namespace list and the descriptor's NSID array in
	 * lock-step, advancing whichever side has the smaller NSID.
	 * NOTE(review): this appears to assume both sequences are sorted by
	 * ascending NSID - confirm.
	 */
	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		/* Namespace is behind the descriptor entry: next namespace. */
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		/* Descriptor exhausted: done. */
		if (++n == nr_nsids)
			break;
		/* Descriptor entry was behind: retry with the next entry. */
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}
700
701static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
702{
703	u32 nr_change_groups = 0;
704	int error;
705
706	mutex_lock(&ctrl->ana_lock);
707	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
708			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
709	if (error) {
710		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
711		goto out_unlock;
712	}
713
714	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
715			nvme_update_ana_state);
716	if (error)
717		goto out_unlock;
718
719	/*
720	 * In theory we should have an ANATT timer per group as they might enter
721	 * the change state at different times.  But that is a lot of overhead
722	 * just to protect against a target that keeps entering new changes
723	 * states while never finishing previous ones.  But we'll still
724	 * eventually time out once all groups are in change state, so this
725	 * isn't a big deal.
726	 *
727	 * We also double the ANATT value to provide some slack for transports
728	 * or AEN processing overhead.
729	 */
730	if (nr_change_groups)
731		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
732	else
733		del_timer_sync(&ctrl->anatt_timer);
734out_unlock:
735	mutex_unlock(&ctrl->ana_lock);
736	return error;
737}
738
739static void nvme_ana_work(struct work_struct *work)
740{
741	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
742
743	if (ctrl->state != NVME_CTRL_LIVE)
744		return;
745
746	nvme_read_ana_log(ctrl);
747}
748
749void nvme_mpath_update(struct nvme_ctrl *ctrl)
750{
751	u32 nr_change_groups = 0;
752
753	if (!ctrl->ana_log_buf)
754		return;
755
756	mutex_lock(&ctrl->ana_lock);
757	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
758	mutex_unlock(&ctrl->ana_lock);
759}
760
/* ANATT timer callback: log the timeout and reset the controller. */
static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}
768
769void nvme_mpath_stop(struct nvme_ctrl *ctrl)
770{
771	if (!nvme_ctrl_use_ana(ctrl))
772		return;
773	del_timer_sync(&ctrl->anatt_timer);
774	cancel_work_sync(&ctrl->ana_work);
775}
776
/*
 * Define a device attribute named subsys_attr_<_name> with the given mode
 * and show/store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)
780
781static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
782		struct device_attribute *attr, char *buf)
783{
784	struct nvme_subsystem *subsys =
785		container_of(dev, struct nvme_subsystem, dev);
786
787	return sysfs_emit(buf, "%s\n",
788			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
789}
790
791static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
792		struct device_attribute *attr, const char *buf, size_t count)
793{
794	struct nvme_subsystem *subsys =
795		container_of(dev, struct nvme_subsystem, dev);
796	int i;
797
798	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
799		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
800			WRITE_ONCE(subsys->iopolicy, i);
801			return count;
802		}
803	}
804
805	return -EINVAL;
806}
/* "iopolicy" subsystem attribute: world-readable, writable by the owner. */
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
809
810static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
811		char *buf)
812{
813	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
814}
815DEVICE_ATTR_RO(ana_grpid);
816
817static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
818		char *buf)
819{
820	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
821
822	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
823}
824DEVICE_ATTR_RO(ana_state);
825
826static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
827		struct nvme_ana_group_desc *desc, void *data)
828{
829	struct nvme_ana_group_desc *dst = data;
830
831	if (desc->grpid != dst->grpid)
832		return 0;
833
834	*dst = *desc;
835	return -ENXIO; /* just break out of the loop */
836}
837
/*
 * Hook a newly added namespace path into multipathing.
 *
 * With ANA in use, the ANA state for group @anagrpid is looked up in the
 * cached ANA log and applied; if the group is not found, an ANA log re-read
 * is scheduled.  Without ANA the path is treated as optimized and made live
 * right away.  Finally the stable-writes flag and (for zoned devices) the
 * zone count are propagated from the path to the multipath disk.
 */
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		/* state == 0 doubles as the "not found" marker below */
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}
871
872void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
873{
874	if (!head->disk)
875		return;
876	kblockd_schedule_work(&head->requeue_work);
877	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
878		nvme_cdev_del(&head->cdev, &head->cdev_device);
879		del_gendisk(head->disk);
880	}
881}
882
883void nvme_mpath_remove_disk(struct nvme_ns_head *head)
884{
885	if (!head->disk)
886		return;
887	blk_mark_disk_dead(head->disk);
888	/* make sure all pending bios are cleaned up */
889	kblockd_schedule_work(&head->requeue_work);
890	flush_work(&head->requeue_work);
891	put_disk(head->disk);
892}
893
/* Initialise the ANA lock, ANATT timer and ANA work item of @ctrl. */
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}
900
/*
 * Set up ANA support for @ctrl from its Identify Controller data @id.
 *
 * Records the ANA parameters, sizes and (re)allocates the ANA log buffer,
 * and reads the ANA log once.
 *
 * Returns 0 on success, and also when multipath/ANA is not in use or when
 * the log page would exceed the maximum transfer size (ANA is then disabled
 * by releasing the buffer).  Returns -EINVAL for an invalid MNAN value,
 * -ENOMEM on allocation failure, or the error from reading the ANA log.
 */
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	/* header + one descriptor per group + one NSID slot per namespace */
	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		/* error is still 0 here: ANA is disabled, not a failure */
		goto out_uninit;
	}
	/* Only reallocate when the existing buffer is too small. */
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}
951
/* Release the ANA log buffer of @ctrl and reset its recorded size. */
void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}