// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
/*
 * Zone condition name strings, indexed by BLK_ZONE_COND_* value. Conditions
 * not listed here leave a NULL hole and are reported as "UNKNOWN" by
 * blk_zone_cond_str().
 */
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state (BLK_ZONE_WPLUG_*).
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	refcount_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or become full,
 *    to prevent new references to the zone write plug to be taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

95/**
96 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
97 * @zone_cond: BLK_ZONE_COND_XXX.
98 *
99 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
100 * into string format. Useful in the debugging and tracing zone conditions. For
101 * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
102 */
103const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
104{
105 static const char *zone_cond_str = "UNKNOWN";
106
107 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
108 zone_cond_str = zone_cond_name[zone_cond];
109
110 return zone_cond_str;
111}
112EXPORT_SYMBOL_GPL(blk_zone_cond_str);
113
/*
 * Arguments passed to disk_report_zones_cb() when forwarding zone reports.
 * @disk: Target disk.
 * @user_cb: Original caller-provided report callback (may be NULL).
 * @user_data: Private data for @user_cb.
 */
struct disk_report_zones_cb_args {
	struct gendisk	*disk;
	report_zones_cb	user_cb;
	void		*user_data;
};

120static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
121 struct blk_zone *zone);
122
123static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
124 void *data)
125{
126 struct disk_report_zones_cb_args *args = data;
127 struct gendisk *disk = args->disk;
128
129 if (disk->zone_wplugs_hash)
130 disk_zone_wplug_sync_wp_offset(disk, zone);
131
132 if (!args->user_cb)
133 return 0;
134
135 return args->user_cb(zone, idx, args->user_data);
136}
137
138/**
139 * blkdev_report_zones - Get zones information
140 * @bdev: Target block device
141 * @sector: Sector from which to report zones
142 * @nr_zones: Maximum number of zones to report
143 * @cb: Callback function called for each reported zone
144 * @data: Private data for the callback
145 *
146 * Description:
147 * Get zone information starting from the zone containing @sector for at most
148 * @nr_zones, and call @cb for each zone reported by the device.
149 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
150 * constant can be passed to @nr_zones.
151 * Returns the number of zones reported by the device, or a negative errno
152 * value in case of failure.
153 *
154 * Note: The caller must use memalloc_noXX_save/restore() calls to control
155 * memory allocations done within this function.
156 */
157int blkdev_report_zones(struct block_device *bdev, sector_t sector,
158 unsigned int nr_zones, report_zones_cb cb, void *data)
159{
160 struct gendisk *disk = bdev->bd_disk;
161 sector_t capacity = get_capacity(disk);
162 struct disk_report_zones_cb_args args = {
163 .disk = disk,
164 .user_cb = cb,
165 .user_data = data,
166 };
167
168 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
169 return -EOPNOTSUPP;
170
171 if (!nr_zones || sector >= capacity)
172 return 0;
173
174 return disk->fops->report_zones(disk, sector, nr_zones,
175 disk_report_zones_cb, &args);
176}
177EXPORT_SYMBOL_GPL(blkdev_report_zones);
178
/*
 * Reset all sequential zones of a device with a single on-stack
 * REQ_OP_ZONE_RESET_ALL BIO and wait for its completion.
 */
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

187/**
188 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
189 * @bdev: Target block device
190 * @op: Operation to be performed on the zones
191 * @sector: Start sector of the first zone to operate on
192 * @nr_sectors: Number of sectors, should be at least the length of one zone and
193 * must be zone size aligned.
194 *
195 * Description:
196 * Perform the specified operation on the range of zones specified by
197 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range
198 * is valid, but the specified range should not contain conventional zones.
199 * The operation to execute on each zone can be a zone reset, open, close
200 * or finish request.
201 */
202int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
203 sector_t sector, sector_t nr_sectors)
204{
205 sector_t zone_sectors = bdev_zone_sectors(bdev);
206 sector_t capacity = bdev_nr_sectors(bdev);
207 sector_t end_sector = sector + nr_sectors;
208 struct bio *bio = NULL;
209 int ret = 0;
210
211 if (!bdev_is_zoned(bdev))
212 return -EOPNOTSUPP;
213
214 if (bdev_read_only(bdev))
215 return -EPERM;
216
217 if (!op_is_zone_mgmt(op))
218 return -EOPNOTSUPP;
219
220 if (end_sector <= sector || end_sector > capacity)
221 /* Out of range */
222 return -EINVAL;
223
224 /* Check alignment (handle eventual smaller last zone) */
225 if (!bdev_is_zone_start(bdev, sector))
226 return -EINVAL;
227
228 if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
229 return -EINVAL;
230
231 /*
232 * In the case of a zone reset operation over all zones, use
233 * REQ_OP_ZONE_RESET_ALL.
234 */
235 if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
236 return blkdev_zone_reset_all(bdev);
237
238 while (sector < end_sector) {
239 bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
240 bio->bi_iter.bi_sector = sector;
241 sector += zone_sectors;
242
243 /* This may take a while, so be nice to others */
244 cond_resched();
245 }
246
247 ret = submit_bio_wait(bio);
248 bio_put(bio);
249
250 return ret;
251}
252EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
253
/* Destination of a BLKREPORTZONE ioctl: the user-space zone array to fill. */
struct zone_report_args {
	struct blk_zone __user *zones;
};

258static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
259 void *data)
260{
261 struct zone_report_args *args = data;
262
263 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
264 return -EFAULT;
265 return 0;
266}
267
268/*
269 * BLKREPORTZONE ioctl processing.
270 * Called from blkdev_ioctl.
271 */
272int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
273 unsigned long arg)
274{
275 void __user *argp = (void __user *)arg;
276 struct zone_report_args args;
277 struct blk_zone_report rep;
278 int ret;
279
280 if (!argp)
281 return -EINVAL;
282
283 if (!bdev_is_zoned(bdev))
284 return -ENOTTY;
285
286 if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
287 return -EFAULT;
288
289 if (!rep.nr_zones)
290 return -EINVAL;
291
292 args.zones = argp + sizeof(struct blk_zone_report);
293 ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
294 blkdev_copy_zone_to_user, &args);
295 if (ret < 0)
296 return ret;
297
298 rep.nr_zones = ret;
299 rep.flags = BLK_ZONE_REP_CAPACITY;
300 if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
301 return -EFAULT;
302 return 0;
303}
304
305static int blkdev_truncate_zone_range(struct block_device *bdev,
306 blk_mode_t mode, const struct blk_zone_range *zrange)
307{
308 loff_t start, end;
309
310 if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
311 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
312 /* Out of range */
313 return -EINVAL;
314
315 start = zrange->sector << SECTOR_SHIFT;
316 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
317
318 return truncate_bdev_range(bdev, mode, start, end);
319}
320
/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	/* All zone management operations modify the device. */
	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	/* The invalidate lock is taken only in the BLKRESETZONE case above. */
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}

377static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
378{
379 return zone->start + zone->len >= get_capacity(disk);
380}
381
382static bool disk_zone_is_full(struct gendisk *disk,
383 unsigned int zno, unsigned int offset_in_zone)
384{
385 if (zno < disk->nr_zones - 1)
386 return offset_in_zone >= disk->zone_capacity;
387 return offset_in_zone >= disk->last_zone_capacity;
388}
389
/* Return true if the zone managed by @zwplug is full. */
static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

/*
 * Insert a zone write plug in the disk hash table. Return false without
 * inserting if a plug for the same zone is already hashed (another submission
 * context won the race), true otherwise.
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

/*
 * RCU-protected lookup of the zone write plug for the zone containing
 * @sector. On success, a reference on the plug is taken and the plug
 * returned; NULL is returned if no plug is hashed for the zone or if its
 * reference count already dropped to zero (plug being freed).
 */
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

445static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
446 sector_t sector)
447{
448 if (!atomic_read(&disk->nr_zone_wplugs))
449 return NULL;
450
451 return disk_get_hashed_zone_wplug(disk, sector);
452}
453
454static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
455{
456 struct blk_zone_wplug *zwplug =
457 container_of(rcu_head, struct blk_zone_wplug, rcu_head);
458
459 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
460}
461
462static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
463{
464 if (refcount_dec_and_test(&zwplug->ref)) {
465 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
466 WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
467 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
468
469 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
470 }
471}
472
/*
 * Return true if @zwplug can be removed from the disk hash table: the plug
 * must still be hashed, not be plugged, be referenced only by the plug
 * itself and the caller, and manage a zone that is either empty or full.
 */
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still plugged, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

/*
 * Remove a zone write plug from the disk hash table and drop the initial
 * hash-table reference, marking the plug unhashed so that no new reference
 * can be taken through a lookup. Safe to call on an already removed plug.
 */
static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 * Returns NULL only if a new plug is needed and its allocation fails
 * (possible with GFP_NOWAIT for REQ_NOWAIT BIOs).
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

/*
 * Fail a plugged BIO with an I/O error, dropping the plug reference and the
 * request queue usage reference that were taken when the BIO was plugged.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}

601/*
602 * Abort (fail) all plugged BIOs of a zone write plug.
603 */
604static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
605{
606 struct bio *bio;
607
608 if (bio_list_empty(&zwplug->bio_list))
609 return;
610
611 pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
612 zwplug->disk->disk_name, zwplug->zone_no);
613 while ((bio = bio_list_pop(&zwplug->bio_list)))
614 blk_zone_wplug_bio_io_error(zwplug, bio);
615}
616
/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 * Must be called with the zone write plug lock held.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);
}

643static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
644{
645 switch (zone->cond) {
646 case BLK_ZONE_COND_IMP_OPEN:
647 case BLK_ZONE_COND_EXP_OPEN:
648 case BLK_ZONE_COND_CLOSED:
649 return zone->wp - zone->start;
650 case BLK_ZONE_COND_FULL:
651 return zone->len;
652 case BLK_ZONE_COND_EMPTY:
653 return 0;
654 case BLK_ZONE_COND_NOT_WP:
655 case BLK_ZONE_COND_OFFLINE:
656 case BLK_ZONE_COND_READONLY:
657 default:
658 /*
659 * Conventional, offline and read-only zones do not have a valid
660 * write pointer.
661 */
662 return UINT_MAX;
663 }
664}
665
/*
 * Resynchronize the write pointer offset of the write plug of the zone
 * described by @zone, if the plug was flagged as needing an update after a
 * write error. Called from the zone report path.
 */
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
					   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (!zwplug)
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      blk_zone_wp_offset(zone));
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

685static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
686{
687 struct disk_report_zones_cb_args args = {
688 .disk = disk,
689 };
690
691 return disk->fops->report_zones(disk, sector, 1,
692 disk_report_zones_cb, &args);
693}
694
/*
 * Handle a zone reset or zone finish BIO in the zone write plugging path.
 * @wp_offset is the write pointer offset the target zone will have after the
 * operation (0 for a reset, the zone size for a finish).
 * Return true if the BIO was consumed (failed), false if it must be executed.
 */
static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/* Conventional zones cannot be reset nor finished. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait reset or finish BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid issues
	 * the BIO execution potentially failing with BLK_STS_AGAIN, warn about
	 * REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}

736static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
737{
738 struct gendisk *disk = bio->bi_bdev->bd_disk;
739 struct blk_zone_wplug *zwplug;
740 unsigned long flags;
741 sector_t sector;
742
743 /*
744 * Set the write pointer offset of all zone write plugs to 0. This will
745 * abort all plugged BIOs. It is fine as resetting zones while writes
746 * are still in-flight will result in the writes failing anyway.
747 */
748 for (sector = 0; sector < get_capacity(disk);
749 sector += disk->queue->limits.chunk_sectors) {
750 zwplug = disk_get_zone_wplug(disk, sector);
751 if (zwplug) {
752 spin_lock_irqsave(&zwplug->lock, flags);
753 disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
754 spin_unlock_irqrestore(&zwplug->lock, flags);
755 disk_put_zone_wplug(zwplug);
756 }
757 }
758
759 return false;
760}
761
/*
 * Schedule the work that will submit the next plugged BIO of @zwplug.
 * Must be called with the plug in the PLUGGED state.
 */
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

/*
 * Add @bio to the BIO list of @zwplug, marking the plug as plugged. For
 * REQ_NOWAIT BIOs, the BIO work is scheduled if this is the first plugged BIO.
 * Must be called with the plug lock held and with a reference on the plug
 * held on behalf of the BIO.
 */
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio, unsigned int nr_segs)
{
	bool schedule_bio_work = false;

	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
	 * work, which can block. So clear the REQ_NOWAIT flag and schedule the
	 * work if this is the first BIO we are plugging.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		bio->bi_opf &= ~REQ_NOWAIT;
	}

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue write sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	if (schedule_bio_work)
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 * Advances the zone write pointer offset of the target zone by the merged
 * BIO size.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		/* Only contiguous, mergeable BIOs can extend the request. */
		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			/* Merge failed: put the BIO back and stop. */
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 * Return false if the BIO must be failed, true if it can be issued.
 * Must be called with the zone write plug lock held.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or
	 * finish the zone to recover a reliable write pointer position.
	 * Fail BIOs if the user did not do that as we cannot handle emulated
	 * zone append otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;
}

/*
 * Handle a write or zone append BIO in the zone write plugging path.
 * Return true if the BIO was consumed (plugged or failed), false if the
 * caller must execute it now.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	/* REQ_NOWAIT BIOs must not block on a plug allocation. */
	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged, add the BIO to the plug BIO list.
	 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the BIO execute.
	 * Otherwise, plug and let the BIO execute.
	 */
	if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
	    (bio->bi_opf & REQ_NOWAIT))
		goto plug;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

/*
 * Handle a zone append BIO for a device with native zone append support:
 * the BIO is not plugged, but any leftover zone write plug for the target
 * zone is removed (see comment below).
 */
static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there is no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_remove_zone_wplug(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	/* Drop the reference taken by disk_get_zone_wplug(). */
	disk_put_zone_wplug(zwplug);
}
1101
1102/**
1103 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1104 * @bio: The BIO being submitted
1105 * @nr_segs: The number of physical segments of @bio
1106 *
1107 * Handle write, write zeroes and zone append operations requiring emulation
1108 * using zone write plugging.
1109 *
1110 * Return true whenever @bio execution needs to be delayed through the zone
1111 * write plug. Otherwise, return false to let the submission path process
1112 * @bio normally.
1113 */
1114bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1115{
1116 struct block_device *bdev = bio->bi_bdev;
1117
1118 if (!bdev->bd_disk->zone_wplugs_hash)
1119 return false;
1120
1121 /*
1122 * If the BIO already has the plugging flag set, then it was already
1123 * handled through this path and this is a submission from the zone
1124 * plug bio submit work.
1125 */
1126 if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1127 return false;
1128
1129 /*
1130 * We do not need to do anything special for empty flush BIOs, e.g
1131 * BIOs such as issued by blkdev_issue_flush(). The is because it is
1132 * the responsibility of the user to first wait for the completion of
1133 * write operations for flush to have any effect on the persistence of
1134 * the written data.
1135 */
1136 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
1137 return false;
1138
1139 /*
1140 * Regular writes and write zeroes need to be handled through the target
1141 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1142 * which may need to go through the flush machinery depending on the
1143 * target device capabilities. Plugging such writes is fine as the flush
1144 * machinery operates at the request level, below the plug, and
1145 * completion of the flush sequence will go through the regular BIO
1146 * completion, which will handle zone write plugging.
1147 * Zone append operations for devices that requested emulation must
1148 * also be plugged so that these BIOs can be changed into regular
1149 * write BIOs.
1150 * Zone reset, reset all and finish commands need special treatment
1151 * to correctly track the write pointer offset of zones. These commands
1152 * are not plugged as we do not need serialization with write
1153 * operations. It is the responsibility of the user to not issue reset
1154 * and finish commands when write operations are in flight.
1155 */
1156 switch (bio_op(bio)) {
1157 case REQ_OP_ZONE_APPEND:
1158 if (!bdev_emulates_zone_append(bdev)) {
1159 blk_zone_wplug_handle_native_zone_append(bio);
1160 return false;
1161 }
1162 fallthrough;
1163 case REQ_OP_WRITE:
1164 case REQ_OP_WRITE_ZEROES:
1165 return blk_zone_wplug_handle_write(bio, nr_segs);
1166 case REQ_OP_ZONE_RESET:
1167 return blk_zone_wplug_handle_reset_or_finish(bio, 0);
1168 case REQ_OP_ZONE_FINISH:
1169 return blk_zone_wplug_handle_reset_or_finish(bio,
1170 bdev_zone_sectors(bdev));
1171 case REQ_OP_ZONE_RESET_ALL:
1172 return blk_zone_wplug_handle_reset_all(bio);
1173 default:
1174 return false;
1175 }
1176
1177 return false;
1178}
1179EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1180
/*
 * Called when a plugged BIO completes: either schedule the submission of the
 * next plugged BIO, or, if the plug BIO list is empty, clear the plugged
 * state and possibly remove the plug from the disk hash table.
 */
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished), or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1206
/*
 * BIO completion handler for zone write plugged BIOs: restore emulated zone
 * append operation codes, abort all plugged BIOs on failure and, for
 * BIO-based devices, trigger submission of the next plugged BIO.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1254
/*
 * blk-mq request completion hook for zone write plugged requests: clear the
 * request plugging state and trigger submission of the next plugged BIO.
 */
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1277
/*
 * Zone write plug BIO submission work function: pop and submit the next
 * plugged BIO of the zone, or clear the plugged state if the list is empty.
 */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

again:
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Fail this BIO and try the next plugged one. */
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}
1323
1324static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
1325{
1326 return 1U << disk->zone_wplugs_hash_bits;
1327}
1328
/*
 * Early initialization of the disk zone resources: only the spinlock needs
 * to be set up here; the hash table, mempool and workqueue are allocated
 * later, when zones are revalidated (see disk_alloc_zone_resources()).
 */
void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
}
1333
1334/*
1335 * For the size of a disk zone write plug hash table, use the size of the
1336 * zone write plug mempool, which is the maximum of the disk open zones and
1337 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
1338 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1339 */
1340#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
1341#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
1342
/*
 * Allocate the zone write plug resources of @disk: the plug hash table
 * (sized from @pool_size, capped at BLK_ZONE_WPLUG_MAX_HASH_BITS), the zone
 * write plug mempool and the plug BIO submission workqueue.
 * Return 0 on success and -ENOMEM on allocation failure.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}
1383
/*
 * Remove and free all zone write plugs still present in the disk hash table
 * and free the table itself.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			/* Hold a reference across the unhash and final put. */
			refcount_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}
1408
/*
 * Install @bitmap as the disk conventional zones bitmap and free the old
 * bitmap (RCU-deferred). Passing a NULL @bitmap clears the bitmap.
 * Return the number of conventional zones in the new bitmap.
 */
static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
					       unsigned long *bitmap)
{
	unsigned int nr_conv_zones = 0;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (bitmap)
		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
	/* After this, @bitmap points to the old bitmap being replaced. */
	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
				     lockdep_is_held(&disk->zone_wplugs_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	kfree_rcu_mightsleep(bitmap);

	return nr_conv_zones;
}
1426
/*
 * Free all zone write plugging resources of @disk: workqueue, hash table,
 * mempool and the conventional zones bitmap.
 */
void disk_free_zone_resources(struct gendisk *disk)
{
	if (!disk->zone_wplugs_pool)
		return;

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	disk_set_conv_zones_bitmap(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}
1453
1454static inline bool disk_need_zone_resources(struct gendisk *disk)
1455{
1456 /*
1457 * All mq zoned devices need zone resources so that the block layer
1458 * can automatically handle write BIO plugging. BIO-based device drivers
1459 * (e.g. DM devices) are normally responsible for handling zone write
1460 * ordering and do not need zone resources, unless the driver requires
1461 * zone append emulation.
1462 */
1463 return queue_is_mq(disk->queue) ||
1464 queue_emulates_zone_append(disk->queue);
1465}
1466
1467static int disk_revalidate_zone_resources(struct gendisk *disk,
1468 unsigned int nr_zones)
1469{
1470 struct queue_limits *lim = &disk->queue->limits;
1471 unsigned int pool_size;
1472
1473 if (!disk_need_zone_resources(disk))
1474 return 0;
1475
1476 /*
1477 * If the device has no limit on the maximum number of open and active
1478 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
1479 */
1480 pool_size = max(lim->max_open_zones, lim->max_active_zones);
1481 if (!pool_size)
1482 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
1483
1484 if (!disk->zone_wplugs_hash)
1485 return disk_alloc_zone_resources(disk, pool_size);
1486
1487 return 0;
1488}
1489
/*
 * Context accumulated by blk_revalidate_zone_cb() while iterating over a
 * full zone report during disk zone revalidation.
 */
struct blk_revalidate_zone_args {
	struct gendisk *disk;		/* Disk being revalidated */
	unsigned long *conv_zones_bitmap; /* New conventional zones bitmap */
	unsigned int nr_zones;		/* Expected total number of zones */
	unsigned int zone_capacity;	/* Capacity of sequential zones */
	unsigned int last_zone_capacity; /* Capacity of the last zone */
	sector_t sector;		/* Expected start of the next zone */
};
1498
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	nr_conv_zones =
		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
	/* A zoned device must have at least one sequential zone. */
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	lim = queue_limits_start_update(q);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such case, assume that the zoned device has
	 * no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	return queue_limits_commit_update_frozen(q, &lim);
}
1562
/*
 * Check a conventional zone reported during revalidation and record it in
 * the conventional zones bitmap (allocated on first use).
 */
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	/* Conventional zones must have a capacity equal to their size. */
	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	if (!disk_need_zone_resources(disk))
		return 0;

	if (!args->conv_zones_bitmap) {
		args->conv_zones_bitmap =
			bitmap_zalloc(args->nr_zones, GFP_NOIO);
		if (!args->conv_zones_bitmap)
			return -ENOMEM;
	}

	set_bit(idx, args->conv_zones_bitmap);

	return 0;
}
1591
/*
 * Check a sequential write required zone reported during revalidation and,
 * if the device needs zone append emulation, allocate a zone write plug for
 * any partially written zone so its write pointer is tracked.
 */
static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * If the device needs zone append emulation, we need to track the
	 * write pointer of all zones that are not empty nor full. So make sure
	 * we have a zone write plug for such zone if the device has a zone
	 * write plug hash table.
	 */
	if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
		return 0;

	disk_zone_wplug_sync_wp_offset(disk, zone);

	/* Empty and full zones do not need a plug to track their wp. */
	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
1638
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception of an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	/* Remember where the next zone is expected to start. */
	if (!ret)
		args->sector += zone->len;

	return ret;
}
1705
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk: Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	/* On success, ret is the number of zones reported. */
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret) {
		blk_mq_freeze_queue(q);
		disk_free_zone_resources(disk);
		blk_mq_unfreeze_queue(q);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
1792
/**
 * blk_zone_issue_zeroout - zero-fill a block range in a zone
 * @bdev: blockdev to write
 * @sector: start sector
 * @nr_sects: number of sectors to write
 * @gfp_mask: memory allocation flags (for bio_alloc)
 *
 * Description:
 *  Zero-fill a block range in a zone (@sector must be equal to the zone write
 *  pointer), handling potential errors due to the (initially unknown) lack of
 *  hardware offload (See blkdev_issue_zeroout()).
 */
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects, gfp_t gfp_mask)
{
	int ret;

	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
		return -EIO;

	/* First try using the write zeroes hardware offload. */
	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
				   BLKDEV_ZERO_NOFALLBACK);
	if (ret != -EOPNOTSUPP)
		return ret;

	/*
	 * The failed call to blkdev_issue_zeroout() advanced the zone write
	 * pointer. Undo this using a report zone to update the zone write
	 * pointer to the correct current value.
	 */
	ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
	if (ret != 1)
		return ret < 0 ? ret : -EIO;

	/*
	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
	 * regular write with zero-pages.
	 */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
}
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
1834
1835#ifdef CONFIG_BLK_DEBUG_FS
1836
/*
 * debugfs attribute: dump the state of all existing zone write plugs of the
 * disk, one line per plug showing the zone number, flags, reference count,
 * write pointer offset and number of plugged BIOs.
 */
int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			/* Snapshot the plug state under its lock. */
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = refcount_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}
1871
1872#endif