// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

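/*
 * Take the DIO-specific lock and then the regular extent lock for the range
 * [lockstart, lockend], waiting for (or, with IOMAP_NOWAIT, bailing out on)
 * any ordered extents or dirty pages that overlap it.  Returns 0 with both
 * locks held, -EAGAIN on NOWAIT contention, or -ENOTBLK when the caller
 * should fall back to buffered IO.
 */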
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we cannot wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point we have already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range has started (we unlock the
			 * ranges in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock the range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (hence it cannot
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	if (ret)
		unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}

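/*
 * Create the extent map (except for NOCOW writes, which reuse the existing
 * one) and the ordered extent for a direct IO write covering @file_extent at
 * file offset @start.  The ordered extent is stashed in @dio_data so that the
 * submission path can split or finish it later.
 */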
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1 << type) |
					     (1 << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
					start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:

	return em;
}

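/*
 * Allocate a new data extent of @len bytes for a COW direct IO write at file
 * offset @start and set up the matching extent map and ordered extent.  On
 * zoned filesystems a reservation failure with -EAGAIN waits for a zone
 * finish and retries.
 */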
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret == -EAGAIN) {
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}

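/*
 * Prepare the mapping for a direct IO write: either NOCOW into an existing
 * (regular or prealloc) extent when allowed, or reserve space and allocate a
 * new extent for a COW write.  Updates *map and *lenp to describe the range
 * actually covered and may extend i_size while the extent range is still
 * locked.
 */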
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases:
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(inode, start, &len,
				     &file_extent, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

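/*
 * iomap_begin callback for btrfs direct IO.  Flushes compressed writeback if
 * needed, reserves data space for writes, locks the extent range, looks up
 * the extent map and translates it into @iomap.  Inline and compressed
 * extents make us return -ENOTBLK (or -EAGAIN for NOWAIT) so the caller falls
 * back to buffered IO.
 */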
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range.  However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io.  INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the
	 * caller asked for, resulting in an unexpected, and incorrect, short
	 * read. That is, the caller asked to read N bytes and we return less
	 * than that, which is wrong unless we are crossing EOF. This happens if
	 * we get a page fault error when trying to fault in pages for the
	 * buffer that is associated to the struct iov_iter passed to
	 * iomap_dio_rw(), and we have previously submitted bios for other
	 * extents in the range, in which case iomap_dio_rw() may return us
	 * EIOCBQUEUED if not all of those bios have completed by the time we
	 * get the page fault error, which we return back to our caller - we
	 * should only return EIOCBQUEUED after we have submitted bios for all
	 * the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part.  We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
				  lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

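/*
 * iomap_end callback: clean up after a direct IO iteration.  If less was
 * submitted than requested, finish the ordered extent (writes) or unlock the
 * remaining range (reads) and return -ENOTBLK so the rest is done buffered.
 */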
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
				  pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					  pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

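/*
 * Per-bio completion handler for direct IO.  Writes finish their ordered
 * extent (passing the bio status as the uptodate flag), reads unlock the
 * EXTENT_DIO_LOCKED range taken in btrfs_dio_iomap_begin(), and the bio is
 * then handed back to iomap.
 */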
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_dio_extent(&inode->io_tree, dip->file_offset,
				  dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

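/*
 * Make the ordered extent match the bio that is about to be submitted.  If
 * the bio only covers part of the ordered extent, split the extent map
 * (unless this is a NOCOW write) and the ordered extent so that @bbio owns an
 * ordered extent of exactly the bio's size.
 */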
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = split_extent_map(bbio->inode, bbio->file_offset,
				       ordered->num_bytes, len,
				       ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

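/*
 * submit_io hook for iomap: turn the iomap-built bio into a btrfs_bio, record
 * the range in the btrfs_dio_private, split the ordered extent for partial
 * writes and submit the bio through the btrfs bio layer.
 */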
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write.  If we are, we need to split
	 * the ordered extent to match the submitted bio.  Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin            = btrfs_dio_iomap_begin,
	.iomap_end              = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io		= btrfs_dio_submit_io,
	.bio_set		= &btrfs_dio_bioset,
};

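/* Issue a direct IO read through iomap, allowing partial completions. */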
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

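/*
 * Start a direct IO write through iomap but return the struct iomap_dio
 * without completing it, so that btrfs_direct_write() can complete it while
 * still holding the inode lock.
 */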
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

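/*
 * Direct IO needs the file offset and the iov memory to be aligned to the
 * filesystem sector size, otherwise the caller falls back to buffered IO.
 */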
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

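/*
 * Direct IO write entry point.  Takes the inode lock (shared when the write
 * stays within EOF and no security bits need clearing), performs the DIO with
 * page faults disabled on the iov_iter, retries after faulting in pages on
 * -EFAULT, and falls back to buffered IO (flushing and invalidating the page
 * cache afterwards) when the direct path cannot make progress.
 */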
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

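/*
 * Besides the alignment checks in check_direct_IO(), reject iovecs in which
 * two segments point to the same base address, as such buffers can defeat the
 * fault-in retry logic in btrfs_direct_read().
 */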
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

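/*
 * Direct IO read entry point.  Takes the inode lock shared, disables page
 * faults around the iomap call (see the comment below) and retries after
 * faulting in the destination pages when the read stops early with -EFAULT.
 * fsverity files and misaligned requests return 0 so the caller falls back to
 * buffered reading.
 */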
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true on our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

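/* Set up the bio_set that backs btrfs_dio_private + bio allocations. */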
int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}