xfs_aops.c - fs/xfs/xfs_aops.c - Linux diff v6.9.4 - Bootlin Elixir Cross Referencer

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  4 * Copyright (c) 2016-2018 Christoph Hellwig.
  5 * All Rights Reserved.
  6 */
  7#include "xfs.h"
  8#include "xfs_shared.h"
  9#include "xfs_format.h"
 10#include "xfs_log_format.h"
 11#include "xfs_trans_resv.h"
 12#include "xfs_mount.h"
 13#include "xfs_inode.h"
 14#include "xfs_trans.h"
 15#include "xfs_iomap.h"
 16#include "xfs_trace.h"
 17#include "xfs_bmap.h"
 18#include "xfs_bmap_util.h"
 19#include "xfs_reflink.h"
 20#include "xfs_errortag.h"
 21#include "xfs_error.h"
 22
 23struct xfs_writepage_ctx {
 24	struct iomap_writepage_ctx ctx;
 25	unsigned int		data_seq;
 26	unsigned int		cow_seq;
 27};
 28
 29static inline struct xfs_writepage_ctx *
 30XFS_WPC(struct iomap_writepage_ctx *ctx)
 31{
 32	return container_of(ctx, struct xfs_writepage_ctx, ctx);
 33}
 34
 35/*
 36 * Fast and loose check if this write could update the on-disk inode size.
 37 */
 38static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
 39{
 40	return ioend->io_offset + ioend->io_size >
 41		XFS_I(ioend->io_inode)->i_disk_size;
 42}
 43
 44/*
 45 * Update on-disk file size now that data has been written to disk.
 46 */
 47int
 48xfs_setfilesize(
 49	struct xfs_inode	*ip,
 50	xfs_off_t		offset,
 51	size_t			size)
 52{
 53	struct xfs_mount	*mp = ip->i_mount;
 54	struct xfs_trans	*tp;
 55	xfs_fsize_t		isize;
 56	int			error;
 57
 58	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 59	if (error)
 60		return error;
 61
 62	xfs_ilock(ip, XFS_ILOCK_EXCL);
 63	isize = xfs_new_eof(ip, offset + size);
 64	if (!isize) {
 65		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 66		xfs_trans_cancel(tp);
 67		return 0;
 68	}
 69
 70	trace_xfs_setfilesize(ip, offset, size);
 71
 72	ip->i_disk_size = isize;
 73	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 74	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 75
 76	return xfs_trans_commit(tp);
 77}
 78
 79/*
 80 * IO write completion.
 81 */
 82STATIC void
 83xfs_end_ioend(
 84	struct iomap_ioend	*ioend)
 85{
 86	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 87	struct xfs_mount	*mp = ip->i_mount;
 88	xfs_off_t		offset = ioend->io_offset;
 89	size_t			size = ioend->io_size;
 90	unsigned int		nofs_flag;
 91	int			error;
 92
 93	/*
 94	 * We can allocate memory here while doing writeback on behalf of
 95	 * memory reclaim.  To avoid memory allocation deadlocks set the
 96	 * task-wide nofs context for the following operations.
 97	 */
 98	nofs_flag = memalloc_nofs_save();
 99
100	/*
101	 * Just clean up the in-memory structures if the fs has been shut down.
102	 */
103	if (xfs_is_shutdown(mp)) {
104		error = -EIO;
105		goto done;
106	}
107
108	/*
109	 * Clean up all COW blocks and underlying data fork delalloc blocks on
110	 * I/O error. The delalloc punch is required because this ioend was
111	 * mapped to blocks in the COW fork and the associated pages are no
112	 * longer dirty. If we don't remove delalloc blocks here, they become
113	 * stale and can corrupt free space accounting on unmount.
114	 */
115	error = blk_status_to_errno(ioend->io_bio.bi_status);
116	if (unlikely(error)) {
117		if (ioend->io_flags & IOMAP_F_SHARED) {
118			xfs_reflink_cancel_cow_range(ip, offset, size, true);
119			xfs_bmap_punch_delalloc_range(ip, offset,
120					offset + size);
121		}
122		goto done;
123	}
124
125	/*
126	 * Success: commit the COW or unwritten blocks if needed.
127	 */
128	if (ioend->io_flags & IOMAP_F_SHARED)
129		error = xfs_reflink_end_cow(ip, offset, size);
130	else if (ioend->io_type == IOMAP_UNWRITTEN)
131		error = xfs_iomap_write_unwritten(ip, offset, size, false);
132
133	if (!error && xfs_ioend_is_append(ioend))
134		error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
135done:
136	iomap_finish_ioends(ioend, error);
137	memalloc_nofs_restore(nofs_flag);
138}
139
140/*
141 * Finish all pending IO completions that require transactional modifications.
142 *
143 * We try to merge physical and logically contiguous ioends before completion to
144 * minimise the number of transactions we need to perform during IO completion.
145 * Both unwritten extent conversion and COW remapping need to iterate and modify
146 * one physical extent at a time, so we gain nothing by merging physically
147 * discontiguous extents here.
148 *
149 * The ioend chain length that we can be processing here is largely unbound in
150 * length and we may have to perform significant amounts of work on each ioend
151 * to complete it. Hence we have to be careful about holding the CPU for too
152 * long in this loop.
153 */
154void
155xfs_end_io(
156	struct work_struct	*work)
157{
158	struct xfs_inode	*ip =
159		container_of(work, struct xfs_inode, i_ioend_work);
160	struct iomap_ioend	*ioend;
161	struct list_head	tmp;
162	unsigned long		flags;
163
164	spin_lock_irqsave(&ip->i_ioend_lock, flags);
165	list_replace_init(&ip->i_ioend_list, &tmp);
166	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
167
168	iomap_sort_ioends(&tmp);
169	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
170			io_list))) {
171		list_del_init(&ioend->io_list);
172		iomap_ioend_try_merge(ioend, &tmp);
173		xfs_end_ioend(ioend);
174		cond_resched();
175	}
176}
177
178STATIC void
179xfs_end_bio(
180	struct bio		*bio)
181{
182	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
183	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
184	unsigned long		flags;
185
186	spin_lock_irqsave(&ip->i_ioend_lock, flags);
187	if (list_empty(&ip->i_ioend_list))
188		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
189					 &ip->i_ioend_work));
190	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
191	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
192}
193
194/*
195 * Fast revalidation of the cached writeback mapping. Return true if the current
196 * mapping is valid, false otherwise.
197 */
198static bool
199xfs_imap_valid(
200	struct iomap_writepage_ctx	*wpc,
201	struct xfs_inode		*ip,
202	loff_t				offset)
203{
204	if (offset < wpc->iomap.offset ||
205	    offset >= wpc->iomap.offset + wpc->iomap.length)
206		return false;
207	/*
208	 * If this is a COW mapping, it is sufficient to check that the mapping
209	 * covers the offset. Be careful to check this first because the caller
210	 * can revalidate a COW mapping without updating the data seqno.
211	 */
212	if (wpc->iomap.flags & IOMAP_F_SHARED)
213		return true;
214
215	/*
216	 * This is not a COW mapping. Check the sequence number of the data fork
217	 * because concurrent changes could have invalidated the extent. Check
218	 * the COW fork because concurrent changes since the last time we
219	 * checked (and found nothing at this offset) could have added
220	 * overlapping blocks.
221	 */
222	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
223		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
224				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
225		return false;
226	}
227	if (xfs_inode_has_cow_data(ip) &&
228	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
229		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
230				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
231		return false;
232	}
233	return true;
234}
235
236/*
237 * Pass in a dellalloc extent and convert it to real extents, return the real
238 * extent that maps offset_fsb in wpc->iomap.
239 *
240 * The current page is held locked so nothing could have removed the block
241 * backing offset_fsb, although it could have moved from the COW to the data
242 * fork by another thread.
243 */
244static int
245xfs_convert_blocks(
246	struct iomap_writepage_ctx *wpc,
247	struct xfs_inode	*ip,
248	int			whichfork,
249	loff_t			offset)
250{
251	int			error;
252	unsigned		*seq;
253
254	if (whichfork == XFS_COW_FORK)
255		seq = &XFS_WPC(wpc)->cow_seq;
256	else
257		seq = &XFS_WPC(wpc)->data_seq;
258
259	/*
260	 * Attempt to allocate whatever delalloc extent currently backs offset
261	 * and put the result into wpc->iomap.  Allocate in a loop because it
262	 * may take several attempts to allocate real blocks for a contiguous
263	 * delalloc extent if free space is sufficiently fragmented.
264	 */
265	do {
266		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
267				&wpc->iomap, seq);
268		if (error)
269			return error;
270	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
271
272	return 0;
273}
274
275static int
276xfs_map_blocks(
277	struct iomap_writepage_ctx *wpc,
278	struct inode		*inode,
279	loff_t			offset,
280	unsigned int		len)
281{
282	struct xfs_inode	*ip = XFS_I(inode);
283	struct xfs_mount	*mp = ip->i_mount;
284	ssize_t			count = i_blocksize(inode);
285	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
286	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
287	xfs_fileoff_t		cow_fsb;
288	int			whichfork;
289	struct xfs_bmbt_irec	imap;
290	struct xfs_iext_cursor	icur;
291	int			retries = 0;
292	int			error = 0;
293
294	if (xfs_is_shutdown(mp))
295		return -EIO;
296
297	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
298
299	/*
300	 * COW fork blocks can overlap data fork blocks even if the blocks
301	 * aren't shared.  COW I/O always takes precedent, so we must always
302	 * check for overlap on reflink inodes unless the mapping is already a
303	 * COW one, or the COW fork hasn't changed from the last time we looked
304	 * at it.
305	 *
306	 * It's safe to check the COW fork if_seq here without the ILOCK because
307	 * we've indirectly protected against concurrent updates: writeback has
308	 * the page locked, which prevents concurrent invalidations by reflink
309	 * and directio and prevents concurrent buffered writes to the same
310	 * page.  Changes to if_seq always happen under i_lock, which protects
311	 * against concurrent updates and provides a memory barrier on the way
312	 * out that ensures that we always see the current value.
313	 */
314	if (xfs_imap_valid(wpc, ip, offset))
315		return 0;
316
317	/*
318	 * If we don't have a valid map, now it's time to get a new one for this
319	 * offset.  This will convert delayed allocations (including COW ones)
320	 * into real extents.  If we return without a valid map, it means we
321	 * landed in a hole and we skip the block.
322	 */
323retry:
324	cow_fsb = NULLFILEOFF;
325	whichfork = XFS_DATA_FORK;
326	xfs_ilock(ip, XFS_ILOCK_SHARED);
327	ASSERT(!xfs_need_iread_extents(&ip->i_df));
328
329	/*
330	 * Check if this is offset is covered by a COW extents, and if yes use
331	 * it directly instead of looking up anything in the data fork.
332	 */
333	if (xfs_inode_has_cow_data(ip) &&
334	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
335		cow_fsb = imap.br_startoff;
336	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
337		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
338		xfs_iunlock(ip, XFS_ILOCK_SHARED);
339
340		whichfork = XFS_COW_FORK;
341		goto allocate_blocks;
342	}
343
344	/*
345	 * No COW extent overlap. Revalidate now that we may have updated
346	 * ->cow_seq. If the data mapping is still valid, we're done.
347	 */
348	if (xfs_imap_valid(wpc, ip, offset)) {
349		xfs_iunlock(ip, XFS_ILOCK_SHARED);
350		return 0;
351	}
352
353	/*
354	 * If we don't have a valid map, now it's time to get a new one for this
355	 * offset.  This will convert delayed allocations (including COW ones)
356	 * into real extents.
357	 */
358	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
359		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
360	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
361	xfs_iunlock(ip, XFS_ILOCK_SHARED);
362
363	/* landed in a hole or beyond EOF? */
364	if (imap.br_startoff > offset_fsb) {
365		imap.br_blockcount = imap.br_startoff - offset_fsb;
366		imap.br_startoff = offset_fsb;
367		imap.br_startblock = HOLESTARTBLOCK;
368		imap.br_state = XFS_EXT_NORM;
369	}
370
371	/*
372	 * Truncate to the next COW extent if there is one.  This is the only
373	 * opportunity to do this because we can skip COW fork lookups for the
374	 * subsequent blocks in the mapping; however, the requirement to treat
375	 * the COW range separately remains.
376	 */
377	if (cow_fsb != NULLFILEOFF &&
378	    cow_fsb < imap.br_startoff + imap.br_blockcount)
379		imap.br_blockcount = cow_fsb - imap.br_startoff;
380
381	/* got a delalloc extent? */
382	if (imap.br_startblock != HOLESTARTBLOCK &&
383	    isnullstartblock(imap.br_startblock))
384		goto allocate_blocks;
385
386	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
387	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
388	return 0;
389allocate_blocks:
390	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
391	if (error) {
392		/*
393		 * If we failed to find the extent in the COW fork we might have
394		 * raced with a COW to data fork conversion or truncate.
395		 * Restart the lookup to catch the extent in the data fork for
396		 * the former case, but prevent additional retries to avoid
397		 * looping forever for the latter case.
398		 */
399		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
400			goto retry;
401		ASSERT(error != -EAGAIN);
402		return error;
403	}
404
405	/*
406	 * Due to merging the return real extent might be larger than the
407	 * original delalloc one.  Trim the return extent to the next COW
408	 * boundary again to force a re-lookup.
409	 */
410	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
411		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
412
413		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
414			wpc->iomap.length = cow_offset - wpc->iomap.offset;
415	}
416
417	ASSERT(wpc->iomap.offset <= offset);
418	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
419	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
420	return 0;
421}
422
423static int
424xfs_prepare_ioend(
425	struct iomap_ioend	*ioend,
426	int			status)
427{
428	unsigned int		nofs_flag;
429
430	/*
431	 * We can allocate memory here while doing writeback on behalf of
432	 * memory reclaim.  To avoid memory allocation deadlocks set the
433	 * task-wide nofs context for the following operations.
434	 */
435	nofs_flag = memalloc_nofs_save();
436
437	/* Convert CoW extents to regular */
438	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
439		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
440				ioend->io_offset, ioend->io_size);
441	}
442
443	memalloc_nofs_restore(nofs_flag);
444
445	/* send ioends that might require a transaction to the completion wq */
446	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
447	    (ioend->io_flags & IOMAP_F_SHARED))
448		ioend->io_bio.bi_end_io = xfs_end_bio;
449	return status;
450}
451
452/*
453 * If the folio has delalloc blocks on it, the caller is asking us to punch them
454 * out. If we don't, we can leave a stale delalloc mapping covered by a clean
455 * page that needs to be dirtied again before the delalloc mapping can be
456 * converted. This stale delalloc mapping can trip up a later direct I/O read
457 * operation on the same region.
458 *
459 * We prevent this by truncating away the delalloc regions on the folio. Because
460 * they are delalloc, we can do this without needing a transaction. Indeed - if
461 * we get ENOSPC errors, we have to be able to do this truncation without a
462 * transaction as there is no space left for block reservation (typically why
463 * we see a ENOSPC in writeback).
464 */
465static void
466xfs_discard_folio(
467	struct folio		*folio,
468	loff_t			pos)
469{
470	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
 
471	struct xfs_mount	*mp = ip->i_mount;
 
 
 
472	int			error;
473
474	if (xfs_is_shutdown(mp))
475		return;
476
477	xfs_alert_ratelimited(mp,
478		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
479			folio, ip->i_ino, pos);
480
481	/*
482	 * The end of the punch range is always the offset of the first
483	 * byte of the next folio. Hence the end offset is only dependent on the
484	 * folio itself and not the start offset that is passed in.
485	 */
486	error = xfs_bmap_punch_delalloc_range(ip, pos,
487				folio_pos(folio) + folio_size(folio));
488
489	if (error && !xfs_is_shutdown(mp))
490		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 
 
491}
492
493static const struct iomap_writeback_ops xfs_writeback_ops = {
494	.map_blocks		= xfs_map_blocks,
495	.prepare_ioend		= xfs_prepare_ioend,
496	.discard_folio		= xfs_discard_folio,
497};
498
499STATIC int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500xfs_vm_writepages(
501	struct address_space	*mapping,
502	struct writeback_control *wbc)
503{
504	struct xfs_writepage_ctx wpc = { };
505
 
 
 
 
 
 
 
506	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
507	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
508}
509
510STATIC int
511xfs_dax_writepages(
512	struct address_space	*mapping,
513	struct writeback_control *wbc)
514{
515	struct xfs_inode	*ip = XFS_I(mapping->host);
516
517	xfs_iflags_clear(ip, XFS_ITRUNCATED);
518	return dax_writeback_mapping_range(mapping,
519			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
520}
521
522STATIC sector_t
523xfs_vm_bmap(
524	struct address_space	*mapping,
525	sector_t		block)
526{
527	struct xfs_inode	*ip = XFS_I(mapping->host);
528
529	trace_xfs_vm_bmap(ip);
530
531	/*
532	 * The swap code (ab-)uses ->bmap to get a block mapping and then
533	 * bypasses the file system for actual I/O.  We really can't allow
534	 * that on reflinks inodes, so we have to skip out here.  And yes,
535	 * 0 is the magic code for a bmap error.
536	 *
537	 * Since we don't pass back blockdev info, we can't return bmap
538	 * information for rt files either.
539	 */
540	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
541		return 0;
542	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
543}
544
545STATIC int
546xfs_vm_read_folio(
547	struct file		*unused,
548	struct folio		*folio)
549{
550	return iomap_read_folio(folio, &xfs_read_iomap_ops);
551}
552
553STATIC void
554xfs_vm_readahead(
555	struct readahead_control	*rac)
556{
557	iomap_readahead(rac, &xfs_read_iomap_ops);
558}
559
560static int
561xfs_iomap_swapfile_activate(
562	struct swap_info_struct		*sis,
563	struct file			*swap_file,
564	sector_t			*span)
565{
566	sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
567	return iomap_swapfile_activate(sis, swap_file, span,
568			&xfs_read_iomap_ops);
569}
570
571const struct address_space_operations xfs_address_space_operations = {
572	.read_folio		= xfs_vm_read_folio,
573	.readahead		= xfs_vm_readahead,
 
574	.writepages		= xfs_vm_writepages,
575	.dirty_folio		= iomap_dirty_folio,
576	.release_folio		= iomap_release_folio,
577	.invalidate_folio	= iomap_invalidate_folio,
578	.bmap			= xfs_vm_bmap,
579	.migrate_folio		= filemap_migrate_folio,
 
580	.is_partially_uptodate  = iomap_is_partially_uptodate,
581	.error_remove_folio	= generic_error_remove_folio,
582	.swap_activate		= xfs_iomap_swapfile_activate,
583};
584
585const struct address_space_operations xfs_dax_aops = {
586	.writepages		= xfs_dax_writepages,
587	.dirty_folio		= noop_dirty_folio,
 
 
588	.swap_activate		= xfs_iomap_swapfile_activate,
589};

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  4 * Copyright (c) 2016-2018 Christoph Hellwig.
  5 * All Rights Reserved.
  6 */
  7#include "xfs.h"
  8#include "xfs_shared.h"
  9#include "xfs_format.h"
 10#include "xfs_log_format.h"
 11#include "xfs_trans_resv.h"
 12#include "xfs_mount.h"
 13#include "xfs_inode.h"
 14#include "xfs_trans.h"
 15#include "xfs_iomap.h"
 16#include "xfs_trace.h"
 17#include "xfs_bmap.h"
 18#include "xfs_bmap_util.h"
 19#include "xfs_reflink.h"
 
 
 20
 21struct xfs_writepage_ctx {
 22	struct iomap_writepage_ctx ctx;
 23	unsigned int		data_seq;
 24	unsigned int		cow_seq;
 25};
 26
 27static inline struct xfs_writepage_ctx *
 28XFS_WPC(struct iomap_writepage_ctx *ctx)
 29{
 30	return container_of(ctx, struct xfs_writepage_ctx, ctx);
 31}
 32
 33/*
 34 * Fast and loose check if this write could update the on-disk inode size.
 35 */
 36static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
 37{
 38	return ioend->io_offset + ioend->io_size >
 39		XFS_I(ioend->io_inode)->i_disk_size;
 40}
 41
 42/*
 43 * Update on-disk file size now that data has been written to disk.
 44 */
 45int
 46xfs_setfilesize(
 47	struct xfs_inode	*ip,
 48	xfs_off_t		offset,
 49	size_t			size)
 50{
 51	struct xfs_mount	*mp = ip->i_mount;
 52	struct xfs_trans	*tp;
 53	xfs_fsize_t		isize;
 54	int			error;
 55
 56	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 57	if (error)
 58		return error;
 59
 60	xfs_ilock(ip, XFS_ILOCK_EXCL);
 61	isize = xfs_new_eof(ip, offset + size);
 62	if (!isize) {
 63		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 64		xfs_trans_cancel(tp);
 65		return 0;
 66	}
 67
 68	trace_xfs_setfilesize(ip, offset, size);
 69
 70	ip->i_disk_size = isize;
 71	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 72	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 73
 74	return xfs_trans_commit(tp);
 75}
 76
 77/*
 78 * IO write completion.
 79 */
 80STATIC void
 81xfs_end_ioend(
 82	struct iomap_ioend	*ioend)
 83{
 84	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 
 85	xfs_off_t		offset = ioend->io_offset;
 86	size_t			size = ioend->io_size;
 87	unsigned int		nofs_flag;
 88	int			error;
 89
 90	/*
 91	 * We can allocate memory here while doing writeback on behalf of
 92	 * memory reclaim.  To avoid memory allocation deadlocks set the
 93	 * task-wide nofs context for the following operations.
 94	 */
 95	nofs_flag = memalloc_nofs_save();
 96
 97	/*
 98	 * Just clean up the in-memory structures if the fs has been shut down.
 99	 */
100	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
101		error = -EIO;
102		goto done;
103	}
104
105	/*
106	 * Clean up any COW blocks on an I/O error.
 
 
 
 
107	 */
108	error = blk_status_to_errno(ioend->io_bio->bi_status);
109	if (unlikely(error)) {
110		if (ioend->io_flags & IOMAP_F_SHARED)
111			xfs_reflink_cancel_cow_range(ip, offset, size, true);
 
 
 
112		goto done;
113	}
114
115	/*
116	 * Success: commit the COW or unwritten blocks if needed.
117	 */
118	if (ioend->io_flags & IOMAP_F_SHARED)
119		error = xfs_reflink_end_cow(ip, offset, size);
120	else if (ioend->io_type == IOMAP_UNWRITTEN)
121		error = xfs_iomap_write_unwritten(ip, offset, size, false);
122
123	if (!error && xfs_ioend_is_append(ioend))
124		error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
125done:
126	iomap_finish_ioends(ioend, error);
127	memalloc_nofs_restore(nofs_flag);
128}
129
130/* Finish all pending io completions. */
 
 
 
 
 
 
 
 
 
 
 
 
 
131void
132xfs_end_io(
133	struct work_struct	*work)
134{
135	struct xfs_inode	*ip =
136		container_of(work, struct xfs_inode, i_ioend_work);
137	struct iomap_ioend	*ioend;
138	struct list_head	tmp;
139	unsigned long		flags;
140
141	spin_lock_irqsave(&ip->i_ioend_lock, flags);
142	list_replace_init(&ip->i_ioend_list, &tmp);
143	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
144
145	iomap_sort_ioends(&tmp);
146	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
147			io_list))) {
148		list_del_init(&ioend->io_list);
149		iomap_ioend_try_merge(ioend, &tmp);
150		xfs_end_ioend(ioend);
 
151	}
152}
153
154STATIC void
155xfs_end_bio(
156	struct bio		*bio)
157{
158	struct iomap_ioend	*ioend = bio->bi_private;
159	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
160	unsigned long		flags;
161
162	spin_lock_irqsave(&ip->i_ioend_lock, flags);
163	if (list_empty(&ip->i_ioend_list))
164		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
165					 &ip->i_ioend_work));
166	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
167	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
168}
169
170/*
171 * Fast revalidation of the cached writeback mapping. Return true if the current
172 * mapping is valid, false otherwise.
173 */
174static bool
175xfs_imap_valid(
176	struct iomap_writepage_ctx	*wpc,
177	struct xfs_inode		*ip,
178	loff_t				offset)
179{
180	if (offset < wpc->iomap.offset ||
181	    offset >= wpc->iomap.offset + wpc->iomap.length)
182		return false;
183	/*
184	 * If this is a COW mapping, it is sufficient to check that the mapping
185	 * covers the offset. Be careful to check this first because the caller
186	 * can revalidate a COW mapping without updating the data seqno.
187	 */
188	if (wpc->iomap.flags & IOMAP_F_SHARED)
189		return true;
190
191	/*
192	 * This is not a COW mapping. Check the sequence number of the data fork
193	 * because concurrent changes could have invalidated the extent. Check
194	 * the COW fork because concurrent changes since the last time we
195	 * checked (and found nothing at this offset) could have added
196	 * overlapping blocks.
197	 */
198	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
 
 
199		return false;
 
200	if (xfs_inode_has_cow_data(ip) &&
201	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
 
 
202		return false;
 
203	return true;
204}
205
206/*
207 * Pass in a dellalloc extent and convert it to real extents, return the real
208 * extent that maps offset_fsb in wpc->iomap.
209 *
210 * The current page is held locked so nothing could have removed the block
211 * backing offset_fsb, although it could have moved from the COW to the data
212 * fork by another thread.
213 */
214static int
215xfs_convert_blocks(
216	struct iomap_writepage_ctx *wpc,
217	struct xfs_inode	*ip,
218	int			whichfork,
219	loff_t			offset)
220{
221	int			error;
222	unsigned		*seq;
223
224	if (whichfork == XFS_COW_FORK)
225		seq = &XFS_WPC(wpc)->cow_seq;
226	else
227		seq = &XFS_WPC(wpc)->data_seq;
228
229	/*
230	 * Attempt to allocate whatever delalloc extent currently backs offset
231	 * and put the result into wpc->iomap.  Allocate in a loop because it
232	 * may take several attempts to allocate real blocks for a contiguous
233	 * delalloc extent if free space is sufficiently fragmented.
234	 */
235	do {
236		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
237				&wpc->iomap, seq);
238		if (error)
239			return error;
240	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
241
242	return 0;
243}
244
245static int
246xfs_map_blocks(
247	struct iomap_writepage_ctx *wpc,
248	struct inode		*inode,
249	loff_t			offset)
 
250{
251	struct xfs_inode	*ip = XFS_I(inode);
252	struct xfs_mount	*mp = ip->i_mount;
253	ssize_t			count = i_blocksize(inode);
254	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
255	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
256	xfs_fileoff_t		cow_fsb;
257	int			whichfork;
258	struct xfs_bmbt_irec	imap;
259	struct xfs_iext_cursor	icur;
260	int			retries = 0;
261	int			error = 0;
262
263	if (XFS_FORCED_SHUTDOWN(mp))
264		return -EIO;
265
 
 
266	/*
267	 * COW fork blocks can overlap data fork blocks even if the blocks
268	 * aren't shared.  COW I/O always takes precedent, so we must always
269	 * check for overlap on reflink inodes unless the mapping is already a
270	 * COW one, or the COW fork hasn't changed from the last time we looked
271	 * at it.
272	 *
273	 * It's safe to check the COW fork if_seq here without the ILOCK because
274	 * we've indirectly protected against concurrent updates: writeback has
275	 * the page locked, which prevents concurrent invalidations by reflink
276	 * and directio and prevents concurrent buffered writes to the same
277	 * page.  Changes to if_seq always happen under i_lock, which protects
278	 * against concurrent updates and provides a memory barrier on the way
279	 * out that ensures that we always see the current value.
280	 */
281	if (xfs_imap_valid(wpc, ip, offset))
282		return 0;
283
284	/*
285	 * If we don't have a valid map, now it's time to get a new one for this
286	 * offset.  This will convert delayed allocations (including COW ones)
287	 * into real extents.  If we return without a valid map, it means we
288	 * landed in a hole and we skip the block.
289	 */
290retry:
291	cow_fsb = NULLFILEOFF;
292	whichfork = XFS_DATA_FORK;
293	xfs_ilock(ip, XFS_ILOCK_SHARED);
294	ASSERT(!xfs_need_iread_extents(&ip->i_df));
295
296	/*
297	 * Check if this is offset is covered by a COW extents, and if yes use
298	 * it directly instead of looking up anything in the data fork.
299	 */
300	if (xfs_inode_has_cow_data(ip) &&
301	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
302		cow_fsb = imap.br_startoff;
303	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
304		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
305		xfs_iunlock(ip, XFS_ILOCK_SHARED);
306
307		whichfork = XFS_COW_FORK;
308		goto allocate_blocks;
309	}
310
311	/*
312	 * No COW extent overlap. Revalidate now that we may have updated
313	 * ->cow_seq. If the data mapping is still valid, we're done.
314	 */
315	if (xfs_imap_valid(wpc, ip, offset)) {
316		xfs_iunlock(ip, XFS_ILOCK_SHARED);
317		return 0;
318	}
319
320	/*
321	 * If we don't have a valid map, now it's time to get a new one for this
322	 * offset.  This will convert delayed allocations (including COW ones)
323	 * into real extents.
324	 */
325	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
326		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
327	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
328	xfs_iunlock(ip, XFS_ILOCK_SHARED);
329
330	/* landed in a hole or beyond EOF? */
331	if (imap.br_startoff > offset_fsb) {
332		imap.br_blockcount = imap.br_startoff - offset_fsb;
333		imap.br_startoff = offset_fsb;
334		imap.br_startblock = HOLESTARTBLOCK;
335		imap.br_state = XFS_EXT_NORM;
336	}
337
338	/*
339	 * Truncate to the next COW extent if there is one.  This is the only
340	 * opportunity to do this because we can skip COW fork lookups for the
341	 * subsequent blocks in the mapping; however, the requirement to treat
342	 * the COW range separately remains.
343	 */
344	if (cow_fsb != NULLFILEOFF &&
345	    cow_fsb < imap.br_startoff + imap.br_blockcount)
346		imap.br_blockcount = cow_fsb - imap.br_startoff;
347
348	/* got a delalloc extent? */
349	if (imap.br_startblock != HOLESTARTBLOCK &&
350	    isnullstartblock(imap.br_startblock))
351		goto allocate_blocks;
352
353	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
354	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
355	return 0;
356allocate_blocks:
357	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
358	if (error) {
359		/*
360		 * If we failed to find the extent in the COW fork we might have
361		 * raced with a COW to data fork conversion or truncate.
362		 * Restart the lookup to catch the extent in the data fork for
363		 * the former case, but prevent additional retries to avoid
364		 * looping forever for the latter case.
365		 */
366		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
367			goto retry;
368		ASSERT(error != -EAGAIN);
369		return error;
370	}
371
372	/*
373	 * Due to merging the return real extent might be larger than the
374	 * original delalloc one.  Trim the return extent to the next COW
375	 * boundary again to force a re-lookup.
376	 */
377	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
378		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
379
380		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
381			wpc->iomap.length = cow_offset - wpc->iomap.offset;
382	}
383
384	ASSERT(wpc->iomap.offset <= offset);
385	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
386	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
387	return 0;
388}
389
390static int
391xfs_prepare_ioend(
392	struct iomap_ioend	*ioend,
393	int			status)
394{
395	unsigned int		nofs_flag;
396
397	/*
398	 * We can allocate memory here while doing writeback on behalf of
399	 * memory reclaim.  To avoid memory allocation deadlocks set the
400	 * task-wide nofs context for the following operations.
401	 */
402	nofs_flag = memalloc_nofs_save();
403
404	/* Convert CoW extents to regular */
405	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
406		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
407				ioend->io_offset, ioend->io_size);
408	}
409
410	memalloc_nofs_restore(nofs_flag);
411
412	/* send ioends that might require a transaction to the completion wq */
413	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
414	    (ioend->io_flags & IOMAP_F_SHARED))
415		ioend->io_bio->bi_end_io = xfs_end_bio;
416	return status;
417}
418
419/*
420 * If the page has delalloc blocks on it, we need to punch them out before we
421 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
422 * inode that can trip up a later direct I/O read operation on the same region.
 
 
423 *
424 * We prevent this by truncating away the delalloc regions on the page.  Because
425 * they are delalloc, we can do this without needing a transaction. Indeed - if
426 * we get ENOSPC errors, we have to be able to do this truncation without a
427 * transaction as there is no space left for block reservation (typically why we
428 * see a ENOSPC in writeback).
429 */
430static void
431xfs_discard_page(
432	struct page		*page,
433	loff_t			fileoff)
434{
435	struct inode		*inode = page->mapping->host;
436	struct xfs_inode	*ip = XFS_I(inode);
437	struct xfs_mount	*mp = ip->i_mount;
438	unsigned int		pageoff = offset_in_page(fileoff);
439	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
440	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
441	int			error;
442
443	if (XFS_FORCED_SHUTDOWN(mp))
444		goto out_invalidate;
445
446	xfs_alert_ratelimited(mp,
447		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
448			page, ip->i_ino, fileoff);
449
450	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
451			i_blocks_per_page(inode, page) - pageoff_fsb);
452	if (error && !XFS_FORCED_SHUTDOWN(mp))
 
 
 
 
 
 
453		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
454out_invalidate:
455	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
456}
457
458static const struct iomap_writeback_ops xfs_writeback_ops = {
459	.map_blocks		= xfs_map_blocks,
460	.prepare_ioend		= xfs_prepare_ioend,
461	.discard_page		= xfs_discard_page,
462};
463
464STATIC int
465xfs_vm_writepage(
466	struct page		*page,
467	struct writeback_control *wbc)
468{
469	struct xfs_writepage_ctx wpc = { };
470
471	if (WARN_ON_ONCE(current->journal_info)) {
472		redirty_page_for_writepage(wbc, page);
473		unlock_page(page);
474		return 0;
475	}
476
477	return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
478}
479
480STATIC int
481xfs_vm_writepages(
482	struct address_space	*mapping,
483	struct writeback_control *wbc)
484{
485	struct xfs_writepage_ctx wpc = { };
486
487	/*
488	 * Writing back data in a transaction context can result in recursive
489	 * transactions. This is bad, so issue a warning and get out of here.
490	 */
491	if (WARN_ON_ONCE(current->journal_info))
492		return 0;
493
494	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
495	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
496}
497
498STATIC int
499xfs_dax_writepages(
500	struct address_space	*mapping,
501	struct writeback_control *wbc)
502{
503	struct xfs_inode	*ip = XFS_I(mapping->host);
504
505	xfs_iflags_clear(ip, XFS_ITRUNCATED);
506	return dax_writeback_mapping_range(mapping,
507			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
508}
509
510STATIC sector_t
511xfs_vm_bmap(
512	struct address_space	*mapping,
513	sector_t		block)
514{
515	struct xfs_inode	*ip = XFS_I(mapping->host);
516
517	trace_xfs_vm_bmap(ip);
518
519	/*
520	 * The swap code (ab-)uses ->bmap to get a block mapping and then
521	 * bypasses the file system for actual I/O.  We really can't allow
522	 * that on reflinks inodes, so we have to skip out here.  And yes,
523	 * 0 is the magic code for a bmap error.
524	 *
525	 * Since we don't pass back blockdev info, we can't return bmap
526	 * information for rt files either.
527	 */
528	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
529		return 0;
530	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
531}
532
533STATIC int
534xfs_vm_readpage(
535	struct file		*unused,
536	struct page		*page)
537{
538	return iomap_readpage(page, &xfs_read_iomap_ops);
539}
540
541STATIC void
542xfs_vm_readahead(
543	struct readahead_control	*rac)
544{
545	iomap_readahead(rac, &xfs_read_iomap_ops);
546}
547
548static int
549xfs_iomap_swapfile_activate(
550	struct swap_info_struct		*sis,
551	struct file			*swap_file,
552	sector_t			*span)
553{
554	sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
555	return iomap_swapfile_activate(sis, swap_file, span,
556			&xfs_read_iomap_ops);
557}
558
559const struct address_space_operations xfs_address_space_operations = {
560	.readpage		= xfs_vm_readpage,
561	.readahead		= xfs_vm_readahead,
562	.writepage		= xfs_vm_writepage,
563	.writepages		= xfs_vm_writepages,
564	.set_page_dirty		= __set_page_dirty_nobuffers,
565	.releasepage		= iomap_releasepage,
566	.invalidatepage		= iomap_invalidatepage,
567	.bmap			= xfs_vm_bmap,
568	.direct_IO		= noop_direct_IO,
569	.migratepage		= iomap_migrate_page,
570	.is_partially_uptodate  = iomap_is_partially_uptodate,
571	.error_remove_page	= generic_error_remove_page,
572	.swap_activate		= xfs_iomap_swapfile_activate,
573};
574
575const struct address_space_operations xfs_dax_aops = {
576	.writepages		= xfs_dax_writepages,
577	.direct_IO		= noop_direct_IO,
578	.set_page_dirty		= __set_page_dirty_no_writeback,
579	.invalidatepage		= noop_invalidatepage,
580	.swap_activate		= xfs_iomap_swapfile_activate,
581};