xfs_exchrange.c - fs/xfs/xfs_exchrange.c - Linux source code v3.1

Note: File does not exist in v3.1.
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
  4 * Author: Darrick J. Wong <djwong@kernel.org>
  5 */
  6#include "xfs.h"
  7#include "xfs_shared.h"
  8#include "xfs_format.h"
  9#include "xfs_log_format.h"
 10#include "xfs_trans_resv.h"
 11#include "xfs_mount.h"
 12#include "xfs_defer.h"
 13#include "xfs_inode.h"
 14#include "xfs_trans.h"
 15#include "xfs_quota.h"
 16#include "xfs_bmap_util.h"
 17#include "xfs_reflink.h"
 18#include "xfs_trace.h"
 19#include "xfs_exchrange.h"
 20#include "xfs_exchmaps.h"
 21#include "xfs_sb.h"
 22#include "xfs_icache.h"
 23#include "xfs_log.h"
 24#include "xfs_rtbitmap.h"
 25#include <linux/fsnotify.h>
 26
 27/* Lock (and optionally join) two inodes for a file range exchange. */
 28void
 29xfs_exchrange_ilock(
 30	struct xfs_trans	*tp,
 31	struct xfs_inode	*ip1,
 32	struct xfs_inode	*ip2)
 33{
 34	if (ip1 != ip2)
 35		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
 36				    ip2, XFS_ILOCK_EXCL);
 37	else
 38		xfs_ilock(ip1, XFS_ILOCK_EXCL);
 39	if (tp) {
 40		xfs_trans_ijoin(tp, ip1, 0);
 41		if (ip2 != ip1)
 42			xfs_trans_ijoin(tp, ip2, 0);
 43	}
 44
 45}
 46
 47/* Unlock two inodes after a file range exchange operation. */
 48void
 49xfs_exchrange_iunlock(
 50	struct xfs_inode	*ip1,
 51	struct xfs_inode	*ip2)
 52{
 53	if (ip2 != ip1)
 54		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
 55	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
 56}
 57
 58/*
 59 * Estimate the resource requirements to exchange file contents between the two
 60 * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
 61 * have flushed both inodes' pagecache and active direct-ios.
 62 */
 63int
 64xfs_exchrange_estimate(
 65	struct xfs_exchmaps_req	*req)
 66{
 67	int			error;
 68
 69	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
 70	error = xfs_exchmaps_estimate(req);
 71	xfs_exchrange_iunlock(req->ip1, req->ip2);
 72	return error;
 73}
 74
 75/*
 76 * Check that file2's metadata agree with the snapshot that we took for the
 77 * range commit request.
 78 *
 79 * This should be called after the filesystem has locked /all/ inode metadata
 80 * against modification.
 81 */
 82STATIC int
 83xfs_exchrange_check_freshness(
 84	const struct xfs_exchrange	*fxr,
 85	struct xfs_inode		*ip2)
 86{
 87	struct inode			*inode2 = VFS_I(ip2);
 88	struct timespec64		ctime = inode_get_ctime(inode2);
 89	struct timespec64		mtime = inode_get_mtime(inode2);
 90
 91	trace_xfs_exchrange_freshness(fxr, ip2);
 92
 93	/* Check that file2 hasn't otherwise been modified. */
 94	if (fxr->file2_ino != ip2->i_ino ||
 95	    fxr->file2_gen != inode2->i_generation ||
 96	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
 97	    !timespec64_equal(&fxr->file2_mtime, &mtime))
 98		return -EBUSY;
 99
100	return 0;
101}
102
103#define QRETRY_IP1	(0x1)
104#define QRETRY_IP2	(0x2)
105
106/*
107 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
108 * this if quota enforcement is disabled or if both inodes' dquots are the
109 * same.  The qretry structure must be initialized to zeroes before the first
110 * call to this function.
111 */
112STATIC int
113xfs_exchrange_reserve_quota(
114	struct xfs_trans		*tp,
115	const struct xfs_exchmaps_req	*req,
116	unsigned int			*qretry)
117{
118	int64_t				ddelta, rdelta;
119	int				ip1_error = 0;
120	int				error;
121
122	/*
123	 * Don't bother with a quota reservation if we're not enforcing them
124	 * or the two inodes have the same dquots.
125	 */
126	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
127	    (req->ip1->i_udquot == req->ip2->i_udquot &&
128	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
129	     req->ip1->i_pdquot == req->ip2->i_pdquot))
130		return 0;
131
132	*qretry = 0;
133
134	/*
135	 * For each file, compute the net gain in the number of regular blocks
136	 * that will be mapped into that file and reserve that much quota.  The
137	 * quota counts must be able to absorb at least that much space.
138	 */
139	ddelta = req->ip2_bcount - req->ip1_bcount;
140	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
141	if (ddelta > 0 || rdelta > 0) {
142		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
143				ddelta > 0 ? ddelta : 0,
144				rdelta > 0 ? rdelta : 0,
145				false);
146		if (error == -EDQUOT || error == -ENOSPC) {
147			/*
148			 * Save this error and see what happens if we try to
149			 * reserve quota for ip2.  Then report both.
150			 */
151			*qretry |= QRETRY_IP1;
152			ip1_error = error;
153			error = 0;
154		}
155		if (error)
156			return error;
157	}
158	if (ddelta < 0 || rdelta < 0) {
159		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
160				ddelta < 0 ? -ddelta : 0,
161				rdelta < 0 ? -rdelta : 0,
162				false);
163		if (error == -EDQUOT || error == -ENOSPC)
164			*qretry |= QRETRY_IP2;
165		if (error)
166			return error;
167	}
168	if (ip1_error)
169		return ip1_error;
170
171	/*
172	 * For each file, forcibly reserve the gross gain in mapped blocks so
173	 * that we don't trip over any quota block reservation assertions.
174	 * We must reserve the gross gain because the quota code subtracts from
175	 * bcount the number of blocks that we unmap; it does not add that
176	 * quantity back to the quota block reservation.
177	 */
178	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
179			req->ip1_rtbcount, true);
180	if (error)
181		return error;
182
183	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
184			req->ip2_rtbcount, true);
185}
186
187/* Exchange the mappings (and hence the contents) of two files' forks. */
188STATIC int
189xfs_exchrange_mappings(
190	const struct xfs_exchrange	*fxr,
191	struct xfs_inode		*ip1,
192	struct xfs_inode		*ip2)
193{
194	struct xfs_mount		*mp = ip1->i_mount;
195	struct xfs_exchmaps_req		req = {
196		.ip1			= ip1,
197		.ip2			= ip2,
198		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
199		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
200		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
201	};
202	struct xfs_trans		*tp;
203	unsigned int			qretry;
204	bool				retried = false;
205	int				error;
206
207	trace_xfs_exchrange_mappings(fxr, ip1, ip2);
208
209	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
210		req.flags |= XFS_EXCHMAPS_SET_SIZES;
211	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
212		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
213
214	/*
215	 * Round the request length up to the nearest file allocation unit.
216	 * The prep function already checked that the request offsets and
217	 * length in @fxr are safe to round up.
218	 */
219	if (xfs_inode_has_bigrtalloc(ip2))
220		req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
221
222	error = xfs_exchrange_estimate(&req);
223	if (error)
224		return error;
225
226retry:
227	/* Allocate the transaction, lock the inodes, and join them. */
228	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
229			XFS_TRANS_RES_FDBLKS, &tp);
230	if (error)
231		return error;
232
233	xfs_exchrange_ilock(tp, ip1, ip2);
234
235	trace_xfs_exchrange_before(ip2, 2);
236	trace_xfs_exchrange_before(ip1, 1);
237
238	error = xfs_exchmaps_check_forks(mp, &req);
239	if (error)
240		goto out_trans_cancel;
241
242	/*
243	 * Reserve ourselves some quota if any of them are in enforcing mode.
244	 * In theory we only need enough to satisfy the change in the number
245	 * of blocks between the two ranges being remapped.
246	 */
247	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
248	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
249		xfs_trans_cancel(tp);
250		xfs_exchrange_iunlock(ip1, ip2);
251		if (qretry & QRETRY_IP1)
252			xfs_blockgc_free_quota(ip1, 0);
253		if (qretry & QRETRY_IP2)
254			xfs_blockgc_free_quota(ip2, 0);
255		retried = true;
256		goto retry;
257	}
258	if (error)
259		goto out_trans_cancel;
260
261	/* If we got this far on a dry run, all parameters are ok. */
262	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
263		goto out_trans_cancel;
264
265	/* Update the mtime and ctime of both files. */
266	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
267		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
268	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
269		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
270
271	xfs_exchange_mappings(tp, &req);
272
273	/*
274	 * Force the log to persist metadata updates if the caller or the
275	 * administrator requires this.  The generic prep function already
276	 * flushed the relevant parts of the page cache.
277	 */
278	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
279		xfs_trans_set_sync(tp);
280
281	error = xfs_trans_commit(tp);
282
283	trace_xfs_exchrange_after(ip2, 2);
284	trace_xfs_exchrange_after(ip1, 1);
285
286	if (error)
287		goto out_unlock;
288
289	/*
290	 * If the caller wanted us to exchange the contents of two complete
291	 * files of unequal length, exchange the incore sizes now.  This should
292	 * be safe because we flushed both files' page caches, exchanged all
293	 * the mappings, and updated the ondisk sizes.
294	 */
295	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
296		loff_t	temp;
297
298		temp = i_size_read(VFS_I(ip2));
299		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
300		i_size_write(VFS_I(ip1), temp);
301	}
302
303out_unlock:
304	xfs_exchrange_iunlock(ip1, ip2);
305	return error;
306
307out_trans_cancel:
308	xfs_trans_cancel(tp);
309	goto out_unlock;
310}
311
312/*
313 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
314 * This part deals with struct file objects and byte ranges and does not deal
315 * with XFS-specific data structures such as xfs_inodes and block ranges.  This
316 * separation may some day facilitate porting to another filesystem.
317 *
318 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
319 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
320 * Implementations must call xfs_exchange_range_prep to prepare the two
321 * files prior to taking locks; and they must update the inode change and mod
322 * times of both files as part of the metadata update.  The timestamp update
323 * and freshness checks must be done atomically as part of the data exchange
324 * operation to ensure correctness of the freshness check.
325 * xfs_exchange_range_finish must be called after the operation completes
326 * successfully but before locks are dropped.
327 */
328
329/*
330 * Performs necessary checks before doing a range exchange, having stabilized
331 * mutable inode attributes via i_rwsem.
332 */
333static inline int
334xfs_exchange_range_checks(
335	struct xfs_exchrange	*fxr,
336	unsigned int		alloc_unit)
337{
338	struct inode		*inode1 = file_inode(fxr->file1);
339	loff_t			size1 = i_size_read(inode1);
340	struct inode		*inode2 = file_inode(fxr->file2);
341	loff_t			size2 = i_size_read(inode2);
342	uint64_t		allocmask = alloc_unit - 1;
343	int64_t			test_len;
344	uint64_t		blen;
345	loff_t			tmp;
346	int			error;
347
348	/* Don't touch certain kinds of inodes */
349	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
350		return -EPERM;
351	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
352		return -ETXTBSY;
353
354	/* Ranges cannot start after EOF. */
355	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
356		return -EINVAL;
357
358	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
359		/*
360		 * If the caller said to exchange to EOF, we set the length of
361		 * the request large enough to cover everything to the end of
362		 * both files.
363		 */
364		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
365					     size2 - fxr->file2_offset);
366	} else {
367		/*
368		 * Otherwise we require both ranges to end within EOF.
369		 */
370		if (fxr->file1_offset + fxr->length > size1 ||
371		    fxr->file2_offset + fxr->length > size2)
372			return -EINVAL;
373	}
374
375	/*
376	 * The start of both ranges must be aligned to the file allocation
377	 * unit.
378	 */
379	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
380	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
381		return -EINVAL;
382
383	/* Ensure offsets don't wrap. */
384	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
385	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
386		return -EINVAL;
387
388	/*
389	 * Make sure we don't hit any file size limits.  If we hit any size
390	 * limits such that test_length was adjusted, we abort the whole
391	 * operation.
392	 */
393	test_len = fxr->length;
394	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
395			&test_len);
396	if (error)
397		return error;
398	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
399			&test_len);
400	if (error)
401		return error;
402	if (test_len != fxr->length)
403		return -EINVAL;
404
405	/*
406	 * If the user wanted us to exchange up to the infile's EOF, round up
407	 * to the next allocation unit boundary for this check.  Do the same
408	 * for the outfile.
409	 *
410	 * Otherwise, reject the range length if it's not aligned to an
411	 * allocation unit.
412	 */
413	if (fxr->file1_offset + fxr->length == size1)
414		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
415	else if (fxr->file2_offset + fxr->length == size2)
416		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
417	else if (!IS_ALIGNED(fxr->length, alloc_unit))
418		return -EINVAL;
419	else
420		blen = fxr->length;
421
422	/* Don't allow overlapped exchanges within the same file. */
423	if (inode1 == inode2 &&
424	    fxr->file2_offset + blen > fxr->file1_offset &&
425	    fxr->file1_offset + blen > fxr->file2_offset)
426		return -EINVAL;
427
428	/*
429	 * Ensure that we don't exchange a partial EOF block into the middle of
430	 * another file.
431	 */
432	if ((fxr->length & allocmask) == 0)
433		return 0;
434
435	blen = fxr->length;
436	if (fxr->file2_offset + blen < size2)
437		blen &= ~allocmask;
438
439	if (fxr->file1_offset + blen < size1)
440		blen &= ~allocmask;
441
442	return blen == fxr->length ? 0 : -EINVAL;
443}
444
445/*
446 * Check that the two inodes are eligible for range exchanges, the ranges make
447 * sense, and then flush all dirty data.  Caller must ensure that the inodes
448 * have been locked against any other modifications.
449 */
450static inline int
451xfs_exchange_range_prep(
452	struct xfs_exchrange	*fxr,
453	unsigned int		alloc_unit)
454{
455	struct inode		*inode1 = file_inode(fxr->file1);
456	struct inode		*inode2 = file_inode(fxr->file2);
457	bool			same_inode = (inode1 == inode2);
458	int			error;
459
460	/* Check that we don't violate system file offset limits. */
461	error = xfs_exchange_range_checks(fxr, alloc_unit);
462	if (error || fxr->length == 0)
463		return error;
464
465	/* Wait for the completion of any pending IOs on both files */
466	inode_dio_wait(inode1);
467	if (!same_inode)
468		inode_dio_wait(inode2);
469
470	error = filemap_write_and_wait_range(inode1->i_mapping,
471			fxr->file1_offset,
472			fxr->file1_offset + fxr->length - 1);
473	if (error)
474		return error;
475
476	error = filemap_write_and_wait_range(inode2->i_mapping,
477			fxr->file2_offset,
478			fxr->file2_offset + fxr->length - 1);
479	if (error)
480		return error;
481
482	/*
483	 * If the files or inodes involved require synchronous writes, amend
484	 * the request to force the filesystem to flush all data and metadata
485	 * to disk after the operation completes.
486	 */
487	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
488	    IS_SYNC(inode1) || IS_SYNC(inode2))
489		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
490
491	return 0;
492}
493
494/*
495 * Finish a range exchange operation, if it was successful.  Caller must ensure
496 * that the inodes are still locked against any other modifications.
497 */
498static inline int
499xfs_exchange_range_finish(
500	struct xfs_exchrange	*fxr)
501{
502	int			error;
503
504	error = file_remove_privs(fxr->file1);
505	if (error)
506		return error;
507	if (file_inode(fxr->file1) == file_inode(fxr->file2))
508		return 0;
509
510	return file_remove_privs(fxr->file2);
511}
512
513/*
514 * Check the alignment of an exchange request when the allocation unit size
515 * isn't a power of two.  The generic file-level helpers use (fast)
516 * bitmask-based alignment checks, but here we have to use slow long division.
517 */
518static int
519xfs_exchrange_check_rtalign(
520	const struct xfs_exchrange	*fxr,
521	struct xfs_inode		*ip1,
522	struct xfs_inode		*ip2,
523	unsigned int			alloc_unit)
524{
525	uint64_t			length = fxr->length;
526	uint64_t			blen;
527	loff_t				size1, size2;
528
529	size1 = i_size_read(VFS_I(ip1));
530	size2 = i_size_read(VFS_I(ip2));
531
532	/* The start of both ranges must be aligned to a rt extent. */
533	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
534	    !isaligned_64(fxr->file2_offset, alloc_unit))
535		return -EINVAL;
536
537	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
538		length = max_t(int64_t, size1 - fxr->file1_offset,
539					size2 - fxr->file2_offset);
540
541	/*
542	 * If the user wanted us to exchange up to the infile's EOF, round up
543	 * to the next rt extent boundary for this check.  Do the same for the
544	 * outfile.
545	 *
546	 * Otherwise, reject the range length if it's not rt extent aligned.
547	 * We already confirmed the starting offsets' rt extent block
548	 * alignment.
549	 */
550	if (fxr->file1_offset + length == size1)
551		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
552	else if (fxr->file2_offset + length == size2)
553		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
554	else if (!isaligned_64(length, alloc_unit))
555		return -EINVAL;
556	else
557		blen = length;
558
559	/* Don't allow overlapped exchanges within the same file. */
560	if (ip1 == ip2 &&
561	    fxr->file2_offset + blen > fxr->file1_offset &&
562	    fxr->file1_offset + blen > fxr->file2_offset)
563		return -EINVAL;
564
565	/*
566	 * Ensure that we don't exchange a partial EOF rt extent into the
567	 * middle of another file.
568	 */
569	if (isaligned_64(length, alloc_unit))
570		return 0;
571
572	blen = length;
573	if (fxr->file2_offset + length < size2)
574		blen = rounddown_64(blen, alloc_unit);
575
576	if (fxr->file1_offset + blen < size1)
577		blen = rounddown_64(blen, alloc_unit);
578
579	return blen == length ? 0 : -EINVAL;
580}
581
582/* Prepare two files to have their data exchanged. */
583STATIC int
584xfs_exchrange_prep(
585	struct xfs_exchrange	*fxr,
586	struct xfs_inode	*ip1,
587	struct xfs_inode	*ip2)
588{
589	struct xfs_mount	*mp = ip2->i_mount;
590	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
591	int			error;
592
593	trace_xfs_exchrange_prep(fxr, ip1, ip2);
594
595	/* Verify both files are either real-time or non-realtime */
596	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
597		return -EINVAL;
598
599	/* Check non-power of two alignment issues, if necessary. */
600	if (!is_power_of_2(alloc_unit)) {
601		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
602		if (error)
603			return error;
604
605		/*
606		 * Do the generic file-level checks with the regular block
607		 * alignment.
608		 */
609		alloc_unit = mp->m_sb.sb_blocksize;
610	}
611
612	error = xfs_exchange_range_prep(fxr, alloc_unit);
613	if (error || fxr->length == 0)
614		return error;
615
616	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
617		error = xfs_exchrange_check_freshness(fxr, ip2);
618		if (error)
619			return error;
620	}
621
622	/* Attach dquots to both inodes before changing block maps. */
623	error = xfs_qm_dqattach(ip2);
624	if (error)
625		return error;
626	error = xfs_qm_dqattach(ip1);
627	if (error)
628		return error;
629
630	trace_xfs_exchrange_flush(fxr, ip1, ip2);
631
632	/* Flush the relevant ranges of both files. */
633	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
634	if (error)
635		return error;
636	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
637	if (error)
638		return error;
639
640	/*
641	 * Cancel CoW fork preallocations for the ranges of both files.  The
642	 * prep function should have flushed all the dirty data, so the only
643	 * CoW mappings remaining should be speculative.
644	 */
645	if (xfs_inode_has_cow_data(ip1)) {
646		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
647				fxr->length, true);
648		if (error)
649			return error;
650	}
651
652	if (xfs_inode_has_cow_data(ip2)) {
653		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
654				fxr->length, true);
655		if (error)
656			return error;
657	}
658
659	return 0;
660}
661
662/*
663 * Exchange contents of files.  This is the binding between the generic
664 * file-level concepts and the XFS inode-specific implementation.
665 */
666STATIC int
667xfs_exchrange_contents(
668	struct xfs_exchrange	*fxr)
669{
670	struct inode		*inode1 = file_inode(fxr->file1);
671	struct inode		*inode2 = file_inode(fxr->file2);
672	struct xfs_inode	*ip1 = XFS_I(inode1);
673	struct xfs_inode	*ip2 = XFS_I(inode2);
674	struct xfs_mount	*mp = ip1->i_mount;
675	int			error;
676
677	if (!xfs_has_exchange_range(mp))
678		return -EOPNOTSUPP;
679
680	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
681			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
682		return -EINVAL;
683
684	if (xfs_is_shutdown(mp))
685		return -EIO;
686
687	/* Lock both files against IO */
688	error = xfs_ilock2_io_mmap(ip1, ip2);
689	if (error)
690		goto out_err;
691
692	/* Prepare and then exchange file contents. */
693	error = xfs_exchrange_prep(fxr, ip1, ip2);
694	if (error)
695		goto out_unlock;
696
697	error = xfs_exchrange_mappings(fxr, ip1, ip2);
698	if (error)
699		goto out_unlock;
700
701	/*
702	 * Finish the exchange by removing special file privileges like any
703	 * other file write would do.  This may involve turning on support for
704	 * logged xattrs if either file has security capabilities.
705	 */
706	error = xfs_exchange_range_finish(fxr);
707	if (error)
708		goto out_unlock;
709
710out_unlock:
711	xfs_iunlock2_io_mmap(ip1, ip2);
712out_err:
713	if (error)
714		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
715	return error;
716}
717
718/* Exchange parts of two files. */
719static int
720xfs_exchange_range(
721	struct xfs_exchrange	*fxr)
722{
723	struct inode		*inode1 = file_inode(fxr->file1);
724	struct inode		*inode2 = file_inode(fxr->file2);
725	loff_t			check_len = fxr->length;
726	int			ret;
727
728	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
729		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
730
731	/* Both files must be on the same mount/filesystem. */
732	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
733		return -EXDEV;
734
735	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
736			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
737		return -EINVAL;
738
739	/* Userspace requests only honored for regular files. */
740	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
741		return -EISDIR;
742	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
743		return -EINVAL;
744
745	/* Both files must be opened for read and write. */
746	if (!(fxr->file1->f_mode & FMODE_READ) ||
747	    !(fxr->file1->f_mode & FMODE_WRITE) ||
748	    !(fxr->file2->f_mode & FMODE_READ) ||
749	    !(fxr->file2->f_mode & FMODE_WRITE))
750		return -EBADF;
751
752	/* Neither file can be opened append-only. */
753	if ((fxr->file1->f_flags & O_APPEND) ||
754	    (fxr->file2->f_flags & O_APPEND))
755		return -EBADF;
756
757	/*
758	 * If we're exchanging to EOF we can't calculate the length until taking
759	 * the iolock.  Pass a 0 length to remap_verify_area similar to the
760	 * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
761	 */
762	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
763		check_len = 0;
764	ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
765	if (ret)
766		return ret;
767	ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
768	if (ret)
769		return ret;
770
771	/* Update cmtime if the fd/inode don't forbid it. */
772	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
773		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
774	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
775		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
776
777	file_start_write(fxr->file2);
778	ret = xfs_exchrange_contents(fxr);
779	file_end_write(fxr->file2);
780	if (ret)
781		return ret;
782
783	fsnotify_modify(fxr->file1);
784	if (fxr->file2 != fxr->file1)
785		fsnotify_modify(fxr->file2);
786	return 0;
787}
788
789/* Collect exchange-range arguments from userspace. */
790long
791xfs_ioc_exchange_range(
792	struct file			*file,
793	struct xfs_exchange_range __user *argp)
794{
795	struct xfs_exchrange		fxr = {
796		.file2			= file,
797	};
798	struct xfs_exchange_range	args;
799
800	if (copy_from_user(&args, argp, sizeof(args)))
801		return -EFAULT;
802	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
803		return -EINVAL;
804	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
805		return -EINVAL;
806
807	fxr.file1_offset	= args.file1_offset;
808	fxr.file2_offset	= args.file2_offset;
809	fxr.length		= args.length;
810	fxr.flags		= args.flags;
811
812	CLASS(fd, file1)(args.file1_fd);
813	if (fd_empty(file1))
814		return -EBADF;
815	fxr.file1 = fd_file(file1);
816
817	return xfs_exchange_range(&fxr);
818}
819
 820/*
 * Opaque freshness blob for XFS_IOC_COMMIT_RANGE.  It is filled in by
 * xfs_ioc_start_commit() and validated by xfs_ioc_commit_range(), and it must
 * be exactly as large as the file2_freshness field of struct xfs_commit_range
 * (enforced with a BUILD_BUG_ON in xfs_ioc_start_commit()).
 */
 821struct xfs_commit_range_fresh {
 822	xfs_fsid_t	fsid;		/* copy of the mount's m_fixedfsid */
 823	__u64		file2_ino;	/* file2 inode number */
 824	__s64		file2_mtime;	/* file2 modification time, seconds */
 825	__s64		file2_ctime;	/* file2 change time, seconds */
 826	__s32		file2_mtime_nsec; /* file2 modification time, nsec */
 827	__s32		file2_ctime_nsec; /* file2 change time, nsec */
 828	__u32		file2_gen;	/* file2 inode generation */
 829	__u32		magic;		/* XCR_FRESH_MAGIC in a valid blob */
 830};
 831#define XCR_FRESH_MAGIC	0x444F524B	/* ASCII "DORK" */
832
833/* Set up a commitrange operation by sampling file2's write-related attrs */
834long
835xfs_ioc_start_commit(
836	struct file			*file,
837	struct xfs_commit_range __user	*argp)
838{
839	struct xfs_commit_range		args = { };
840	struct kstat			kstat = { };
841	struct xfs_commit_range_fresh	*kern_f;
842	struct xfs_commit_range_fresh	__user *user_f;
843	struct inode			*inode2 = file_inode(file);
844	struct xfs_inode		*ip2 = XFS_I(inode2);
845	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
846						    XFS_MMAPLOCK_SHARED |
847						    XFS_ILOCK_SHARED;
848
849	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
850		     sizeof(args.file2_freshness));
851
852	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
853
854	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
855
856	xfs_ilock(ip2, lockflags);
857	/* Force writing of a distinct ctime if any writes happen. */
858	fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2);
859	kern_f->file2_ctime		= kstat.ctime.tv_sec;
860	kern_f->file2_ctime_nsec	= kstat.ctime.tv_nsec;
861	kern_f->file2_mtime		= kstat.mtime.tv_sec;
862	kern_f->file2_mtime_nsec	= kstat.mtime.tv_nsec;
863	kern_f->file2_ino		= ip2->i_ino;
864	kern_f->file2_gen		= inode2->i_generation;
865	kern_f->magic			= XCR_FRESH_MAGIC;
866	xfs_iunlock(ip2, lockflags);
867
868	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
869	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
870		return -EFAULT;
871
872	return 0;
873}
874
875/*
876 * Exchange file1 and file2 contents if file2 has not been written since the
877 * start commit operation.
878 */
879long
880xfs_ioc_commit_range(
881	struct file			*file,
882	struct xfs_commit_range __user	*argp)
883{
884	struct xfs_exchrange		fxr = {
885		.file2			= file,
886	};
887	struct xfs_commit_range		args;
888	struct xfs_commit_range_fresh	*kern_f;
889	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
890	struct xfs_mount		*mp = ip2->i_mount;
891
892	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
893
894	if (copy_from_user(&args, argp, sizeof(args)))
895		return -EFAULT;
896	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
897		return -EINVAL;
898	if (kern_f->magic != XCR_FRESH_MAGIC)
899		return -EBUSY;
900	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
901		return -EBUSY;
902
903	fxr.file1_offset	= args.file1_offset;
904	fxr.file2_offset	= args.file2_offset;
905	fxr.length		= args.length;
906	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
907	fxr.file2_ino		= kern_f->file2_ino;
908	fxr.file2_gen		= kern_f->file2_gen;
909	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
910	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
911	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
912	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
913
914	CLASS(fd, file1)(args.file1_fd);
915	if (fd_empty(file1))
916		return -EBADF;
917	fxr.file1 = fd_file(file1);
918
919	return xfs_exchange_range(&fxr);
920}