xfs_discard.c - fs/xfs/xfs_discard.c - Linux diff v3.15

 
  1/*
  2 * Copyright (C) 2010 Red Hat, Inc.
  3 * All Rights Reserved.
  4 *
  5 * This program is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU General Public License as
  7 * published by the Free Software Foundation.
  8 *
  9 * This program is distributed in the hope that it would be useful,
 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 * GNU General Public License for more details.
 13 *
 14 * You should have received a copy of the GNU General Public License
 15 * along with this program; if not, write the Free Software Foundation,
 16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 17 */
 18#include "xfs.h"
 
 19#include "xfs_format.h"
 20#include "xfs_log_format.h"
 21#include "xfs_trans_resv.h"
 22#include "xfs_sb.h"
 23#include "xfs_ag.h"
 24#include "xfs_mount.h"
 25#include "xfs_quota.h"
 26#include "xfs_inode.h"
 27#include "xfs_btree.h"
 28#include "xfs_alloc_btree.h"
 29#include "xfs_alloc.h"
 
 30#include "xfs_error.h"
 31#include "xfs_extent_busy.h"
 32#include "xfs_discard.h"
 33#include "xfs_trace.h"
 34#include "xfs_log.h"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 35
 36STATIC int
 37xfs_trim_extents(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 38	struct xfs_mount	*mp,
 39	xfs_agnumber_t		agno,
 40	xfs_daddr_t		start,
 41	xfs_daddr_t		end,
 42	xfs_daddr_t		minlen,
 43	__uint64_t		*blocks_trimmed)
 44{
 45	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 46	struct xfs_btree_cur	*cur;
 47	struct xfs_buf		*agbp;
 48	struct xfs_perag	*pag;
 49	int			error;
 50	int			i;
 51
 52	pag = xfs_perag_get(mp, agno);
 53
 54	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
 55	if (error || !agbp)
 56		goto out_put_perag;
 57
 58	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
 59
 60	/*
 61	 * Force out the log.  This means any transactions that might have freed
 62	 * space before we took the AGF buffer lock are now on disk, and the
 63	 * volatile disk cache is flushed.
 64	 */
 65	xfs_log_force(mp, XFS_LOG_SYNC);
 66
 67	/*
 68	 * Look up the longest btree in the AGF and start with it.
 69	 */
 70	error = xfs_alloc_lookup_ge(cur, 0,
 71			    be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 72	if (error)
 73		goto out_del_cursor;
 
 
 
 
 
 74
 75	/*
 76	 * Loop until we are done with all extents that are large
 77	 * enough to be worth discarding.
 78	 */
 79	while (i) {
 80		xfs_agblock_t	fbno;
 81		xfs_extlen_t	flen;
 82		xfs_daddr_t	dbno;
 83		xfs_extlen_t	dlen;
 84
 85		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
 86		if (error)
 87			goto out_del_cursor;
 88		XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
 89		ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
 90
 91		/*
 92		 * use daddr format for all range/len calculations as that is
 93		 * the format the range/len variables are supplied in by
 94		 * userspace.
 95		 */
 96		dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
 97		dlen = XFS_FSB_TO_BB(mp, flen);
 98
 99		/*
100		 * Too small?  Give up.
101		 */
102		if (dlen < minlen) {
103			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
104			goto out_del_cursor;
 
 
105		}
106
107		/*
108		 * If the extent is entirely outside of the range we are
109		 * supposed to discard skip it.  Do not bother to trim
110		 * down partially overlapping ranges for now.
111		 */
112		if (dbno + dlen < start || dbno > end) {
113			trace_xfs_discard_exclude(mp, agno, fbno, flen);
114			goto next_extent;
115		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
117		/*
118		 * If any blocks in the range are still busy, skip the
119		 * discard and try again the next time.
120		 */
121		if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
122			trace_xfs_discard_busy(mp, agno, fbno, flen);
123			goto next_extent;
124		}
125
126		trace_xfs_discard_extent(mp, agno, fbno, flen);
127		error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
128		if (error)
129			goto out_del_cursor;
130		*blocks_trimmed += flen;
131
132next_extent:
133		error = xfs_btree_decrement(cur, 0, &i);
 
 
 
134		if (error)
135			goto out_del_cursor;
 
 
 
 
 
 
 
 
136	}
137
 
 
 
 
 
 
138out_del_cursor:
139	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
140	xfs_buf_relse(agbp);
141out_put_perag:
142	xfs_perag_put(pag);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143	return error;
144}
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146/*
147 * trim a range of the filesystem.
148 *
149 * Note: the parameters passed from userspace are byte ranges into the
150 * filesystem which does not match to the format we use for filesystem block
151 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
152 * is a linear address range. Hence we need to use DADDR based conversions and
153 * comparisons for determining the correct offset and regions to trim.
 
 
 
154 */
155int
156xfs_ioc_trim(
157	struct xfs_mount		*mp,
158	struct fstrim_range __user	*urange)
159{
160	struct request_queue	*q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
161	unsigned int		granularity = q->limits.discard_granularity;
 
162	struct fstrim_range	range;
163	xfs_daddr_t		start, end, minlen;
164	xfs_agnumber_t		start_agno, end_agno, agno;
165	__uint64_t		blocks_trimmed = 0;
166	int			error, last_error = 0;
167
168	if (!capable(CAP_SYS_ADMIN))
169		return -XFS_ERROR(EPERM);
170	if (!blk_queue_discard(q))
171		return -XFS_ERROR(EOPNOTSUPP);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172	if (copy_from_user(&range, urange, sizeof(range)))
173		return -XFS_ERROR(EFAULT);
 
 
 
174
175	/*
176	 * Truncating down the len isn't actually quite correct, but using
177	 * BBTOB would mean we trivially get overflows for values
178	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
179	 * used by the fstrim application.  In the end it really doesn't
180	 * matter as trimming blocks is an advisory interface.
181	 */
182	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
183	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
 
184	    range.len < mp->m_sb.sb_blocksize)
185		return -XFS_ERROR(EINVAL);
186
187	start = BTOBB(range.start);
188	end = start + BTOBBT(range.len) - 1;
189	minlen = BTOBB(max_t(u64, granularity, range.minlen));
190
191	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
192		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
193
194	start_agno = xfs_daddr_to_agno(mp, start);
195	end_agno = xfs_daddr_to_agno(mp, end);
196
197	for (agno = start_agno; agno <= end_agno; agno++) {
198		error = -xfs_trim_extents(mp, agno, start, end, minlen,
199					  &blocks_trimmed);
200		if (error)
201			last_error = error;
202	}
203
204	if (last_error)
205		return last_error;
206
207	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
 
208	if (copy_to_user(urange, &range, sizeof(range)))
209		return -XFS_ERROR(EFAULT);
210	return 0;
211}
212
213int
214xfs_discard_extents(
215	struct xfs_mount	*mp,
216	struct list_head	*list)
217{
218	struct xfs_extent_busy	*busyp;
219	int			error = 0;
220
221	list_for_each_entry(busyp, list, list) {
222		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
223					 busyp->length);
224
225		error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
226				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
227				XFS_FSB_TO_BB(mp, busyp->length),
228				GFP_NOFS, 0);
229		if (error && error != EOPNOTSUPP) {
230			xfs_info(mp,
231	 "discard failed for extent [0x%llu,%u], error %d",
232				 (unsigned long long)busyp->bno,
233				 busyp->length,
234				 error);
235			return error;
236		}
237	}
238
239	return 0;
240}

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Copyright (C) 2010, 2023 Red Hat, Inc.
  4 * All Rights Reserved.
 
 
 
 
 
 
 
 
 
 
 
 
 
  5 */
  6#include "xfs.h"
  7#include "xfs_shared.h"
  8#include "xfs_format.h"
  9#include "xfs_log_format.h"
 10#include "xfs_trans_resv.h"
 11#include "xfs_trans.h"
 
 12#include "xfs_mount.h"
 
 
 13#include "xfs_btree.h"
 14#include "xfs_alloc_btree.h"
 15#include "xfs_alloc.h"
 16#include "xfs_discard.h"
 17#include "xfs_error.h"
 18#include "xfs_extent_busy.h"
 
 19#include "xfs_trace.h"
 20#include "xfs_log.h"
 21#include "xfs_ag.h"
 22#include "xfs_health.h"
 23#include "xfs_rtbitmap.h"
 24#include "xfs_rtgroup.h"
 25
 26/*
 27 * Notes on an efficient, low latency fstrim algorithm
 28 *
 29 * We need to walk the filesystem free space and issue discards on the free
 30 * space that meet the search criteria (size and location). We cannot issue
 31 * discards on extents that might be in use, or are so recently in use they are
 32 * still marked as busy. To serialise against extent state changes whilst we are
 33 * gathering extents to trim, we must hold the AGF lock to lock out other
 34 * allocations and extent free operations that might change extent state.
 35 *
 36 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 37 * we issue discards on each free space that is found. Storage devices can have
 38 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 39 * couple of million free extents and issuing synchronous discards on each
 40 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 41 * can access the AGF, and we can stall transactions and hence the log whilst
 42 * modifications wait for the AGF lock to be released. This can lead to hung tasks
 43 * kicking the hung task timer and rebooting the system. This is bad.
 44 *
 45 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 46 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 47 * AGI lock and then reads all the inode cluster buffers and processes them. It
 48 * loops doing this, using a cursor to keep track of where it is up to in the AG
 49 * for each iteration to restart the INOBT lookup from.
 50 *
 51 * We can't do this exactly with free space - once we drop the AGF lock, the
 52 * state of the free extent is out of our control and we cannot run a discard
 53 * safely on it in this situation. Unless, of course, we've marked the free
 54 * extent as busy and undergoing a discard operation whilst we held the AGF
 55 * locked.
 56 *
 57 * This is exactly how online discard works - free extents are marked busy when
 58 * they are freed, and once the extent free has been committed to the journal,
 59 * the busy extent record is marked as "undergoing discard" and the discard is
 60 * then issued on the free extent. Once the discard completes, the busy extent
 61 * record is removed and the extent is able to be allocated again.
 62 *
 63 * In the context of fstrim, if we find a free extent we need to discard, we
 64 * don't have to discard it immediately. All we need to do is record that free
 65 * extent as being busy and under discard, and all the allocation routines will
 66 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 67 * the AGF lock, we can safely discard it without holding the AGF lock because
 68 * nothing will attempt to allocate that free space until the discard completes.
 69 *
 70 * This also allows us to issue discards asynchronously like we do with online
 71 * discard, and so for fast devices fstrim will run much faster as we can have
 72 * multiple discard operations in flight at once, as well as pipeline the free
 73 * extent search so that it overlaps in flight discard IO.
 74 */
 75
 /*
  * Initial value of the per-pass batch counters used by the gather functions
  * in this file: a gather pass stops once it has counted this many free-space
  * records, so the discards queued so far can be issued.
  */
 76#define XFS_DISCARD_MAX_EXAMINE	(100)
 77
 /*
  * Workqueue that xfs_discard_endio() queues discard completion work on.
  * It is defined here; where it is allocated is not visible in this file.
  */
 78struct workqueue_struct *xfs_discard_wq;
 79
 80static void
 81xfs_discard_endio_work(
 82	struct work_struct	*work)
 83{
 84	struct xfs_busy_extents	*extents =
 85		container_of(work, struct xfs_busy_extents, endio_work);
 86
 87	xfs_extent_busy_clear(&extents->extent_list, false);
 88	kfree(extents->owner);
 89}
 90
 91/*
 92 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 93 * pagb_lock.
 94 */
 95static void
 96xfs_discard_endio(
 97	struct bio		*bio)
 98{
 99	struct xfs_busy_extents	*extents = bio->bi_private;
100
101	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
102	queue_work(xfs_discard_wq, &extents->endio_work);
103	bio_put(bio);
104}
105
106static inline struct block_device *
107xfs_group_bdev(
108	const struct xfs_group	*xg)
109{
110	struct xfs_mount	*mp = xg->xg_mount;
111
112	switch (xg->xg_type) {
113	case XG_TYPE_AG:
114		return mp->m_ddev_targp->bt_bdev;
115	case XG_TYPE_RTG:
116		return mp->m_rtdev_targp->bt_bdev;
117	default:
118		ASSERT(0);
119		break;
120	}
121	return NULL;
122}
123
124/*
125 * Walk the discard list and issue discards on all the busy extents in the
126 * list. We plug and chain the bios so that we only need a single completion
127 * call to clear all the busy extents once the discards are complete.
128 */
129int
130xfs_discard_extents(
131	struct xfs_mount	*mp,
132	struct xfs_busy_extents	*extents)
 
 
 
 
133{
134	struct xfs_extent_busy	*busyp;
135	struct bio		*bio = NULL;
136	struct blk_plug		plug;
137	int			error = 0;
138
139	blk_start_plug(&plug);
140	list_for_each_entry(busyp, &extents->extent_list, list) {
141		trace_xfs_discard_extent(busyp->group, busyp->bno,
142				busyp->length);
143
144		error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
145				xfs_gbno_to_daddr(busyp->group, busyp->bno),
146				XFS_FSB_TO_BB(mp, busyp->length),
147				GFP_KERNEL, &bio);
148		if (error && error != -EOPNOTSUPP) {
149			xfs_info(mp,
150	 "discard failed for extent [0x%llx,%u], error %d",
151				 (unsigned long long)busyp->bno,
152				 busyp->length,
153				 error);
154			break;
155		}
156	}
157
158	if (bio) {
159		bio->bi_private = extents;
160		bio->bi_end_io = xfs_discard_endio;
161		submit_bio(bio);
162	} else {
163		xfs_discard_endio_work(&extents->endio_work);
164	}
165	blk_finish_plug(&plug);
166
167	return error;
168}
169
/*
 * Cursor state for trimming one AG.  It is set up by xfs_trim_perag_extents()
 * and updated by xfs_trim_gather_extents() between batches.
 */
170struct xfs_trim_cur {
	/*
	 * First AG block of the range to trim.  xfs_trim_gather_extents() also
	 * stores the block number of the extent to restart from here when it
	 * hits its batch limit.
	 */
171	xfs_agblock_t	start;
	/*
	 * Extent length used as the lookup key for by-length walks.  Set to 0
	 * to tell the caller there is nothing more to search.
	 */
172	xfs_extlen_t	count;
	/* Last AG block of the range; compared as an inclusive bound. */
173	xfs_agblock_t	end;
	/* Extents shorter than this many blocks are not discarded. */
174	xfs_extlen_t	minlen;
	/*
	 * True: walk forwards through a cursor from xfs_bnobt_init_cursor().
	 * False: walk backwards through one from xfs_cntbt_init_cursor().
	 */
175	bool		by_bno;
176};
177
178static int
179xfs_trim_gather_extents(
180	struct xfs_perag	*pag,
181	struct xfs_trim_cur	*tcur,
182	struct xfs_busy_extents	*extents)
183{
184	struct xfs_mount	*mp = pag_mount(pag);
185	struct xfs_trans	*tp;
186	struct xfs_btree_cur	*cur;
187	struct xfs_buf		*agbp;
 
188	int			error;
189	int			i;
190	int			batch = XFS_DISCARD_MAX_EXAMINE;
 
 
 
 
 
 
 
191
192	/*
193	 * Force out the log.  This means any transactions that might have freed
194	 * space before we take the AGF buffer lock are now on disk, and the
195	 * volatile disk cache is flushed.
196	 */
197	xfs_log_force(mp, XFS_LOG_SYNC);
198
199	error = xfs_trans_alloc_empty(mp, &tp);
200	if (error)
201		return error;
202
203	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
204	if (error)
205		goto out_trans_cancel;
206
207	if (tcur->by_bno) {
208		/* sub-AG discard request always starts at tcur->start */
209		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
210		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
211		if (!error && !i)
212			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
213	} else if (tcur->start == 0) {
214		/* first time through a by-len starts with max length */
215		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
216		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
217	} else {
218		/* nth time through a by-len starts where we left off */
219		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
220		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
221	}
222	if (error)
223		goto out_del_cursor;
224	if (i == 0) {
225		/* nothing of that length left in the AG, we are done */
226		tcur->count = 0;
227		goto out_del_cursor;
228	}
229
230	/*
231	 * Loop until we are done with all extents that are large
232	 * enough to be worth discarding or we hit batch limits.
233	 */
234	while (i) {
235		xfs_agblock_t	fbno;
236		xfs_extlen_t	flen;
 
 
237
238		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
239		if (error)
240			break;
241		if (XFS_IS_CORRUPT(mp, i != 1)) {
242			xfs_btree_mark_sick(cur);
243			error = -EFSCORRUPTED;
244			break;
245		}
 
 
 
 
 
246
247		if (--batch <= 0) {
248			/*
249			 * Update the cursor to point at this extent so we
250			 * restart the next batch from this extent.
251			 */
252			tcur->start = fbno;
253			tcur->count = flen;
254			break;
255		}
256
257		/*
258		 * If the extent is entirely outside of the range we are
259		 * supposed to skip it.  Do not bother to trim down partially
260		 * overlapping ranges for now.
261		 */
262		if (fbno + flen < tcur->start) {
263			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
264			goto next_extent;
265		}
266		if (fbno > tcur->end) {
267			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
268			if (tcur->by_bno) {
269				tcur->count = 0;
270				break;
271			}
272			goto next_extent;
273		}
274
275		/* Trim the extent returned to the range we want. */
276		if (fbno < tcur->start) {
277			flen -= tcur->start - fbno;
278			fbno = tcur->start;
279		}
280		if (fbno + flen > tcur->end + 1)
281			flen = tcur->end - fbno + 1;
282
283		/* Too small?  Give up. */
284		if (flen < tcur->minlen) {
285			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
286			if (tcur->by_bno)
287				goto next_extent;
288			tcur->count = 0;
289			break;
290		}
291
292		/*
293		 * If any blocks in the range are still busy, skip the
294		 * discard and try again the next time.
295		 */
296		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
297			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
298			goto next_extent;
299		}
300
301		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
302				&extents->extent_list);
 
 
 
 
303next_extent:
304		if (tcur->by_bno)
305			error = xfs_btree_increment(cur, 0, &i);
306		else
307			error = xfs_btree_decrement(cur, 0, &i);
308		if (error)
309			break;
310
311		/*
312		 * If there's no more records in the tree, we are done. Set the
313		 * cursor block count to 0 to indicate to the caller that there
314		 * is no more extents to search.
315		 */
316		if (i == 0)
317			tcur->count = 0;
318	}
319
320	/*
321	 * If there was an error, release all the gathered busy extents because
322	 * we aren't going to issue a discard on them any more.
323	 */
324	if (error)
325		xfs_extent_busy_clear(&extents->extent_list, false);
326out_del_cursor:
327	xfs_btree_del_cursor(cur, error);
328out_trans_cancel:
329	xfs_trans_cancel(tp);
330	return error;
331}
332
333static bool
334xfs_trim_should_stop(void)
335{
336	return fatal_signal_pending(current) || freezing(current);
337}
338
339/*
340 * Iterate the free list gathering extents and discarding them. We need a cursor
341 * for the repeated iteration of gather/discard loop, so use the longest extent
342 * we found in the last batch as the key to start the next.
343 */
344static int
345xfs_trim_perag_extents(
346	struct xfs_perag	*pag,
347	xfs_agblock_t		start,
348	xfs_agblock_t		end,
349	xfs_extlen_t		minlen)
350{
351	struct xfs_trim_cur	tcur = {
352		.start		= start,
353		.count		= pag->pagf_longest,
354		.end		= end,
355		.minlen		= minlen,
356	};
357	int			error = 0;
358
359	if (start != 0 || end != pag_group(pag)->xg_block_count)
360		tcur.by_bno = true;
361
362	do {
363		struct xfs_busy_extents	*extents;
364
365		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
366		if (!extents) {
367			error = -ENOMEM;
368			break;
369		}
370
371		extents->owner = extents;
372		INIT_LIST_HEAD(&extents->extent_list);
373
374		error = xfs_trim_gather_extents(pag, &tcur, extents);
375		if (error) {
376			kfree(extents);
377			break;
378		}
379
380		/*
381		 * We hand the extent list to the discard function here so the
382		 * discarded extents can be removed from the busy extent list.
383		 * This allows the discards to run asynchronously with gathering
384		 * the next round of extents to discard.
385		 *
386		 * However, we must ensure that we do not reference the extent
387		 * list  after this function call, as it may have been freed by
388		 * the time control returns to us.
389		 */
390		error = xfs_discard_extents(pag_mount(pag), extents);
391		if (error)
392			break;
393
394		if (xfs_trim_should_stop())
395			break;
396
397	} while (tcur.count != 0);
398
399	return error;
400
401}
402
403static int
404xfs_trim_datadev_extents(
405	struct xfs_mount	*mp,
406	xfs_daddr_t		start,
407	xfs_daddr_t		end,
408	xfs_extlen_t		minlen)
409{
410	xfs_agnumber_t		start_agno, end_agno;
411	xfs_agblock_t		start_agbno, end_agbno;
412	struct xfs_perag	*pag = NULL;
413	xfs_daddr_t		ddev_end;
414	int			last_error = 0, error;
415
416	ddev_end = min_t(xfs_daddr_t, end,
417			 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
418
419	start_agno = xfs_daddr_to_agno(mp, start);
420	start_agbno = xfs_daddr_to_agbno(mp, start);
421	end_agno = xfs_daddr_to_agno(mp, ddev_end);
422	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
423
424	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
425		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;
426
427		if (pag_agno(pag) == end_agno)
428			agend = end_agbno;
429		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
430		if (error)
431			last_error = error;
432
433		if (xfs_trim_should_stop()) {
434			xfs_perag_rele(pag);
435			break;
436		}
437		start_agbno = 0;
438	}
439
440	return last_error;
441}
442
443#ifdef CONFIG_XFS_RT
/*
 * State shared between xfs_trim_rtextents() and its rtbitmap walk callback,
 * xfs_trim_gather_rtextent().
 */
444struct xfs_trim_rtdev {
445	/* list of struct xfs_rtx_busy entries gathered for discard */
446	struct list_head	extent_list;
447
448	/* minimum length that caller allows us to trim */
449	xfs_rtblock_t		minlen_fsb;
450
451	/* restart point for the rtbitmap walk */
452	xfs_rtxnum_t		restart_rtx;
453
454	/* stopping point for the current rtbitmap walk */
455	xfs_rtxnum_t		stop_rtx;
456};
457
/* One free realtime extent that has been gathered for discard. */
458struct xfs_rtx_busy {
	/* linkage on struct xfs_trim_rtdev's extent_list */
459	struct list_head	list;
	/* start of the extent, as computed by xfs_rtx_to_rtb() */
460	xfs_rtblock_t		bno;
	/* length of the extent, as computed by xfs_rtbxlen_to_blen() */
461	xfs_rtblock_t		length;
462};
463
464static void
465xfs_discard_free_rtdev_extents(
466	struct xfs_trim_rtdev	*tr)
467{
468	struct xfs_rtx_busy	*busyp, *n;
469
470	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
471		list_del_init(&busyp->list);
472		kfree(busyp);
473	}
474}
475
476/*
477 * Walk the discard list and issue discards on all the busy extents in the
478 * list. We plug and chain the bios so that we only need a single completion
479 * call to clear all the busy extents once the discards are complete.
480 */
481static int
482xfs_discard_rtdev_extents(
483	struct xfs_mount	*mp,
484	struct xfs_trim_rtdev	*tr)
485{
486	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
487	struct xfs_rtx_busy	*busyp;
488	struct bio		*bio = NULL;
489	struct blk_plug		plug;
490	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
491	int			error = 0;
492
493	blk_start_plug(&plug);
494	list_for_each_entry(busyp, &tr->extent_list, list) {
495		if (start == NULLRTBLOCK)
496			start = busyp->bno;
497		length += busyp->length;
498
499		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
500
501		error = __blkdev_issue_discard(bdev,
502				xfs_rtb_to_daddr(mp, busyp->bno),
503				XFS_FSB_TO_BB(mp, busyp->length),
504				GFP_NOFS, &bio);
505		if (error)
506			break;
507	}
508	xfs_discard_free_rtdev_extents(tr);
509
510	if (bio) {
511		error = submit_bio_wait(bio);
512		if (error == -EOPNOTSUPP)
513			error = 0;
514		if (error)
515			xfs_info(mp,
516	 "discard failed for rtextent [0x%llx,%llu], error %d",
517				 (unsigned long long)start,
518				 (unsigned long long)length,
519				 error);
520		bio_put(bio);
521	}
522	blk_finish_plug(&plug);
523
524	return error;
525}
526
527static int
528xfs_trim_gather_rtextent(
529	struct xfs_rtgroup		*rtg,
530	struct xfs_trans		*tp,
531	const struct xfs_rtalloc_rec	*rec,
532	void				*priv)
533{
534	struct xfs_trim_rtdev		*tr = priv;
535	struct xfs_rtx_busy		*busyp;
536	xfs_rtblock_t			rbno, rlen;
537
538	if (rec->ar_startext > tr->stop_rtx) {
539		/*
540		 * If we've scanned a large number of rtbitmap blocks, update
541		 * the cursor to point at this extent so we restart the next
542		 * batch from this extent.
543		 */
544		tr->restart_rtx = rec->ar_startext;
545		return -ECANCELED;
546	}
547
548	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
549	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
550
551	/* Ignore too small. */
552	if (rlen < tr->minlen_fsb) {
553		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
554		return 0;
555	}
556
557	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
558	if (!busyp)
559		return -ENOMEM;
560
561	busyp->bno = rbno;
562	busyp->length = rlen;
563	INIT_LIST_HEAD(&busyp->list);
564	list_add_tail(&busyp->list, &tr->extent_list);
565
566	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
567	return 0;
568}
569
570/* Trim extents on an !rtgroups realtime device */
571static int
572xfs_trim_rtextents(
573	struct xfs_rtgroup	*rtg,
574	xfs_rtxnum_t		low,
575	xfs_rtxnum_t		high,
576	xfs_daddr_t		minlen)
577{
578	struct xfs_mount	*mp = rtg_mount(rtg);
579	struct xfs_trim_rtdev	tr = {
580		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
581		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
582	};
583	struct xfs_trans	*tp;
584	int			error;
585
586	error = xfs_trans_alloc_empty(mp, &tp);
587	if (error)
588		return error;
589
590	/*
591	 * Walk the free ranges between low and high.  The query_range function
592	 * trims the extents returned.
593	 */
594	do {
595		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
596		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
597		error = xfs_rtalloc_query_range(rtg, tp, low, high,
598				xfs_trim_gather_rtextent, &tr);
599
600		if (error == -ECANCELED)
601			error = 0;
602		if (error) {
603			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
604			xfs_discard_free_rtdev_extents(&tr);
605			break;
606		}
607
608		if (list_empty(&tr.extent_list)) {
609			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
610			break;
611		}
612
613		error = xfs_discard_rtdev_extents(mp, &tr);
614		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
615		if (error)
616			break;
617
618		low = tr.restart_rtx;
619	} while (!xfs_trim_should_stop() && low <= high);
620
621	xfs_trans_cancel(tp);
622	return error;
623}
624
/*
 * State shared between xfs_trim_rtgroup_extents() and its rtbitmap walk
 * callback, xfs_trim_gather_rtgroup_extent().
 */
625struct xfs_trim_rtgroup {
626	/* container whose extent_list collects the extents gathered for discard */
627	struct xfs_busy_extents	*extents;
628
629	/* minimum length that caller allows us to trim */
630	xfs_rtblock_t		minlen_fsb;
631
632	/* restart point for the rtbitmap walk */
633	xfs_rtxnum_t		restart_rtx;
634
635	/* number of extents to examine before stopping to issue discard ios */
636	int			batch;
637
638	/* number of extents queued for discard */
639	int			queued;
640};
641
642static int
643xfs_trim_gather_rtgroup_extent(
644	struct xfs_rtgroup		*rtg,
645	struct xfs_trans		*tp,
646	const struct xfs_rtalloc_rec	*rec,
647	void				*priv)
648{
649	struct xfs_trim_rtgroup		*tr = priv;
650	xfs_rgblock_t			rgbno;
651	xfs_extlen_t			len;
652
653	if (--tr->batch <= 0) {
654		/*
655		 * If we've checked a large number of extents, update the
656		 * cursor to point at this extent so we restart the next batch
657		 * from this extent.
658		 */
659		tr->restart_rtx = rec->ar_startext;
660		return -ECANCELED;
661	}
662
663	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
664	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
665
666	/* Ignore too small. */
667	if (len < tr->minlen_fsb) {
668		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
669		return 0;
670	}
671
672	/*
673	 * If any blocks in the range are still busy, skip the discard and try
674	 * again the next time.
675	 */
676	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
677		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
678		return 0;
679	}
680
681	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
682			&tr->extents->extent_list);
683
684	tr->queued++;
685	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
686	return 0;
687}
688
689/* Trim extents in this rtgroup using the busy extent machinery. */
690static int
691xfs_trim_rtgroup_extents(
692	struct xfs_rtgroup	*rtg,
693	xfs_rtxnum_t		low,
694	xfs_rtxnum_t		high,
695	xfs_daddr_t		minlen)
696{
697	struct xfs_mount	*mp = rtg_mount(rtg);
698	struct xfs_trim_rtgroup	tr = {
699		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
700	};
701	struct xfs_trans	*tp;
702	int			error;
703
704	error = xfs_trans_alloc_empty(mp, &tp);
705	if (error)
706		return error;
707
708	/*
709	 * Walk the free ranges between low and high.  The query_range function
710	 * trims the extents returned.
711	 */
712	do {
713		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
714		if (!tr.extents) {
715			error = -ENOMEM;
716			break;
717		}
718
719		tr.queued = 0;
720		tr.batch = XFS_DISCARD_MAX_EXAMINE;
721		tr.extents->owner = tr.extents;
722		INIT_LIST_HEAD(&tr.extents->extent_list);
723
724		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
725		error = xfs_rtalloc_query_range(rtg, tp, low, high,
726				xfs_trim_gather_rtgroup_extent, &tr);
727		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
728		if (error == -ECANCELED)
729			error = 0;
730		if (error) {
731			kfree(tr.extents);
732			break;
733		}
734
735		if (!tr.queued)
736			break;
737
738		/*
739		 * We hand the extent list to the discard function here so the
740		 * discarded extents can be removed from the busy extent list.
741		 * This allows the discards to run asynchronously with
742		 * gathering the next round of extents to discard.
743		 *
744		 * However, we must ensure that we do not reference the extent
745		 * list  after this function call, as it may have been freed by
746		 * the time control returns to us.
747		 */
748		error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
749		if (error)
750			break;
751
752		low = tr.restart_rtx;
753	} while (!xfs_trim_should_stop() && low <= high);
754
755	xfs_trans_cancel(tp);
756	return error;
757}
758
759static int
760xfs_trim_rtdev_extents(
761	struct xfs_mount	*mp,
762	xfs_daddr_t		start,
763	xfs_daddr_t		end,
764	xfs_daddr_t		minlen)
765{
766	xfs_rtblock_t		start_rtbno, end_rtbno;
767	xfs_rtxnum_t		start_rtx, end_rtx;
768	xfs_rgnumber_t		start_rgno, end_rgno;
769	xfs_daddr_t		daddr_offset;
770	int			last_error = 0, error;
771	struct xfs_rtgroup	*rtg = NULL;
772
773	/* Shift the start and end downwards to match the rt device. */
774	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
775	if (start > daddr_offset)
776		start -= daddr_offset;
777	else
778		start = 0;
779	start_rtbno = xfs_daddr_to_rtb(mp, start);
780	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
781	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
782
783	if (end <= daddr_offset)
784		return 0;
785	else
786		end -= daddr_offset;
787	end_rtbno = xfs_daddr_to_rtb(mp, end);
788	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
789	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
790
791	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
792		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;
793
794		if (rtg_rgno(rtg) == end_rgno)
795			rtg_end = min(rtg_end, end_rtx);
796
797		if (xfs_has_rtgroups(mp))
798			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
799					rtg_end, minlen);
800		else
801			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
802					minlen);
803		if (error)
804			last_error = error;
805
806		if (xfs_trim_should_stop()) {
807			xfs_rtgroup_rele(rtg);
808			break;
809		}
810		start_rtx = 0;
811	}
812
813	return last_error;
814}
815#else
/* Without CONFIG_XFS_RT, trimming the realtime device evaluates to -EOPNOTSUPP. */
816# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
817#endif /* CONFIG_XFS_RT */
818
819/*
820 * trim a range of the filesystem.
821 *
822 * Note: the parameters passed from userspace are byte ranges into the
823 * filesystem which does not match to the format we use for filesystem block
824 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
825 * is a linear address range. Hence we need to use DADDR based conversions and
826 * comparisons for determining the correct offset and regions to trim.
827 *
828 * The realtime device is mapped into the FITRIM "address space" immediately
829 * after the data device.
830 */
831int
832xfs_ioc_trim(
833	struct xfs_mount		*mp,
834	struct fstrim_range __user	*urange)
835{
836	unsigned int		granularity =
837		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
838	struct block_device	*rt_bdev = NULL;
839	struct fstrim_range	range;
840	xfs_daddr_t		start, end;
841	xfs_extlen_t		minlen;
842	xfs_rfsblock_t		max_blocks;
843	int			error, last_error = 0;
844
845	if (!capable(CAP_SYS_ADMIN))
846		return -EPERM;
847	if (mp->m_rtdev_targp &&
848	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
849		rt_bdev = mp->m_rtdev_targp->bt_bdev;
850	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
851		return -EOPNOTSUPP;
852
853	if (rt_bdev)
854		granularity = max(granularity,
855				  bdev_discard_granularity(rt_bdev));
856
857	/*
858	 * We haven't recovered the log, so we cannot use our bnobt-guided
859	 * storage zapping commands.
860	 */
861	if (xfs_has_norecovery(mp))
862		return -EROFS;
863
864	if (copy_from_user(&range, urange, sizeof(range)))
865		return -EFAULT;
866
867	range.minlen = max_t(u64, granularity, range.minlen);
868	minlen = XFS_B_TO_FSB(mp, range.minlen);
869
870	/*
871	 * Truncating down the len isn't actually quite correct, but using
872	 * BBTOB would mean we trivially get overflows for values
873	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
874	 * used by the fstrim application.  In the end it really doesn't
875	 * matter as trimming blocks is an advisory interface.
876	 */
877	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
878	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
879	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
880	    range.len < mp->m_sb.sb_blocksize)
881		return -EINVAL;
882
883	start = BTOBB(range.start);
884	end = start + BTOBBT(range.len) - 1;
 
885
886	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
887		error = xfs_trim_datadev_extents(mp, start, end, minlen);
888		if (error)
889			last_error = error;
890	}
891
892	if (rt_bdev && !xfs_trim_should_stop()) {
893		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
 
894		if (error)
895			last_error = error;
896	}
897
898	if (last_error)
899		return last_error;
900
901	range.len = min_t(unsigned long long, range.len,
902			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
903	if (copy_to_user(urange, &range, sizeof(range)))
904		return -EFAULT;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
905	return 0;
906}