Loading...
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_format.h"
20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h"
25#include "xfs_quota.h"
26#include "xfs_inode.h"
27#include "xfs_btree.h"
28#include "xfs_alloc_btree.h"
29#include "xfs_alloc.h"
30#include "xfs_error.h"
31#include "xfs_extent_busy.h"
32#include "xfs_discard.h"
33#include "xfs_trace.h"
34#include "xfs_log.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_daddr_t start,
41 xfs_daddr_t end,
42 xfs_daddr_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_ge(cur, 0,
71 be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82 xfs_daddr_t dbno;
83 xfs_extlen_t dlen;
84
85 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
86 if (error)
87 goto out_del_cursor;
88 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
89 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
90
91 /*
92 * use daddr format for all range/len calculations as that is
93 * the format the range/len variables are supplied in by
94 * userspace.
95 */
96 dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
97 dlen = XFS_FSB_TO_BB(mp, flen);
98
99 /*
100 * Too small? Give up.
101 */
102 if (dlen < minlen) {
103 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
104 goto out_del_cursor;
105 }
106
107 /*
108 * If the extent is entirely outside of the range we are
109 * supposed to discard skip it. Do not bother to trim
110 * down partially overlapping ranges for now.
111 */
112 if (dbno + dlen < start || dbno > end) {
113 trace_xfs_discard_exclude(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 /*
118 * If any blocks in the range are still busy, skip the
119 * discard and try again the next time.
120 */
121 if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
122 trace_xfs_discard_busy(mp, agno, fbno, flen);
123 goto next_extent;
124 }
125
126 trace_xfs_discard_extent(mp, agno, fbno, flen);
127 error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
128 if (error)
129 goto out_del_cursor;
130 *blocks_trimmed += flen;
131
132next_extent:
133 error = xfs_btree_decrement(cur, 0, &i);
134 if (error)
135 goto out_del_cursor;
136 }
137
138out_del_cursor:
139 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
140 xfs_buf_relse(agbp);
141out_put_perag:
142 xfs_perag_put(pag);
143 return error;
144}
145
146/*
147 * trim a range of the filesystem.
148 *
149 * Note: the parameters passed from userspace are byte ranges into the
150 * filesystem which does not match to the format we use for filesystem block
151 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
152 * is a linear address range. Hence we need to use DADDR based conversions and
153 * comparisons for determining the correct offset and regions to trim.
154 */
155int
156xfs_ioc_trim(
157 struct xfs_mount *mp,
158 struct fstrim_range __user *urange)
159{
160 struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
161 unsigned int granularity = q->limits.discard_granularity;
162 struct fstrim_range range;
163 xfs_daddr_t start, end, minlen;
164 xfs_agnumber_t start_agno, end_agno, agno;
165 __uint64_t blocks_trimmed = 0;
166 int error, last_error = 0;
167
168 if (!capable(CAP_SYS_ADMIN))
169 return -XFS_ERROR(EPERM);
170 if (!blk_queue_discard(q))
171 return -XFS_ERROR(EOPNOTSUPP);
172 if (copy_from_user(&range, urange, sizeof(range)))
173 return -XFS_ERROR(EFAULT);
174
175 /*
176 * Truncating down the len isn't actually quite correct, but using
177 * BBTOB would mean we trivially get overflows for values
178 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
179 * used by the fstrim application. In the end it really doesn't
180 * matter as trimming blocks is an advisory interface.
181 */
182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
184 range.len < mp->m_sb.sb_blocksize)
185 return -XFS_ERROR(EINVAL);
186
187 start = BTOBB(range.start);
188 end = start + BTOBBT(range.len) - 1;
189 minlen = BTOBB(max_t(u64, granularity, range.minlen));
190
191 if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
192 end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
193
194 start_agno = xfs_daddr_to_agno(mp, start);
195 end_agno = xfs_daddr_to_agno(mp, end);
196
197 for (agno = start_agno; agno <= end_agno; agno++) {
198 error = -xfs_trim_extents(mp, agno, start, end, minlen,
199 &blocks_trimmed);
200 if (error)
201 last_error = error;
202 }
203
204 if (last_error)
205 return last_error;
206
207 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
208 if (copy_to_user(urange, &range, sizeof(range)))
209 return -XFS_ERROR(EFAULT);
210 return 0;
211}
212
213int
214xfs_discard_extents(
215 struct xfs_mount *mp,
216 struct list_head *list)
217{
218 struct xfs_extent_busy *busyp;
219 int error = 0;
220
221 list_for_each_entry(busyp, list, list) {
222 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
223 busyp->length);
224
225 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
226 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
227 XFS_FSB_TO_BB(mp, busyp->length),
228 GFP_NOFS, 0);
229 if (error && error != EOPNOTSUPP) {
230 xfs_info(mp,
231 "discard failed for extent [0x%llu,%u], error %d",
232 (unsigned long long)busyp->bno,
233 busyp->length,
234 error);
235 return error;
236 }
237 }
238
239 return 0;
240}
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2010, 2023 Red Hat, Inc.
4 * All Rights Reserved.
5 */
6#include "xfs.h"
7#include "xfs_shared.h"
8#include "xfs_format.h"
9#include "xfs_log_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_trans.h"
12#include "xfs_mount.h"
13#include "xfs_btree.h"
14#include "xfs_alloc_btree.h"
15#include "xfs_alloc.h"
16#include "xfs_discard.h"
17#include "xfs_error.h"
18#include "xfs_extent_busy.h"
19#include "xfs_trace.h"
20#include "xfs_log.h"
21#include "xfs_ag.h"
22#include "xfs_health.h"
23#include "xfs_rtbitmap.h"
24#include "xfs_rtgroup.h"
25
26/*
27 * Notes on an efficient, low latency fstrim algorithm
28 *
29 * We need to walk the filesystem free space and issue discards on the free
30 * space that meet the search criteria (size and location). We cannot issue
31 * discards on extents that might be in use, or are so recently in use they are
32 * still marked as busy. To serialise against extent state changes whilst we are
33 * gathering extents to trim, we must hold the AGF lock to lock out other
34 * allocations and extent free operations that might change extent state.
35 *
36 * However, we cannot just hold the AGF for the entire AG free space walk whilst
37 * we issue discards on each free space that is found. Storage devices can have
38 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
39 * couple of million free extents and issuing synchronous discards on each
40 * extent can take a *long* time. Whilst we are doing this walk, nothing else
41 * can access the AGF, and we can stall transactions and hence the log whilst
42 * modifications wait for the AGF lock to be released. This can lead to hung tasks
43 * kicking the hung task timer and rebooting the system. This is bad.
44 *
45 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
46 * lock, gathers a range of inode cluster buffers that are allocated, drops the
47 * AGI lock and then reads all the inode cluster buffers and processes them. It
48 * loops doing this, using a cursor to keep track of where it is up to in the AG
49 * for each iteration to restart the INOBT lookup from.
50 *
51 * We can't do this exactly with free space - once we drop the AGF lock, the
52 * state of the free extent is out of our control and we cannot run a discard
53 * safely on it in this situation. Unless, of course, we've marked the free
54 * extent as busy and undergoing a discard operation whilst we held the AGF
55 * locked.
56 *
57 * This is exactly how online discard works - free extents are marked busy when
58 * they are freed, and once the extent free has been committed to the journal,
59 * the busy extent record is marked as "undergoing discard" and the discard is
60 * then issued on the free extent. Once the discard completes, the busy extent
61 * record is removed and the extent is able to be allocated again.
62 *
63 * In the context of fstrim, if we find a free extent we need to discard, we
64 * don't have to discard it immediately. All we need to do is record that free
65 * extent as being busy and under discard, and all the allocation routines will
66 * now avoid trying to allocate it. Hence if we mark the extent as busy under
67 * the AGF lock, we can safely discard it without holding the AGF lock because
68 * nothing will attempt to allocate that free space until the discard completes.
69 *
70 * This also allows us to issue discards asynchronously like we do with online
71 * discard, and so for fast devices fstrim will run much faster as we can have
72 * multiple discard operations in flight at once, as well as pipeline the free
73 * extent search so that it overlaps in flight discard IO.
74 */
75
/*
 * Number of free extent records examined in one gather pass before the
 * gathered batch is handed off for discard.
 */
#define XFS_DISCARD_MAX_EXAMINE	(100)

/* Workqueue on which discard bio completions are finished from a thread. */
struct workqueue_struct	*xfs_discard_wq;
79
80static void
81xfs_discard_endio_work(
82 struct work_struct *work)
83{
84 struct xfs_busy_extents *extents =
85 container_of(work, struct xfs_busy_extents, endio_work);
86
87 xfs_extent_busy_clear(&extents->extent_list, false);
88 kfree(extents->owner);
89}
90
91/*
92 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
93 * pagb_lock.
94 */
95static void
96xfs_discard_endio(
97 struct bio *bio)
98{
99 struct xfs_busy_extents *extents = bio->bi_private;
100
101 INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
102 queue_work(xfs_discard_wq, &extents->endio_work);
103 bio_put(bio);
104}
105
106static inline struct block_device *
107xfs_group_bdev(
108 const struct xfs_group *xg)
109{
110 struct xfs_mount *mp = xg->xg_mount;
111
112 switch (xg->xg_type) {
113 case XG_TYPE_AG:
114 return mp->m_ddev_targp->bt_bdev;
115 case XG_TYPE_RTG:
116 return mp->m_rtdev_targp->bt_bdev;
117 default:
118 ASSERT(0);
119 break;
120 }
121 return NULL;
122}
123
124/*
125 * Walk the discard list and issue discards on all the busy extents in the
126 * list. We plug and chain the bios so that we only need a single completion
127 * call to clear all the busy extents once the discards are complete.
128 */
129int
130xfs_discard_extents(
131 struct xfs_mount *mp,
132 struct xfs_busy_extents *extents)
133{
134 struct xfs_extent_busy *busyp;
135 struct bio *bio = NULL;
136 struct blk_plug plug;
137 int error = 0;
138
139 blk_start_plug(&plug);
140 list_for_each_entry(busyp, &extents->extent_list, list) {
141 trace_xfs_discard_extent(busyp->group, busyp->bno,
142 busyp->length);
143
144 error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
145 xfs_gbno_to_daddr(busyp->group, busyp->bno),
146 XFS_FSB_TO_BB(mp, busyp->length),
147 GFP_KERNEL, &bio);
148 if (error && error != -EOPNOTSUPP) {
149 xfs_info(mp,
150 "discard failed for extent [0x%llx,%u], error %d",
151 (unsigned long long)busyp->bno,
152 busyp->length,
153 error);
154 break;
155 }
156 }
157
158 if (bio) {
159 bio->bi_private = extents;
160 bio->bi_end_io = xfs_discard_endio;
161 submit_bio(bio);
162 } else {
163 xfs_discard_endio_work(&extents->endio_work);
164 }
165 blk_finish_plug(&plug);
166
167 return error;
168}
169
170struct xfs_trim_cur {
171 xfs_agblock_t start;
172 xfs_extlen_t count;
173 xfs_agblock_t end;
174 xfs_extlen_t minlen;
175 bool by_bno;
176};
177
178static int
179xfs_trim_gather_extents(
180 struct xfs_perag *pag,
181 struct xfs_trim_cur *tcur,
182 struct xfs_busy_extents *extents)
183{
184 struct xfs_mount *mp = pag_mount(pag);
185 struct xfs_trans *tp;
186 struct xfs_btree_cur *cur;
187 struct xfs_buf *agbp;
188 int error;
189 int i;
190 int batch = XFS_DISCARD_MAX_EXAMINE;
191
192 /*
193 * Force out the log. This means any transactions that might have freed
194 * space before we take the AGF buffer lock are now on disk, and the
195 * volatile disk cache is flushed.
196 */
197 xfs_log_force(mp, XFS_LOG_SYNC);
198
199 error = xfs_trans_alloc_empty(mp, &tp);
200 if (error)
201 return error;
202
203 error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
204 if (error)
205 goto out_trans_cancel;
206
207 if (tcur->by_bno) {
208 /* sub-AG discard request always starts at tcur->start */
209 cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
210 error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
211 if (!error && !i)
212 error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
213 } else if (tcur->start == 0) {
214 /* first time through a by-len starts with max length */
215 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
216 error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
217 } else {
218 /* nth time through a by-len starts where we left off */
219 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
220 error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
221 }
222 if (error)
223 goto out_del_cursor;
224 if (i == 0) {
225 /* nothing of that length left in the AG, we are done */
226 tcur->count = 0;
227 goto out_del_cursor;
228 }
229
230 /*
231 * Loop until we are done with all extents that are large
232 * enough to be worth discarding or we hit batch limits.
233 */
234 while (i) {
235 xfs_agblock_t fbno;
236 xfs_extlen_t flen;
237
238 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
239 if (error)
240 break;
241 if (XFS_IS_CORRUPT(mp, i != 1)) {
242 xfs_btree_mark_sick(cur);
243 error = -EFSCORRUPTED;
244 break;
245 }
246
247 if (--batch <= 0) {
248 /*
249 * Update the cursor to point at this extent so we
250 * restart the next batch from this extent.
251 */
252 tcur->start = fbno;
253 tcur->count = flen;
254 break;
255 }
256
257 /*
258 * If the extent is entirely outside of the range we are
259 * supposed to skip it. Do not bother to trim down partially
260 * overlapping ranges for now.
261 */
262 if (fbno + flen < tcur->start) {
263 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
264 goto next_extent;
265 }
266 if (fbno > tcur->end) {
267 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
268 if (tcur->by_bno) {
269 tcur->count = 0;
270 break;
271 }
272 goto next_extent;
273 }
274
275 /* Trim the extent returned to the range we want. */
276 if (fbno < tcur->start) {
277 flen -= tcur->start - fbno;
278 fbno = tcur->start;
279 }
280 if (fbno + flen > tcur->end + 1)
281 flen = tcur->end - fbno + 1;
282
283 /* Too small? Give up. */
284 if (flen < tcur->minlen) {
285 trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
286 if (tcur->by_bno)
287 goto next_extent;
288 tcur->count = 0;
289 break;
290 }
291
292 /*
293 * If any blocks in the range are still busy, skip the
294 * discard and try again the next time.
295 */
296 if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
297 trace_xfs_discard_busy(pag_group(pag), fbno, flen);
298 goto next_extent;
299 }
300
301 xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
302 &extents->extent_list);
303next_extent:
304 if (tcur->by_bno)
305 error = xfs_btree_increment(cur, 0, &i);
306 else
307 error = xfs_btree_decrement(cur, 0, &i);
308 if (error)
309 break;
310
311 /*
312 * If there's no more records in the tree, we are done. Set the
313 * cursor block count to 0 to indicate to the caller that there
314 * is no more extents to search.
315 */
316 if (i == 0)
317 tcur->count = 0;
318 }
319
320 /*
321 * If there was an error, release all the gathered busy extents because
322 * we aren't going to issue a discard on them any more.
323 */
324 if (error)
325 xfs_extent_busy_clear(&extents->extent_list, false);
326out_del_cursor:
327 xfs_btree_del_cursor(cur, error);
328out_trans_cancel:
329 xfs_trans_cancel(tp);
330 return error;
331}
332
333static bool
334xfs_trim_should_stop(void)
335{
336 return fatal_signal_pending(current) || freezing(current);
337}
338
339/*
340 * Iterate the free list gathering extents and discarding them. We need a cursor
341 * for the repeated iteration of gather/discard loop, so use the longest extent
342 * we found in the last batch as the key to start the next.
343 */
344static int
345xfs_trim_perag_extents(
346 struct xfs_perag *pag,
347 xfs_agblock_t start,
348 xfs_agblock_t end,
349 xfs_extlen_t minlen)
350{
351 struct xfs_trim_cur tcur = {
352 .start = start,
353 .count = pag->pagf_longest,
354 .end = end,
355 .minlen = minlen,
356 };
357 int error = 0;
358
359 if (start != 0 || end != pag_group(pag)->xg_block_count)
360 tcur.by_bno = true;
361
362 do {
363 struct xfs_busy_extents *extents;
364
365 extents = kzalloc(sizeof(*extents), GFP_KERNEL);
366 if (!extents) {
367 error = -ENOMEM;
368 break;
369 }
370
371 extents->owner = extents;
372 INIT_LIST_HEAD(&extents->extent_list);
373
374 error = xfs_trim_gather_extents(pag, &tcur, extents);
375 if (error) {
376 kfree(extents);
377 break;
378 }
379
380 /*
381 * We hand the extent list to the discard function here so the
382 * discarded extents can be removed from the busy extent list.
383 * This allows the discards to run asynchronously with gathering
384 * the next round of extents to discard.
385 *
386 * However, we must ensure that we do not reference the extent
387 * list after this function call, as it may have been freed by
388 * the time control returns to us.
389 */
390 error = xfs_discard_extents(pag_mount(pag), extents);
391 if (error)
392 break;
393
394 if (xfs_trim_should_stop())
395 break;
396
397 } while (tcur.count != 0);
398
399 return error;
400
401}
402
403static int
404xfs_trim_datadev_extents(
405 struct xfs_mount *mp,
406 xfs_daddr_t start,
407 xfs_daddr_t end,
408 xfs_extlen_t minlen)
409{
410 xfs_agnumber_t start_agno, end_agno;
411 xfs_agblock_t start_agbno, end_agbno;
412 struct xfs_perag *pag = NULL;
413 xfs_daddr_t ddev_end;
414 int last_error = 0, error;
415
416 ddev_end = min_t(xfs_daddr_t, end,
417 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
418
419 start_agno = xfs_daddr_to_agno(mp, start);
420 start_agbno = xfs_daddr_to_agbno(mp, start);
421 end_agno = xfs_daddr_to_agno(mp, ddev_end);
422 end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
423
424 while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
425 xfs_agblock_t agend = pag_group(pag)->xg_block_count;
426
427 if (pag_agno(pag) == end_agno)
428 agend = end_agbno;
429 error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
430 if (error)
431 last_error = error;
432
433 if (xfs_trim_should_stop()) {
434 xfs_perag_rele(pag);
435 break;
436 }
437 start_agbno = 0;
438 }
439
440 return last_error;
441}
442
443#ifdef CONFIG_XFS_RT
444struct xfs_trim_rtdev {
445 /* list of rt extents to free */
446 struct list_head extent_list;
447
448 /* minimum length that caller allows us to trim */
449 xfs_rtblock_t minlen_fsb;
450
451 /* restart point for the rtbitmap walk */
452 xfs_rtxnum_t restart_rtx;
453
454 /* stopping point for the current rtbitmap walk */
455 xfs_rtxnum_t stop_rtx;
456};
457
458struct xfs_rtx_busy {
459 struct list_head list;
460 xfs_rtblock_t bno;
461 xfs_rtblock_t length;
462};
463
464static void
465xfs_discard_free_rtdev_extents(
466 struct xfs_trim_rtdev *tr)
467{
468 struct xfs_rtx_busy *busyp, *n;
469
470 list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
471 list_del_init(&busyp->list);
472 kfree(busyp);
473 }
474}
475
476/*
477 * Walk the discard list and issue discards on all the busy extents in the
478 * list. We plug and chain the bios so that we only need a single completion
479 * call to clear all the busy extents once the discards are complete.
480 */
481static int
482xfs_discard_rtdev_extents(
483 struct xfs_mount *mp,
484 struct xfs_trim_rtdev *tr)
485{
486 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
487 struct xfs_rtx_busy *busyp;
488 struct bio *bio = NULL;
489 struct blk_plug plug;
490 xfs_rtblock_t start = NULLRTBLOCK, length = 0;
491 int error = 0;
492
493 blk_start_plug(&plug);
494 list_for_each_entry(busyp, &tr->extent_list, list) {
495 if (start == NULLRTBLOCK)
496 start = busyp->bno;
497 length += busyp->length;
498
499 trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
500
501 error = __blkdev_issue_discard(bdev,
502 xfs_rtb_to_daddr(mp, busyp->bno),
503 XFS_FSB_TO_BB(mp, busyp->length),
504 GFP_NOFS, &bio);
505 if (error)
506 break;
507 }
508 xfs_discard_free_rtdev_extents(tr);
509
510 if (bio) {
511 error = submit_bio_wait(bio);
512 if (error == -EOPNOTSUPP)
513 error = 0;
514 if (error)
515 xfs_info(mp,
516 "discard failed for rtextent [0x%llx,%llu], error %d",
517 (unsigned long long)start,
518 (unsigned long long)length,
519 error);
520 bio_put(bio);
521 }
522 blk_finish_plug(&plug);
523
524 return error;
525}
526
527static int
528xfs_trim_gather_rtextent(
529 struct xfs_rtgroup *rtg,
530 struct xfs_trans *tp,
531 const struct xfs_rtalloc_rec *rec,
532 void *priv)
533{
534 struct xfs_trim_rtdev *tr = priv;
535 struct xfs_rtx_busy *busyp;
536 xfs_rtblock_t rbno, rlen;
537
538 if (rec->ar_startext > tr->stop_rtx) {
539 /*
540 * If we've scanned a large number of rtbitmap blocks, update
541 * the cursor to point at this extent so we restart the next
542 * batch from this extent.
543 */
544 tr->restart_rtx = rec->ar_startext;
545 return -ECANCELED;
546 }
547
548 rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
549 rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
550
551 /* Ignore too small. */
552 if (rlen < tr->minlen_fsb) {
553 trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
554 return 0;
555 }
556
557 busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
558 if (!busyp)
559 return -ENOMEM;
560
561 busyp->bno = rbno;
562 busyp->length = rlen;
563 INIT_LIST_HEAD(&busyp->list);
564 list_add_tail(&busyp->list, &tr->extent_list);
565
566 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
567 return 0;
568}
569
570/* Trim extents on an !rtgroups realtime device */
571static int
572xfs_trim_rtextents(
573 struct xfs_rtgroup *rtg,
574 xfs_rtxnum_t low,
575 xfs_rtxnum_t high,
576 xfs_daddr_t minlen)
577{
578 struct xfs_mount *mp = rtg_mount(rtg);
579 struct xfs_trim_rtdev tr = {
580 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
581 .extent_list = LIST_HEAD_INIT(tr.extent_list),
582 };
583 struct xfs_trans *tp;
584 int error;
585
586 error = xfs_trans_alloc_empty(mp, &tp);
587 if (error)
588 return error;
589
590 /*
591 * Walk the free ranges between low and high. The query_range function
592 * trims the extents returned.
593 */
594 do {
595 tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
596 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
597 error = xfs_rtalloc_query_range(rtg, tp, low, high,
598 xfs_trim_gather_rtextent, &tr);
599
600 if (error == -ECANCELED)
601 error = 0;
602 if (error) {
603 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
604 xfs_discard_free_rtdev_extents(&tr);
605 break;
606 }
607
608 if (list_empty(&tr.extent_list)) {
609 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
610 break;
611 }
612
613 error = xfs_discard_rtdev_extents(mp, &tr);
614 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
615 if (error)
616 break;
617
618 low = tr.restart_rtx;
619 } while (!xfs_trim_should_stop() && low <= high);
620
621 xfs_trans_cancel(tp);
622 return error;
623}
624
625struct xfs_trim_rtgroup {
626 /* list of rtgroup extents to free */
627 struct xfs_busy_extents *extents;
628
629 /* minimum length that caller allows us to trim */
630 xfs_rtblock_t minlen_fsb;
631
632 /* restart point for the rtbitmap walk */
633 xfs_rtxnum_t restart_rtx;
634
635 /* number of extents to examine before stopping to issue discard ios */
636 int batch;
637
638 /* number of extents queued for discard */
639 int queued;
640};
641
642static int
643xfs_trim_gather_rtgroup_extent(
644 struct xfs_rtgroup *rtg,
645 struct xfs_trans *tp,
646 const struct xfs_rtalloc_rec *rec,
647 void *priv)
648{
649 struct xfs_trim_rtgroup *tr = priv;
650 xfs_rgblock_t rgbno;
651 xfs_extlen_t len;
652
653 if (--tr->batch <= 0) {
654 /*
655 * If we've checked a large number of extents, update the
656 * cursor to point at this extent so we restart the next batch
657 * from this extent.
658 */
659 tr->restart_rtx = rec->ar_startext;
660 return -ECANCELED;
661 }
662
663 rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
664 len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
665
666 /* Ignore too small. */
667 if (len < tr->minlen_fsb) {
668 trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
669 return 0;
670 }
671
672 /*
673 * If any blocks in the range are still busy, skip the discard and try
674 * again the next time.
675 */
676 if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
677 trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
678 return 0;
679 }
680
681 xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
682 &tr->extents->extent_list);
683
684 tr->queued++;
685 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
686 return 0;
687}
688
689/* Trim extents in this rtgroup using the busy extent machinery. */
690static int
691xfs_trim_rtgroup_extents(
692 struct xfs_rtgroup *rtg,
693 xfs_rtxnum_t low,
694 xfs_rtxnum_t high,
695 xfs_daddr_t minlen)
696{
697 struct xfs_mount *mp = rtg_mount(rtg);
698 struct xfs_trim_rtgroup tr = {
699 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
700 };
701 struct xfs_trans *tp;
702 int error;
703
704 error = xfs_trans_alloc_empty(mp, &tp);
705 if (error)
706 return error;
707
708 /*
709 * Walk the free ranges between low and high. The query_range function
710 * trims the extents returned.
711 */
712 do {
713 tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
714 if (!tr.extents) {
715 error = -ENOMEM;
716 break;
717 }
718
719 tr.queued = 0;
720 tr.batch = XFS_DISCARD_MAX_EXAMINE;
721 tr.extents->owner = tr.extents;
722 INIT_LIST_HEAD(&tr.extents->extent_list);
723
724 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
725 error = xfs_rtalloc_query_range(rtg, tp, low, high,
726 xfs_trim_gather_rtgroup_extent, &tr);
727 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
728 if (error == -ECANCELED)
729 error = 0;
730 if (error) {
731 kfree(tr.extents);
732 break;
733 }
734
735 if (!tr.queued)
736 break;
737
738 /*
739 * We hand the extent list to the discard function here so the
740 * discarded extents can be removed from the busy extent list.
741 * This allows the discards to run asynchronously with
742 * gathering the next round of extents to discard.
743 *
744 * However, we must ensure that we do not reference the extent
745 * list after this function call, as it may have been freed by
746 * the time control returns to us.
747 */
748 error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
749 if (error)
750 break;
751
752 low = tr.restart_rtx;
753 } while (!xfs_trim_should_stop() && low <= high);
754
755 xfs_trans_cancel(tp);
756 return error;
757}
758
759static int
760xfs_trim_rtdev_extents(
761 struct xfs_mount *mp,
762 xfs_daddr_t start,
763 xfs_daddr_t end,
764 xfs_daddr_t minlen)
765{
766 xfs_rtblock_t start_rtbno, end_rtbno;
767 xfs_rtxnum_t start_rtx, end_rtx;
768 xfs_rgnumber_t start_rgno, end_rgno;
769 xfs_daddr_t daddr_offset;
770 int last_error = 0, error;
771 struct xfs_rtgroup *rtg = NULL;
772
773 /* Shift the start and end downwards to match the rt device. */
774 daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
775 if (start > daddr_offset)
776 start -= daddr_offset;
777 else
778 start = 0;
779 start_rtbno = xfs_daddr_to_rtb(mp, start);
780 start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
781 start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
782
783 if (end <= daddr_offset)
784 return 0;
785 else
786 end -= daddr_offset;
787 end_rtbno = xfs_daddr_to_rtb(mp, end);
788 end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
789 end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
790
791 while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
792 xfs_rtxnum_t rtg_end = rtg->rtg_extents;
793
794 if (rtg_rgno(rtg) == end_rgno)
795 rtg_end = min(rtg_end, end_rtx);
796
797 if (xfs_has_rtgroups(mp))
798 error = xfs_trim_rtgroup_extents(rtg, start_rtx,
799 rtg_end, minlen);
800 else
801 error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
802 minlen);
803 if (error)
804 last_error = error;
805
806 if (xfs_trim_should_stop()) {
807 xfs_rtgroup_rele(rtg);
808 break;
809 }
810 start_rtx = 0;
811 }
812
813 return last_error;
814}
815#else
/* Built without CONFIG_XFS_RT: trimming the rt device is not supported. */
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
817#endif /* CONFIG_XFS_RT */
818
819/*
820 * trim a range of the filesystem.
821 *
822 * Note: the parameters passed from userspace are byte ranges into the
823 * filesystem which does not match to the format we use for filesystem block
824 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
825 * is a linear address range. Hence we need to use DADDR based conversions and
826 * comparisons for determining the correct offset and regions to trim.
827 *
828 * The realtime device is mapped into the FITRIM "address space" immediately
829 * after the data device.
830 */
831int
832xfs_ioc_trim(
833 struct xfs_mount *mp,
834 struct fstrim_range __user *urange)
835{
836 unsigned int granularity =
837 bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
838 struct block_device *rt_bdev = NULL;
839 struct fstrim_range range;
840 xfs_daddr_t start, end;
841 xfs_extlen_t minlen;
842 xfs_rfsblock_t max_blocks;
843 int error, last_error = 0;
844
845 if (!capable(CAP_SYS_ADMIN))
846 return -EPERM;
847 if (mp->m_rtdev_targp &&
848 bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
849 rt_bdev = mp->m_rtdev_targp->bt_bdev;
850 if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
851 return -EOPNOTSUPP;
852
853 if (rt_bdev)
854 granularity = max(granularity,
855 bdev_discard_granularity(rt_bdev));
856
857 /*
858 * We haven't recovered the log, so we cannot use our bnobt-guided
859 * storage zapping commands.
860 */
861 if (xfs_has_norecovery(mp))
862 return -EROFS;
863
864 if (copy_from_user(&range, urange, sizeof(range)))
865 return -EFAULT;
866
867 range.minlen = max_t(u64, granularity, range.minlen);
868 minlen = XFS_B_TO_FSB(mp, range.minlen);
869
870 /*
871 * Truncating down the len isn't actually quite correct, but using
872 * BBTOB would mean we trivially get overflows for values
873 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
874 * used by the fstrim application. In the end it really doesn't
875 * matter as trimming blocks is an advisory interface.
876 */
877 max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
878 if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
879 range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
880 range.len < mp->m_sb.sb_blocksize)
881 return -EINVAL;
882
883 start = BTOBB(range.start);
884 end = start + BTOBBT(range.len) - 1;
885
886 if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
887 error = xfs_trim_datadev_extents(mp, start, end, minlen);
888 if (error)
889 last_error = error;
890 }
891
892 if (rt_bdev && !xfs_trim_should_stop()) {
893 error = xfs_trim_rtdev_extents(mp, start, end, minlen);
894 if (error)
895 last_error = error;
896 }
897
898 if (last_error)
899 return last_error;
900
901 range.len = min_t(unsigned long long, range.len,
902 XFS_FSB_TO_B(mp, max_blocks) - range.start);
903 if (copy_to_user(urange, &range, sizeof(range)))
904 return -EFAULT;
905 return 0;
906}