// SPDX-License-Identifier: GPL-2.0

#include "backref.h"
#include "btrfs_inode.h"
#include "fiemap.h"
#include "file.h"
#include "file-item.h"

struct btrfs_fiemap_entry {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
};

/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range in the inode's io tree, release the subvolume tree search path,
 * flush the fiemap cache and then relock the file range and re-search the
 * subvolume tree. The value is some negative number that can't be confused
 * with a valid errno value, and it's different from 1 because 1 is also a
 * return value of fiemap_fill_next_extent() and is often used to mean that a
 * btree search did not find a key, so make it a distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer ensures that in case the fiemap buffer
 *   is memory mapped to the fiemap target file, we don't deadlock during
 *   btrfs_page_mkwrite(). This is because during fiemap we lock an extent
 *   range in order to prevent races with delalloc flushing and ordered
 *   extent completion, which is needed to reliably detect delalloc in holes
 *   and prealloc extents. And this can lead to a deadlock if the fiemap
 *   buffer is memory mapped to the file we are running fiemap against (a
 *   silly scenario, useless in practice, but possible), because
 *   btrfs_page_mkwrite() will try to lock the same extent range. The sketch
 *   after the structure below illustrates that scenario.
 */
struct fiemap_cache {
	/* An array of ready fiemap entries. */
	struct btrfs_fiemap_entry *entries;
	/* Number of entries in the entries array. */
	int entries_size;
	/* Index of the next entry in the entries array to write to. */
	int entries_pos;
	/*
	 * Once the entries array is full, this indicates the offset of the
	 * next file extent item we must search for in the inode's subvolume
	 * tree after unlocking the extent range in the inode's io tree and
	 * releasing the search path.
	 */
	u64 next_search_offset;
	/*
	 * This matches struct fiemap_extent_info::fi_mapped_extents. We use it
	 * to count the extents we have emitted ourselves and stop, instead of
	 * relying on fiemap_fill_next_extent(), because we buffer ready fiemap
	 * entries in the @entries array and we want to stop as soon as we hit
	 * the maximum number of extents to map, not just to save time but also
	 * to make the logic at extent_fiemap() simpler.
	 */
	unsigned int extents_mapped;
	/* Fields for the cached extent (not yet submitted, i.e. not ready). */
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};
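
/*
 * Illustration only: a hypothetical user-space sketch (not kernel code) of
 * the deadlock scenario described above, with the fiemap buffer living inside
 * a mapping of the very file being mapped - the path and sizes are made-up
 * example values:
 *
 *	int fd = open("/mnt/btrfs/foo", O_RDWR);
 *	struct fiemap *fm = mmap(NULL, 65536, PROT_READ | PROT_WRITE,
 *				 MAP_SHARED, fd, 0);
 *
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = 128;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 *
 * Filling in the results faults the mapped pages, so without the intermediary
 * buffer btrfs_page_mkwrite() could run while fiemap holds the extent range
 * locked and try to lock that same range.
 */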

static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	for (int i = 0; i < cache->entries_pos; i++) {
		struct btrfs_fiemap_entry *entry = &cache->entries[i];
		int ret;

		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
					      entry->phys, entry->len,
					      entry->flags);
		/*
		 * Ignore 1 (reached max entries) because we keep track of that
		 * ourselves in emit_fiemap_extent().
		 */
		if (ret < 0)
			return ret;
	}
	cache->entries_pos = 0;

	return 0;
}

/*
 * Helper to submit a fiemap extent.
 *
 * It tries to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one. Only when merging fails is the cached
 * extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent(), except that it can
 * also return BTRFS_FIEMAP_FLUSH_CACHE when the intermediary entries buffer
 * is full.
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
				struct fiemap_cache *cache,
				u64 offset, u64 phys, u64 len, u32 flags)
{
	struct btrfs_fiemap_entry *entry;
	u64 cache_end;

	/* Set at the end of extent_fiemap(). */
	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

	if (!cache->cached)
		goto assign;

	/*
	 * When iterating the extents of the inode, at extent_fiemap(), we may
	 * find an extent that starts at an offset behind the end offset of the
	 * previous extent we processed. This happens if fiemap is called
	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
	 * after we had to unlock the file range, release the search path, emit
	 * the fiemap extents stored in the buffer (cache->entries array) and
	 * then lock the remainder of the range and re-search the btree.
	 *
	 * For example, say we are in leaf X processing its last item, which is
	 * the file extent item for the file range [512K, 1M[, and after
	 * btrfs_next_leaf() releases the path, an ordered extent completes for
	 * the file range [768K, 2M[, and that results in trimming the file
	 * extent item so that it now corresponds to the file range [512K,
	 * 768K[ and a new file extent item is inserted for the file range
	 * [768K, 2M[, which may end up as the last item of leaf X or as the
	 * first item of the next leaf - in either case btrfs_next_leaf() will
	 * leave us with a path pointing to the new extent item, for the file
	 * range [768K, 2M[, since that's the first key that follows the last
	 * one we processed. So in order not to report overlapping extents to
	 * user space, we trim the length of the previously cached extent and
	 * emit it.
	 *
	 * Upon calling btrfs_next_leaf() we may also find an extent with an
	 * offset smaller than or equal to cache->offset, and this happens when
	 * we had a hole or prealloc extent with several delalloc ranges in it,
	 * but after btrfs_next_leaf() released the path, delalloc was flushed
	 * and the resulting ordered extents were completed, so we can now have
	 * found a file extent item for an offset that is smaller than or equal
	 * to what we have in cache->offset. We deal with this as described
	 * below.
	 */
	cache_end = cache->offset + cache->len;
	if (cache_end > offset) {
		if (offset == cache->offset) {
			/*
			 * We cached a delalloc range (found in the io tree) for
			 * a hole or prealloc extent and we have now found a
			 * file extent item for the same offset. What we have
			 * now is more recent and up to date, so discard what
			 * we had in the cache and use what we have just found.
			 */
			goto assign;
		} else if (offset > cache->offset) {
			/*
			 * The extent range we previously found ends after the
			 * offset of the file extent item we found and that
			 * offset falls somewhere in the middle of that previous
			 * extent range. So adjust the range we previously found
			 * to end at the offset of the file extent item we have
			 * just found, since this extent is more up to date.
			 * Emit that adjusted range and cache the file extent
			 * item we have just found. This corresponds to the case
			 * where a previously found file extent item was split
			 * due to an ordered extent completing.
			 */
			cache->len = offset - cache->offset;
			goto emit;
		} else {
			const u64 range_end = offset + len;

			/*
			 * The offset of the file extent item we have just found
			 * is behind the cached offset. This means we were
			 * processing a hole or prealloc extent for which we
			 * have found delalloc ranges (in the io tree), so what
			 * we have in the cache is the last delalloc range we
			 * found while the file extent item we found can be
			 * either for a whole delalloc range we previously
			 * emitted or only a part of that range.
			 *
			 * We have two cases here:
			 *
			 * 1) The file extent item's range ends at or behind the
			 *    cached extent's end. In this case just ignore the
			 *    current file extent item because we don't want to
			 *    overlap with previous ranges that may have been
			 *    emitted already;
			 *
			 * 2) The file extent item starts behind the currently
			 *    cached extent but its end offset goes beyond the
			 *    end offset of the cached extent. We don't want to
			 *    overlap with a previous range that may have been
			 *    emitted already, so we emit the currently cached
			 *    extent and then partially store the current file
			 *    extent item's range in the cache, for the subrange
			 *    going from the cached extent's end to the end of
			 *    the file extent item.
			 */
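			/*
			 * A worked example of case 2, with illustrative
			 * values: cached extent [0, 256K[ and a file extent
			 * item for [128K, 512K[. We emit [0, 256K[ and cache
			 * [256K, 512K[, and for a plain (non-encoded,
			 * non-delalloc) extent the physical address below is
			 * advanced by cache_end - offset = 128K so that it
			 * matches the trimmed logical subrange.
			 */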
			if (range_end <= cache_end)
				return 0;

			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
				phys += cache_end - offset;

			offset = cache_end;
			len = range_end - cache_end;
			goto emit;
		}
	}

	/*
	 * Only merge fiemap extents if:
	 *
	 * 1) Their logical addresses are continuous;
	 *
	 * 2) Their physical addresses are continuous, so truly compressed
	 *    (physical size smaller than logical size) extents won't get
	 *    merged with each other;
	 *
	 * 3) They share the same flags.
	 */
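	/*
	 * A worked example with illustrative values: a cached extent with
	 * offset 0, len 128K, phys 1M and flags 0 merges with a new extent at
	 * offset 128K, phys 1M + 128K and flags 0, growing the cache to
	 * [0, 256K[. A compressed extent fails check 2, since its physical
	 * size is smaller than its logical size, and is therefore emitted on
	 * its own.
	 */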
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    cache->flags == flags) {
		cache->len += len;
		return 0;
	}

emit:
	/* Not mergeable, need to submit the cached one. */

	if (cache->entries_pos == cache->entries_size) {
		/*
		 * We will need to re-search starting from the end offset of
		 * the last stored extent and not from the current offset,
		 * because after unlocking the range and releasing the path,
		 * if there's a hole between that end offset and this current
		 * offset, a new extent may have been inserted due to a new
		 * write, so we don't want to miss it.
		 */
		entry = &cache->entries[cache->entries_size - 1];
		cache->next_search_offset = entry->offset + entry->len;
		cache->cached = false;

		return BTRFS_FIEMAP_FLUSH_CACHE;
	}

	entry = &cache->entries[cache->entries_pos];
	entry->offset = cache->offset;
	entry->phys = cache->phys;
	entry->len = cache->len;
	entry->flags = cache->flags;
	cache->entries_pos++;
	cache->extents_mapped++;

	if (cache->extents_mapped == fieinfo->fi_extents_max) {
		cache->cached = false;
		return 1;
	}
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;

	return 0;
}

/*
 * Emit the last cached fiemap extent.
 *
 * The last fiemap extent may still be cached in the following case:
 * 0		      4k		    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct extent_buffer *clone = path->nodes[0];
	struct btrfs_key key;
	int slot;
	int ret;

	path->slots[0]++;
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
		return 0;

	/*
	 * Add a temporary extra ref to an already cloned extent buffer to
	 * prevent btrfs_next_leaf() from freeing it - we want to reuse it to
	 * avoid the cost of allocating a new one.
	 */
	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
	atomic_inc(&clone->refs);

	ret = btrfs_next_leaf(inode->root, path);
	if (ret != 0)
		goto out;

	/*
	 * Don't bother with cloning if there are no more file extent items for
	 * our inode.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = 1;
		goto out;
	}

	/*
	 * Important to preserve the start field, for the optimizations when
	 * checking if extents are shared (see extent_fiemap()).
	 *
	 * We must set ->start before calling copy_extent_buffer_full().  If we
	 * are on sub-pagesize blocksize, we use ->start to determine the offset
	 * into the folio where our eb exists, and if we update ->start after
	 * the fact then any subsequent reads of the eb may read from a
	 * different offset in the folio than where we originally copied into.
	 */
	clone->start = path->nodes[0]->start;
	/* See the comment at fiemap_search_slot() about why we clone. */
	copy_extent_buffer_full(clone, path->nodes[0]);

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;
out:
	if (ret)
		free_extent_buffer(clone);

	return ret;
}

/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
			      u64 file_offset)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *clone;
	struct btrfs_key key;
	int slot;
	int ret;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = file_offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret != 0)
			return ret;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			return 1;
	}

	/*
	 * We clone the leaf and use it during fiemap. This is because while
	 * using the leaf we do expensive things like checking if an extent is
	 * shared, which can take a long time. In order to prevent blocking
	 * other tasks for too long, we use a clone of the leaf. We have locked
	 * the file range in the inode's io tree, so we know none of our file
	 * extent items can change. This way we avoid blocking other tasks that
	 * want to insert items for other inodes in the same leaf or b+tree
	 * rebalance operations (triggered for example when someone tries to
	 * push items into this leaf while inserting an item in a neighbour
	 * leaf).
	 * We also need the private clone because holding a read lock on an
	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
	 * when we check if extents are shared, as backref walking may need to
	 * lock the same leaf we are processing.
	 */
	clone = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!clone)
		return -ENOMEM;

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;

	return 0;
}

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
static int fiemap_process_hole(struct btrfs_inode *inode,
			       struct fiemap_extent_info *fieinfo,
			       struct fiemap_cache *cache,
			       struct extent_state **delalloc_cached_state,
			       struct btrfs_backref_share_check_ctx *backref_ctx,
			       u64 disk_bytenr, u64 extent_offset,
			       u64 extent_gen,
			       u64 start, u64 end)
{
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 cur_offset = start;
	u64 last_delalloc_end = 0;
	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
	bool checked_extent_shared = false;
	int ret;

	/*
	 * There can be no delalloc past i_size, so don't waste time looking for
	 * it beyond i_size.
	 */
	while (cur_offset < end && cur_offset < i_size) {
		u64 delalloc_start;
		u64 delalloc_end;
		u64 prealloc_start;
		u64 prealloc_len = 0;
		bool delalloc;

		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
							delalloc_cached_state,
							&delalloc_start,
							&delalloc_end);
		if (!delalloc)
			break;

		/*
		 * If this is a prealloc extent we have to report every section
		 * of it that has no delalloc.
		 */
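		/*
		 * For example, with illustrative values: a prealloc extent
		 * covering [0, 1M[ with a single delalloc range [256K, 512K[
		 * is reported as three fiemap extents: prealloc [0, 256K[,
		 * delalloc [256K, 512K[ and, via the code after this loop,
		 * prealloc [512K, 1M[.
		 */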
		if (disk_bytenr != 0) {
			if (last_delalloc_end == 0) {
				prealloc_start = start;
				prealloc_len = delalloc_start - start;
			} else {
				prealloc_start = last_delalloc_end + 1;
				prealloc_len = delalloc_start - prealloc_start;
			}
		}

		if (prealloc_len > 0) {
			if (!checked_extent_shared && fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					return ret;
				else if (ret > 0)
					prealloc_flags |= FIEMAP_EXTENT_SHARED;

				checked_extent_shared = true;
			}
			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
						 disk_bytenr + extent_offset,
						 prealloc_len, prealloc_flags);
			if (ret)
				return ret;
			extent_offset += prealloc_len;
		}

		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
					 delalloc_end + 1 - delalloc_start,
					 FIEMAP_EXTENT_DELALLOC |
					 FIEMAP_EXTENT_UNKNOWN);
		if (ret)
			return ret;

		last_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		extent_offset += cur_offset - delalloc_start;
		cond_resched();
	}

	/*
	 * Either we found no delalloc for the whole prealloc extent or we have
	 * a prealloc extent that spans i_size or starts at or after i_size.
	 */
	if (disk_bytenr != 0 && last_delalloc_end < end) {
		u64 prealloc_start;
		u64 prealloc_len;

		if (last_delalloc_end == 0) {
			prealloc_start = start;
			prealloc_len = end + 1 - start;
		} else {
			prealloc_start = last_delalloc_end + 1;
			prealloc_len = end + 1 - prealloc_start;
		}

		if (!checked_extent_shared && fieinfo->fi_extents_max) {
			ret = btrfs_is_data_extent_shared(inode,
							  disk_bytenr,
							  extent_gen,
							  backref_ctx);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				prealloc_flags |= FIEMAP_EXTENT_SHARED;
		}
		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
					 disk_bytenr + extent_offset,
					 prealloc_len, prealloc_flags);
		if (ret)
			return ret;
	}

	return 0;
}

static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  u64 *last_extent_end_ret)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	u64 disk_bytenr;
	int ret;

	/*
	 * Lookup the last file extent. We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
	/* There can't be a file extent item at offset (u64)-1 */
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	/*
	 * For a non-existing key, btrfs_search_slot() always leaves us at a
	 * slot > 0, except if the btree is empty, which is impossible because
	 * at least it has the inode item for this inode and all the items for
	 * the root inode 256.
	 */
	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* No file extent items in the subvolume tree. */
		*last_extent_end_ret = 0;
		return 0;
	}

	/*
	 * For an inline extent, the disk_bytenr is where the inline data
	 * starts, so first check if we have an inline extent item before
	 * checking if we have an implicit hole (disk_bytenr == 0).
	 */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
		*last_extent_end_ret = btrfs_file_extent_end(path);
		return 0;
	}

	/*
	 * Find the last file extent item that is not a hole (when NO_HOLES is
	 * not enabled). This should take at most 2 iterations in the worst
	 * case: we have one hole file extent item at slot 0 of a leaf and
	 * another hole file extent item as the last item in the previous leaf.
	 * This is because we merge file extent items that represent holes.
	 */
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	while (disk_bytenr == 0) {
		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
		if (ret < 0) {
			return ret;
		} else if (ret > 0) {
			/* No file extent items that are not holes. */
			*last_extent_end_ret = 0;
			return 0;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	}

	*last_extent_end_ret = btrfs_file_extent_end(path);
	return 0;
}

static int extent_fiemap(struct btrfs_inode *inode,
			 struct fiemap_extent_info *fieinfo,
			 u64 start, u64 len)
{
	const u64 ino = btrfs_ino(inode);
	struct extent_state *cached_state = NULL;
	struct extent_state *delalloc_cached_state = NULL;
	struct btrfs_path *path;
	struct fiemap_cache cache = { 0 };
	struct btrfs_backref_share_check_ctx *backref_ctx;
	u64 last_extent_end = 0;
	u64 prev_extent_end;
	u64 range_start;
	u64 range_end;
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	bool stopped = false;
	int ret;

	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
	cache.entries = kmalloc_array(cache.entries_size,
				      sizeof(struct btrfs_fiemap_entry),
				      GFP_KERNEL);
	backref_ctx = btrfs_alloc_backref_share_check_ctx();
	path = btrfs_alloc_path();
	if (!cache.entries || !backref_ctx || !path) {
		ret = -ENOMEM;
		goto out;
	}

restart:
	range_start = round_down(start, sectorsize);
	range_end = round_up(start + len, sectorsize);
	prev_extent_end = range_start;

	lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
	if (ret < 0)
		goto out_unlock;
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	ret = fiemap_search_slot(inode, path, range_start);
	if (ret < 0) {
		goto out_unlock;
	} else if (ret > 0) {
		/*
		 * No file extent item found, but we may have delalloc between
		 * the current offset and i_size. So check for that.
		 */
		ret = 0;
		goto check_eof_delalloc;
	}

	while (prev_extent_end < range_end) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *ei;
		struct btrfs_key key;
		u64 extent_end;
		u64 extent_len;
		u64 extent_offset = 0;
		u64 extent_gen;
		u64 disk_bytenr = 0;
		u64 flags = 0;
		int extent_type;
		u8 compression;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * The first iteration can leave us at an extent item that ends
		 * before our range's start. Move to the next item.
		 */
		if (extent_end <= range_start)
			goto next_item;

		backref_ctx->curr_leaf_bytenr = leaf->start;

		/* We have an implicit hole (NO_HOLES feature enabled). */
		if (prev_extent_end < key.offset) {
			const u64 hole_end = min(key.offset, range_end) - 1;

			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  prev_extent_end, hole_end);
			if (ret < 0) {
				goto out_unlock;
			} else if (ret > 0) {
				/* fiemap_fill_next_extent() told us to stop. */
				stopped = true;
				break;
			}

			/* We've reached the end of the fiemap range, stop. */
			if (key.offset >= range_end) {
				stopped = true;
				break;
			}
		}

		extent_len = extent_end - key.offset;
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		compression = btrfs_file_extent_compression(leaf, ei);
		extent_type = btrfs_file_extent_type(leaf, ei);
		extent_gen = btrfs_file_extent_generation(leaf, ei);

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
			if (compression == BTRFS_COMPRESS_NONE)
				extent_offset = btrfs_file_extent_offset(leaf, ei);
		}

		if (compression != BTRFS_COMPRESS_NONE)
			flags |= FIEMAP_EXTENT_ENCODED;

		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			flags |= FIEMAP_EXTENT_DATA_INLINE;
			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
						 extent_len, flags);
		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx,
						  disk_bytenr, extent_offset,
						  extent_gen, key.offset,
						  extent_end - 1);
		} else if (disk_bytenr == 0) {
			/* We have an explicit hole. */
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  key.offset, extent_end - 1);
		} else {
			/* We have a regular extent. */
			if (fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					goto out_unlock;
				else if (ret > 0)
					flags |= FIEMAP_EXTENT_SHARED;
			}

			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
						 disk_bytenr + extent_offset,
						 extent_len, flags);
		}

		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* emit_fiemap_extent() told us to stop. */
			stopped = true;
			break;
		}

		prev_extent_end = extent_end;
next_item:
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out_unlock;
		}

		ret = fiemap_next_leaf_item(inode, path);
		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* No more file extent items for this inode. */
			break;
		}
		cond_resched();
	}

check_eof_delalloc:
	if (!stopped && prev_extent_end < range_end) {
		ret = fiemap_process_hole(inode, fieinfo, &cache,
					  &delalloc_cached_state, backref_ctx,
					  0, 0, 0, prev_extent_end, range_end - 1);
		if (ret < 0)
			goto out_unlock;
		prev_extent_end = range_end;
	}

	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
		const u64 i_size = i_size_read(&inode->vfs_inode);

		if (prev_extent_end < i_size) {
			u64 delalloc_start;
			u64 delalloc_end;
			bool delalloc;

			delalloc = btrfs_find_delalloc_in_range(inode,
								prev_extent_end,
								i_size - 1,
								&delalloc_cached_state,
								&delalloc_start,
								&delalloc_end);
			if (!delalloc)
				cache.flags |= FIEMAP_EXTENT_LAST;
		} else {
			cache.flags |= FIEMAP_EXTENT_LAST;
		}
	}

out_unlock:
	unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
		btrfs_release_path(path);
		ret = flush_fiemap_cache(fieinfo, &cache);
		if (ret)
			goto out;
		len -= cache.next_search_offset - start;
		start = cache.next_search_offset;
		goto restart;
	} else if (ret < 0) {
		goto out;
	}

	/*
	 * Must free the path before emitting to the fiemap buffer because we
	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
	 * to a file, a write into it (through btrfs_page_mkwrite()) may
	 * trigger waiting for an ordered extent that needs to modify that leaf
	 * in order to complete, therefore leading to a deadlock.
	 */
	btrfs_free_path(path);
	path = NULL;

	ret = flush_fiemap_cache(fieinfo, &cache);
	if (ret)
		goto out;

	ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
	free_extent_state(delalloc_cached_state);
	kfree(cache.entries);
	btrfs_free_backref_share_ctx(backref_ctx);
	btrfs_free_path(path);
	return ret;
}

int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of data (in an async thread) and will return
	 * before the compression is done and writeback has started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

	/*
	 * We did an initial flush to avoid holding the inode's lock while
	 * triggering writeback and waiting for the completion of IO and ordered
	 * extents. Now after we locked the inode we do it again, because it's
	 * possible a new write may have happened in between those two steps.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret) {
			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
			return ret;
		}
	}

	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

	return ret;
}
930}