Linux kernel source listing: fs/btrfs/tree-mod-log.c, revision v6.2.
(An older v5.14.15 revision of the same file follows after the first copy.)
  1// SPDX-License-Identifier: GPL-2.0
  2
  3#include "messages.h"
  4#include "tree-mod-log.h"
  5#include "disk-io.h"
  6#include "fs.h"
  7#include "accessors.h"
  8#include "tree-checker.h"
  9
/* Snapshot of a root node recorded by a BTRFS_MOD_LOG_ROOT_REPLACE operation. */
struct tree_mod_root {
	u64 logical;	/* Logical address of the old root block. */
	u8 level;	/* Level of the old root block. */
};
 14
/*
 * One logged modification of a tree block. Entries are kept in
 * fs_info::tree_mod_log, an rb-tree keyed by (logical, seq). Which of the
 * fields below are meaningful depends on the operation type @op.
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;	/* Logical address of the affected tree block. */
	u64 seq;	/* Sequence number assigned at insertion time. */
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};
 43
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	/* Atomically bump and return the fs-wide modification sequence. */
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
 51
 52/*
 53 * This adds a new blocker to the tree mod log's blocker list if the @elem
 54 * passed does not already have a sequence number set. So when a caller expects
 55 * to record tree modifications, it should ensure to set elem->seq to zero
 56 * before calling btrfs_get_tree_mod_seq.
 57 * Returns a fresh, unused tree log modification sequence number, even if no new
 58 * blocker was added.
 59 */
 60u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 61			   struct btrfs_seq_list *elem)
 62{
 63	write_lock(&fs_info->tree_mod_log_lock);
 64	if (!elem->seq) {
 65		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 66		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 67		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
 68	}
 69	write_unlock(&fs_info->tree_mod_log_lock);
 70
 71	return elem->seq;
 72}
 73
/*
 * Drop a tree mod log user previously registered with
 * btrfs_get_tree_mod_seq(), then prune all log entries that no remaining
 * user can still need (everything older than the oldest surviving blocker).
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* A zero seq means @elem was never registered; nothing to do. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/* Last user gone: future modifications need not be logged. */
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Fetch the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
124
/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 *
 * Assigns a fresh sequence number to @tm and links it into the rb-tree.
 * Returns 0 on success or -EEXIST if an entry with the same (logical, seq)
 * pair already exists. Caller must hold tree_mod_log_lock for writing.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	/* Standard rb-tree descent to find the insertion point. */
	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
166
167/*
168 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
169 * returns false with the tree_mod_log_lock acquired. The caller must hold
170 * this until all tree mod log insertions are recorded in the rb tree and then
171 * write unlock fs_info::tree_mod_log_lock.
172 */
173static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
174				    struct extent_buffer *eb)
175{
176	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
177		return true;
178	if (eb && btrfs_header_level(eb) == 0)
179		return true;
180
181	write_lock(&fs_info->tree_mod_log_lock);
182	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
183		write_unlock(&fs_info->tree_mod_log_lock);
184		return true;
185	}
186
187	return false;
188}
189
190/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
191static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
192				    struct extent_buffer *eb)
193{
194	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
195		return false;
196	if (eb && btrfs_header_level(eb) == 0)
197		return false;
198
199	return true;
200}
201
202static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
203						 int slot,
204						 enum btrfs_mod_log_op op)
 
205{
206	struct tree_mod_elem *tm;
207
208	tm = kzalloc(sizeof(*tm), GFP_NOFS);
209	if (!tm)
210		return NULL;
211
212	tm->logical = eb->start;
213	if (op != BTRFS_MOD_LOG_KEY_ADD) {
214		btrfs_node_key(eb, &tm->key, slot);
215		tm->blockptr = btrfs_node_blockptr(eb, slot);
216	}
217	tm->op = op;
218	tm->slot = slot;
219	tm->generation = btrfs_node_ptr_generation(eb, slot);
220	RB_CLEAR_NODE(&tm->node);
221
222	return tm;
223}
224
/*
 * Record a single key-level operation (add/remove/replace) for slot @slot of
 * node @eb in the tree mod log.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or the error from the rb-tree insertion.
 */
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret;

	/* Unlocked fast-path check. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate before taking the lock. */
	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		return -ENOMEM;

	/* Locked re-check; the last log user may have gone away meanwhile. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned false with tree_mod_log_lock held. */
	ret = tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
250
/*
 * Record the move of @nr_items key pointers within node @eb from @src_slot to
 * @dst_slot. When moving towards the beginning of the buffer, the overwritten
 * slots are additionally logged as removals so they can be rewound.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or the error from the rb-tree insertions.
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	/* Unlocked fast-path check. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate everything before taking the lock. */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	/* Pre-allocate removal records for the slots the move overwrites. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false, tree_mod_log_lock is held until we unlock below. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements already inserted into the rb-tree, then free. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
325
/*
 * Insert all elements of @tm_list into the log, from the highest slot down.
 * On failure, the elements inserted by this call are removed again so the
 * caller can simply free the whole list.
 *
 * Must be called with fs_info::tree_mod_log_lock held for writing.
 */
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions done so far. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
345
/*
 * Record the replacement of root node @old_root by @new_root. When
 * @log_removal is set and the old root is not a leaf, also log removal of
 * every key pointer in the old root so it can be rewound later.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or the error from the rb-tree insertions.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	/* Unlocked fast-path check. */
	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Keyed by the *new* root; the old root data is carried along. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	/* On false, tree_mod_log_lock is held until we unlock below. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	/*
	 * NOTE(review): this path frees tm_list entries without rb_erase().
	 * That is safe after a tree_mod_log_free_eb() failure because it rolls
	 * back its own insertions; if tree_mod_log_insert(tm) itself failed
	 * after free_eb succeeded, the entries would still be linked. That
	 * insert can apparently only fail with -EEXIST for a duplicate
	 * (logical, seq) pair, which a freshly pulled seq should preclude —
	 * TODO confirm.
	 */
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
415
/*
 * Search the log for an element with logical address @start and a sequence
 * number of at least @min_seq. With @smallest set, the entry with the lowest
 * qualifying seq is returned, otherwise the one with the highest seq.
 * Returns NULL if no entry qualifies.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* Too old for the caller; look at newer entries. */
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on min_seq; cannot do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
457
/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	/* smallest == true: pick the lowest seq that is >= min_seq. */
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}
468
/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	/* smallest == false: pick the highest seq that is >= min_seq. */
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
479
/*
 * Record the copy of @nr_items key pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset). Each copied slot is
 * logged as a removal from @src and an addition to @dst.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or the error from the rb-tree insertions.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	/* Unlocked fast-path check. */
	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are never logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One removal and one addition record per copied item. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false, tree_mod_log_lock is held until we unlock below. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements already inserted into the rb-tree, then free. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
552
/*
 * Record the freeing of node @eb by logging removal of every key pointer it
 * holds (BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING), so the node's content can
 * later be rewound.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or the error from the rb-tree insertions.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	/* Unlocked fast-path check. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false, tree_mod_log_lock is held until we unlock below. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * No rb_erase() needed here: tree_mod_log_free_eb() rolls back its
	 * own insertions on failure, so nothing in tm_list is still linked.
	 */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
595
/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	/* A zero time_seq means no rewind is requested. */
	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* Nothing logged at all for the current root. */
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operation for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one step further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
649
650
651/*
652 * tm is a pointer to the first operation to rewind within eb. Then, all
653 * previous operations will be rewound (until we reach something older than
654 * time_seq).
655 */
656static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
657				struct extent_buffer *eb,
658				u64 time_seq,
659				struct tree_mod_elem *first_tm)
660{
661	u32 n;
662	struct rb_node *next;
663	struct tree_mod_elem *tm = first_tm;
664	unsigned long o_dst;
665	unsigned long o_src;
666	unsigned long p_size = sizeof(struct btrfs_key_ptr);
667
668	n = btrfs_header_nritems(eb);
669	read_lock(&fs_info->tree_mod_log_lock);
670	while (tm && tm->seq >= time_seq) {
671		/*
672		 * All the operations are recorded with the operator used for
673		 * the modification. As we're going backwards, we do the
674		 * opposite of each operation here.
675		 */
676		switch (tm->op) {
677		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
678			BUG_ON(tm->slot < n);
679			fallthrough;
680		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
681		case BTRFS_MOD_LOG_KEY_REMOVE:
682			btrfs_set_node_key(eb, &tm->key, tm->slot);
683			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
684			btrfs_set_node_ptr_generation(eb, tm->slot,
685						      tm->generation);
686			n++;
687			break;
688		case BTRFS_MOD_LOG_KEY_REPLACE:
689			BUG_ON(tm->slot >= n);
690			btrfs_set_node_key(eb, &tm->key, tm->slot);
691			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
692			btrfs_set_node_ptr_generation(eb, tm->slot,
693						      tm->generation);
694			break;
695		case BTRFS_MOD_LOG_KEY_ADD:
696			/* if a move operation is needed it's in the log */
697			n--;
698			break;
699		case BTRFS_MOD_LOG_MOVE_KEYS:
700			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
701			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
702			memmove_extent_buffer(eb, o_dst, o_src,
703					      tm->move.nr_items * p_size);
704			break;
705		case BTRFS_MOD_LOG_ROOT_REPLACE:
706			/*
707			 * This operation is special. For roots, this must be
708			 * handled explicitly before rewinding.
709			 * For non-roots, this operation may exist if the node
710			 * was a root: root A -> child B; then A gets empty and
711			 * B is promoted to the new root. In the mod log, we'll
712			 * have a root-replace operation for B, a tree block
713			 * that is no root. We simply ignore that operation.
714			 */
715			break;
716		}
717		next = rb_next(&tm->node);
718		if (!next)
719			break;
720		tm = rb_entry(next, struct tree_mod_elem, node);
721		if (tm->logical != first_tm->logical)
722			break;
723	}
724	read_unlock(&fs_info->tree_mod_log_lock);
725	btrfs_set_header_nritems(eb, n);
726}
727
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No rewind requested. */
	if (!time_seq)
		return eb;

	/* Leaves are never logged, nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed; rebuild it from scratch on a dummy
		 * buffer, copying header fields from the current block.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Otherwise, rewind on a private copy of the current block. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand back the input buffer before working on the copy. */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
787
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	/* Nothing logged: the current root is already old enough. */
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind must start from the old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		/* The old root block was not freed; read it from disk. */
		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		/*
		 * Freed old root: rebuild it on a dummy buffer, header fields
		 * are reconstructed from the logged root-replace data below.
		 */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Same root block: rewind a private copy of it. */
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
897
898int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
899{
900	struct tree_mod_elem *tm;
901	int level;
902	struct extent_buffer *eb_root = btrfs_root_node(root);
903
904	tm = tree_mod_log_oldest_root(eb_root, time_seq);
905	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
906		level = tm->old_root.level;
907	else
908		level = btrfs_header_level(eb_root);
909
910	free_extent_buffer(eb_root);
911
912	return level;
913}
914
915/*
916 * Return the lowest sequence number in the tree modification log.
917 *
918 * Return the sequence number of the oldest tree modification log user, which
919 * corresponds to the lowest sequence number of all existing users. If there are
920 * no users it returns 0.
921 */
922u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
923{
924	u64 ret = 0;
925
926	read_lock(&fs_info->tree_mod_log_lock);
927	if (!list_empty(&fs_info->tree_mod_seq_list)) {
928		struct btrfs_seq_list *elem;
929
930		elem = list_first_entry(&fs_info->tree_mod_seq_list,
931					struct btrfs_seq_list, list);
932		ret = elem->seq;
933	}
934	read_unlock(&fs_info->tree_mod_log_lock);
935
936	return ret;
937}
v5.14.15 — older revision of fs/btrfs/tree-mod-log.c follows (duplicate of the file above; notable difference: alloc_tree_mod_elem() and btrfs_tree_mod_log_insert_key() still take a gfp_t flags argument). The listing is truncated inside btrfs_tree_mod_log_insert_move().
  1// SPDX-License-Identifier: GPL-2.0
  2
 
  3#include "tree-mod-log.h"
  4#include "disk-io.h"
 
 
 
  5
/* Snapshot of a root node recorded by a BTRFS_MOD_LOG_ROOT_REPLACE operation. */
struct tree_mod_root {
	u64 logical;	/* Logical address of the old root block. */
	u8 level;	/* Level of the old root block. */
};
 10
/*
 * One logged modification of a tree block, kept in fs_info::tree_mod_log,
 * an rb-tree keyed by (logical, seq). Which fields are meaningful depends
 * on the operation type @op.
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;	/* Logical address of the affected tree block. */
	u64 seq;	/* Sequence number assigned at insertion time. */
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};
 39
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	/* Atomically bump and return the fs-wide modification sequence. */
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
 47
 48/*
 49 * This adds a new blocker to the tree mod log's blocker list if the @elem
 50 * passed does not already have a sequence number set. So when a caller expects
 51 * to record tree modifications, it should ensure to set elem->seq to zero
 52 * before calling btrfs_get_tree_mod_seq.
 53 * Returns a fresh, unused tree log modification sequence number, even if no new
 54 * blocker was added.
 55 */
/*
 * Register @elem as a tree mod log user unless it already has a sequence
 * number, and return its sequence number. Callers must set elem->seq to
 * zero before the first call.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/* First registration: pull a fresh sequence number. */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
 69
/*
 * Drop a tree mod log user previously registered with
 * btrfs_get_tree_mod_seq(), then prune all log entries that no remaining
 * user can still need.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* A zero seq means @elem was never registered; nothing to do. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/* Last user gone: future modifications need not be logged. */
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Fetch the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
120
121/*
122 * Key order of the log:
123 *       node/leaf start address -> sequence
124 *
125 * The 'start address' is the logical address of the *new* root node for root
126 * replace operations, or the logical address of the affected block for all
127 * other operations.
128 */
/*
 * Assign a fresh sequence number to @tm and link it into the rb-tree, keyed
 * by (logical, seq). Returns 0 or -EEXIST on a duplicate key. Caller must
 * hold tree_mod_log_lock for writing.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	/* Standard rb-tree descent to find the insertion point. */
	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
162
163/*
164 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
165 * returns false with the tree_mod_log_lock acquired. The caller must hold
166 * this until all tree mod log insertions are recorded in the rb tree and then
167 * write unlock fs_info::tree_mod_log_lock.
168 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	/* Fast path: no registered tree mod log users. */
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	/* Leaves (level 0) are never logged. */
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	/* Locked re-check; on false the lock stays held for the caller. */
	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}
185
186/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
187static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
188				    struct extent_buffer *eb)
189{
190	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
191		return false;
192	if (eb && btrfs_header_level(eb) == 0)
193		return false;
194
195	return true;
196}
197
198static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
199						 int slot,
200						 enum btrfs_mod_log_op op,
201						 gfp_t flags)
202{
203	struct tree_mod_elem *tm;
204
205	tm = kzalloc(sizeof(*tm), flags);
206	if (!tm)
207		return NULL;
208
209	tm->logical = eb->start;
210	if (op != BTRFS_MOD_LOG_KEY_ADD) {
211		btrfs_node_key(eb, &tm->key, slot);
212		tm->blockptr = btrfs_node_blockptr(eb, slot);
213	}
214	tm->op = op;
215	tm->slot = slot;
216	tm->generation = btrfs_node_ptr_generation(eb, slot);
217	RB_CLEAR_NODE(&tm->node);
218
219	return tm;
220}
221
222int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
223				  enum btrfs_mod_log_op op, gfp_t flags)
224{
225	struct tree_mod_elem *tm;
226	int ret;
227
228	if (!tree_mod_need_log(eb->fs_info, eb))
229		return 0;
230
231	tm = alloc_tree_mod_elem(eb, slot, op, flags);
232	if (!tm)
233		return -ENOMEM;
234
235	if (tree_mod_dont_log(eb->fs_info, eb)) {
236		kfree(tm);
237		return 0;
238	}
239
240	ret = tree_mod_log_insert(eb->fs_info, tm);
241	write_unlock(&eb->fs_info->tree_mod_log_lock);
242	if (ret)
243		kfree(tm);
244
245	return ret;
246}
247
/*
 * Log the move of @nr_items key pointers within node @eb, from @src_slot to
 * @dst_slot.  Keys that get overwritten by a move towards the front of the
 * buffer (dst_slot < src_slot) are additionally logged as removals so that a
 * rewind can restore them.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	/*
	 * Allocate all elements up front, before taking the lock, so the
	 * locked section below cannot fail with -ENOMEM.  Only the slots in
	 * the range [dst_slot, src_slot) are overwritten by the move, hence
	 * the loop condition.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false return we hold tree_mod_log_lock for writing. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements that already made it into the rb-tree. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
322
/*
 * Insert all elements of @tm_list into the tree mod log, walking the slots
 * from highest to lowest.  tree_mod_log_insert() assigns each element a
 * fresh, increasing sequence number, so this direction determines the order
 * in which the elements are later replayed by a rewind.
 *
 * On failure, every element inserted so far (indices j > i) is unlinked from
 * the rb-tree again; freeing the elements is left to the caller.  Must be
 * called with fs_info::tree_mod_log_lock held for writing.
 */
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions that already succeeded. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
342
/*
 * Log the replacement of root node @old_root by @new_root.  If @log_removal
 * is set and the old root is a node (level > 0), the removal of every key
 * pointer in it is logged as well, since the old root is going away.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		/* Allocate everything before taking the lock below. */
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* A root replacement is keyed by the *new* root's logical address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	/* On false return we hold tree_mod_log_lock for writing. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	/*
	 * NOTE(review): if tree_mod_log_free_eb() succeeded but the insertion
	 * of 'tm' failed, the tm_list elements would still be linked in the
	 * rb-tree when freed below.  tree_mod_log_insert() can only fail with
	 * -EEXIST, which should be impossible for a freshly assigned sequence
	 * number — confirm.
	 */
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
412
413static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
414						   u64 start, u64 min_seq,
415						   bool smallest)
416{
417	struct rb_root *tm_root;
418	struct rb_node *node;
419	struct tree_mod_elem *cur = NULL;
420	struct tree_mod_elem *found = NULL;
421
422	read_lock(&fs_info->tree_mod_log_lock);
423	tm_root = &fs_info->tree_mod_log;
424	node = tm_root->rb_node;
425	while (node) {
426		cur = rb_entry(node, struct tree_mod_elem, node);
427		if (cur->logical < start) {
428			node = node->rb_left;
429		} else if (cur->logical > start) {
430			node = node->rb_right;
431		} else if (cur->seq < min_seq) {
432			node = node->rb_left;
433		} else if (!smallest) {
434			/* We want the node with the highest seq */
435			if (found)
436				BUG_ON(found->seq > cur->seq);
437			found = cur;
438			node = node->rb_left;
439		} else if (cur->seq > min_seq) {
440			/* We want the node with the smallest seq */
441			if (found)
442				BUG_ON(found->seq < cur->seq);
443			found = cur;
444			node = node->rb_right;
445		} else {
446			found = cur;
447			break;
448		}
449	}
450	read_unlock(&fs_info->tree_mod_log_lock);
451
452	return found;
453}
454
455/*
456 * This returns the element from the log with the smallest time sequence
457 * value that's in the log (the oldest log item). Any element with a time
458 * sequence lower than min_seq will be ignored.
459 */
460static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
461							u64 start, u64 min_seq)
462{
463	return __tree_mod_log_search(fs_info, start, min_seq, true);
464}
465
466/*
467 * This returns the element from the log with the largest time sequence
468 * value that's in the log (the most recent log item). Any element with
469 * a time sequence lower than min_seq will be ignored.
470 */
471static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
472						 u64 start, u64 min_seq)
473{
474	return __tree_mod_log_search(fs_info, start, min_seq, false);
475}
476
/*
 * Log the copy of @nr_items key pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset).  For each copied
 * pointer a removal is logged for the source slot and an addition for the
 * destination slot.
 *
 * Returns 0 on success (or when logging is not needed) and a negative errno
 * on failure.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Copies between leaves are never logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/*
	 * One array holds both halves: additions in the first nr_items slots,
	 * removals in the second.  Allocate everything before taking the lock
	 * so the locked section cannot fail with -ENOMEM.
	 */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false return we hold tree_mod_log_lock for writing. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements that already made it into the rb-tree. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
549
/*
 * Log the removal of every key pointer in node @eb, which is about to be
 * freed.  Returns 0 on success (or when logging is not needed) and a
 * negative errno on failure.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate one element per slot before taking the lock below. */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On false return we hold tree_mod_log_lock for writing. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	/* On failure the helper already unlinked any inserted elements. */
	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
592
593/*
594 * Returns the logical address of the oldest predecessor of the given root.
595 * Entries older than time_seq are ignored.
596 */
597static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
598						      u64 time_seq)
599{
600	struct tree_mod_elem *tm;
601	struct tree_mod_elem *found = NULL;
602	u64 root_logical = eb_root->start;
603	bool looped = false;
604
605	if (!time_seq)
606		return NULL;
607
608	/*
609	 * The very last operation that's logged for a root is the replacement
610	 * operation (if it is replaced at all). This has the logical address
611	 * of the *new* root, making it the very first operation that's logged
612	 * for this root.
613	 */
614	while (1) {
615		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
616						time_seq);
617		if (!looped && !tm)
618			return NULL;
619		/*
620		 * If there are no tree operation for the oldest root, we simply
621		 * return it. This should only happen if that (old) root is at
622		 * level 0.
623		 */
624		if (!tm)
625			break;
626
627		/*
628		 * If there's an operation that's not a root replacement, we
629		 * found the oldest version of our root. Normally, we'll find a
630		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
631		 */
632		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
633			break;
634
635		found = tm;
636		root_logical = tm->old_root.logical;
637		looped = true;
638	}
639
640	/* If there's no old root to return, return what we found instead */
641	if (!found)
642		found = tm;
643
644	return found;
645}
646
647
648/*
649 * tm is a pointer to the first operation to rewind within eb. Then, all
650 * previous operations will be rewound (until we reach something older than
651 * time_seq).
652 */
653static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
654				struct extent_buffer *eb,
655				u64 time_seq,
656				struct tree_mod_elem *first_tm)
657{
658	u32 n;
659	struct rb_node *next;
660	struct tree_mod_elem *tm = first_tm;
661	unsigned long o_dst;
662	unsigned long o_src;
663	unsigned long p_size = sizeof(struct btrfs_key_ptr);
664
665	n = btrfs_header_nritems(eb);
666	read_lock(&fs_info->tree_mod_log_lock);
667	while (tm && tm->seq >= time_seq) {
668		/*
669		 * All the operations are recorded with the operator used for
670		 * the modification. As we're going backwards, we do the
671		 * opposite of each operation here.
672		 */
673		switch (tm->op) {
674		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
675			BUG_ON(tm->slot < n);
676			fallthrough;
677		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
678		case BTRFS_MOD_LOG_KEY_REMOVE:
679			btrfs_set_node_key(eb, &tm->key, tm->slot);
680			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
681			btrfs_set_node_ptr_generation(eb, tm->slot,
682						      tm->generation);
683			n++;
684			break;
685		case BTRFS_MOD_LOG_KEY_REPLACE:
686			BUG_ON(tm->slot >= n);
687			btrfs_set_node_key(eb, &tm->key, tm->slot);
688			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
689			btrfs_set_node_ptr_generation(eb, tm->slot,
690						      tm->generation);
691			break;
692		case BTRFS_MOD_LOG_KEY_ADD:
693			/* if a move operation is needed it's in the log */
694			n--;
695			break;
696		case BTRFS_MOD_LOG_MOVE_KEYS:
697			o_dst = btrfs_node_key_ptr_offset(tm->slot);
698			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
699			memmove_extent_buffer(eb, o_dst, o_src,
700					      tm->move.nr_items * p_size);
701			break;
702		case BTRFS_MOD_LOG_ROOT_REPLACE:
703			/*
704			 * This operation is special. For roots, this must be
705			 * handled explicitly before rewinding.
706			 * For non-roots, this operation may exist if the node
707			 * was a root: root A -> child B; then A gets empty and
708			 * B is promoted to the new root. In the mod log, we'll
709			 * have a root-replace operation for B, a tree block
710			 * that is no root. We simply ignore that operation.
711			 */
712			break;
713		}
714		next = rb_next(&tm->node);
715		if (!next)
716			break;
717		tm = rb_entry(next, struct tree_mod_elem, node);
718		if (tm->logical != first_tm->logical)
719			break;
720	}
721	read_unlock(&fs_info->tree_mod_log_lock);
722	btrfs_set_header_nritems(eb, n);
723}
724
725/*
726 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
727 * is returned. If rewind operations happen, a fresh buffer is returned. The
728 * returned buffer is always read-locked. If the returned buffer is not the
729 * input buffer, the lock on the input buffer is released and the input buffer
730 * is freed (its refcount is decremented).
731 */
732struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
733						struct btrfs_path *path,
734						struct extent_buffer *eb,
735						u64 time_seq)
736{
737	struct extent_buffer *eb_rewin;
738	struct tree_mod_elem *tm;
739
740	if (!time_seq)
741		return eb;
742
743	if (btrfs_header_level(eb) == 0)
744		return eb;
745
746	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
747	if (!tm)
748		return eb;
749
750	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
751		BUG_ON(tm->slot != 0);
752		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
753		if (!eb_rewin) {
754			btrfs_tree_read_unlock(eb);
755			free_extent_buffer(eb);
756			return NULL;
757		}
758		btrfs_set_header_bytenr(eb_rewin, eb->start);
759		btrfs_set_header_backref_rev(eb_rewin,
760					     btrfs_header_backref_rev(eb));
761		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
762		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
763	} else {
764		eb_rewin = btrfs_clone_extent_buffer(eb);
765		if (!eb_rewin) {
766			btrfs_tree_read_unlock(eb);
767			free_extent_buffer(eb);
768			return NULL;
769		}
770	}
771
772	btrfs_tree_read_unlock(eb);
773	free_extent_buffer(eb);
774
775	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
776				       eb_rewin, btrfs_header_level(eb_rewin));
777	btrfs_tree_read_lock(eb_rewin);
778	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
779	WARN_ON(btrfs_header_nritems(eb_rewin) >
780		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
781
782	return eb_rewin;
783}
784
785/*
786 * Rewind the state of @root's root node to the given @time_seq value.
787 * If there are no changes, the current root->root_node is returned. If anything
788 * changed in between, there's a fresh buffer allocated on which the rewind
789 * operations are done. In any case, the returned buffer is read locked.
790 * Returns NULL on error (with no locks held).
791 */
792struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
793{
794	struct btrfs_fs_info *fs_info = root->fs_info;
795	struct tree_mod_elem *tm;
796	struct extent_buffer *eb = NULL;
797	struct extent_buffer *eb_root;
798	u64 eb_root_owner = 0;
799	struct extent_buffer *old;
800	struct tree_mod_root *old_root = NULL;
801	u64 old_generation = 0;
802	u64 logical;
803	int level;
804
805	eb_root = btrfs_read_lock_root_node(root);
806	tm = tree_mod_log_oldest_root(eb_root, time_seq);
807	if (!tm)
808		return eb_root;
809
810	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
811		old_root = &tm->old_root;
812		old_generation = tm->generation;
813		logical = old_root->logical;
814		level = old_root->level;
815	} else {
816		logical = eb_root->start;
817		level = btrfs_header_level(eb_root);
818	}
819
820	tm = tree_mod_log_search(fs_info, logical, time_seq);
821	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 
 
822		btrfs_tree_read_unlock(eb_root);
823		free_extent_buffer(eb_root);
824		old = read_tree_block(fs_info, logical, root->root_key.objectid,
825				      0, level, NULL);
 
 
 
826		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
827			if (!IS_ERR(old))
828				free_extent_buffer(old);
829			btrfs_warn(fs_info,
830				   "failed to read tree block %llu from get_old_root",
831				   logical);
832		} else {
833			struct tree_mod_elem *tm2;
834
835			btrfs_tree_read_lock(old);
836			eb = btrfs_clone_extent_buffer(old);
837			/*
838			 * After the lookup for the most recent tree mod operation
839			 * above and before we locked and cloned the extent buffer
840			 * 'old', a new tree mod log operation may have been added.
841			 * So lookup for a more recent one to make sure the number
842			 * of mod log operations we replay is consistent with the
843			 * number of items we have in the cloned extent buffer,
844			 * otherwise we can hit a BUG_ON when rewinding the extent
845			 * buffer.
846			 */
847			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
848			btrfs_tree_read_unlock(old);
849			free_extent_buffer(old);
850			ASSERT(tm2);
851			ASSERT(tm2 == tm || tm2->seq > tm->seq);
852			if (!tm2 || tm2->seq < tm->seq) {
853				free_extent_buffer(eb);
854				return NULL;
855			}
856			tm = tm2;
857		}
858	} else if (old_root) {
859		eb_root_owner = btrfs_header_owner(eb_root);
860		btrfs_tree_read_unlock(eb_root);
861		free_extent_buffer(eb_root);
862		eb = alloc_dummy_extent_buffer(fs_info, logical);
863	} else {
864		eb = btrfs_clone_extent_buffer(eb_root);
865		btrfs_tree_read_unlock(eb_root);
866		free_extent_buffer(eb_root);
867	}
868
869	if (!eb)
870		return NULL;
871	if (old_root) {
872		btrfs_set_header_bytenr(eb, eb->start);
873		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
874		btrfs_set_header_owner(eb, eb_root_owner);
875		btrfs_set_header_level(eb, old_root->level);
876		btrfs_set_header_generation(eb, old_generation);
877	}
878	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
879				       btrfs_header_level(eb));
880	btrfs_tree_read_lock(eb);
881	if (tm)
882		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
883	else
884		WARN_ON(btrfs_header_level(eb) != 0);
885	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
886
887	return eb;
888}
889
890int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
891{
892	struct tree_mod_elem *tm;
893	int level;
894	struct extent_buffer *eb_root = btrfs_root_node(root);
895
896	tm = tree_mod_log_oldest_root(eb_root, time_seq);
897	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
898		level = tm->old_root.level;
899	else
900		level = btrfs_header_level(eb_root);
901
902	free_extent_buffer(eb_root);
903
904	return level;
905}
906
907/*
908 * Return the lowest sequence number in the tree modification log.
909 *
910 * Return the sequence number of the oldest tree modification log user, which
911 * corresponds to the lowest sequence number of all existing users. If there are
912 * no users it returns 0.
913 */
914u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
915{
916	u64 ret = 0;
917
918	read_lock(&fs_info->tree_mod_log_lock);
919	if (!list_empty(&fs_info->tree_mod_seq_list)) {
920		struct btrfs_seq_list *elem;
921
922		elem = list_first_entry(&fs_info->tree_mod_seq_list,
923					struct btrfs_seq_list, list);
924		ret = elem->seq;
925	}
926	read_unlock(&fs_info->tree_mod_log_lock);
927
928	return ret;
929}