// SPDX-License-Identifier: GPL-2.0
/* btrfs tree mod log implementation (from Linux v6.8). */

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it must set elem->seq to zero before calling
 * btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree mod log sequence number, even if no new blocker
 * was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
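
/*
 * Usage sketch (illustrative, not part of the original file): a tree mod
 * log user pins the log around a walk of past tree state. The helpers are
 * the real btrfs ones; BTRFS_SEQ_LIST_INIT is assumed to come from misc.h.
 */
#if 0
static void example_pin_tree_mod_log(struct btrfs_fs_info *fs_info)
{
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	u64 seq;

	/* Register as a blocker: log entries with seq >= @seq are kept. */
	seq = btrfs_get_tree_mod_seq(fs_info, &elem);

	/* ... read old tree state at @seq, e.g. via btrfs_get_old_root() ... */

	/* Drop the blocker: entries below the new minimum get pruned. */
	btrfs_put_tree_mod_seq(fs_info, &elem);
}
#endif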

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must then hold
 * the lock until all tree mod log insertions are recorded in the rb tree, and
 * then write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}
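
/*
 * Pattern sketch (illustrative): the callers below pair the unlocked
 * tree_mod_need_log() check with a locked tree_mod_dont_log() recheck, so
 * that allocations happen outside the lock:
 *
 *	if (!tree_mod_need_log(fs_info, eb))	// cheap, unlocked
 *		return 0;
 *	tm = alloc_tree_mod_elem(eb, slot, op);	// allocate outside the lock
 *	if (tree_mod_dont_log(fs_info, eb)) {	// recheck, may take the lock
 *		kfree(tm);			// all users went away
 *		return 0;
 *	}
 *	// ... tree_mod_log_insert(), then write_unlock() ...
 */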

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
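
/*
 * Caller sketch (assumed, modeled on the key update paths in ctree.c):
 * the record is inserted before the node is modified, so the old key and
 * block pointer can still be captured:
 *
 *	ret = btrfs_tree_mod_log_insert_key(parent, slot,
 *					    BTRFS_MOD_LOG_KEY_REPLACE);
 *	BUG_ON(ret < 0);
 *	btrfs_set_node_key(parent, &new_key, slot);
 */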
 261
 262static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
 263						     int dst_slot, int src_slot,
 264						     int nr_items)
 265{
 266	struct tree_mod_elem *tm;
 267
 268	tm = kzalloc(sizeof(*tm), GFP_NOFS);
 269	if (!tm)
 270		return ERR_PTR(-ENOMEM);
 271
 272	tm->logical = eb->start;
 273	tm->slot = src_slot;
 274	tm->move.dst_slot = dst_slot;
 275	tm->move.nr_items = nr_items;
 276	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
 277	RB_CLEAR_NODE(&tm->node);
 278
 279	return tm;
 280}
 281
 282int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
 283				   int dst_slot, int src_slot,
 284				   int nr_items)
 285{
 286	struct tree_mod_elem *tm = NULL;
 287	struct tree_mod_elem **tm_list = NULL;
 288	int ret = 0;
 289	int i;
 290	bool locked = false;
 291
 292	if (!tree_mod_need_log(eb->fs_info, eb))
 293		return 0;
 294
 295	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
 296	if (!tm_list) {
 297		ret = -ENOMEM;
 298		goto lock;
 299	}
 300
 301	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
 302	if (IS_ERR(tm)) {
 303		ret = PTR_ERR(tm);
 304		tm = NULL;
 305		goto lock;
 306	}
 307
 308	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 309		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
 310				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
 311		if (!tm_list[i]) {
 312			ret = -ENOMEM;
 313			goto lock;
 314		}
 315	}
 316
 317lock:
 318	if (tree_mod_dont_log(eb->fs_info, eb)) {
 319		/*
 320		 * Don't error if we failed to allocate memory because we don't
 321		 * need to log.
 322		 */
 323		ret = 0;
 324		goto free_tms;
 325	}
 326	locked = true;
 327
 328	/*
 329	 * We previously failed to allocate memory and we need to log, so we
 330	 * have to fail.
 331	 */
 332	if (ret != 0)
 333		goto free_tms;
 334
	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
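
/*
 * Worked example (sketch): moving nr_items == 3 from src_slot == 5 to
 * dst_slot == 2 within one node overwrites slots 2..4. The loop bound
 * 'i + dst_slot < src_slot' above therefore logs KEY_REMOVE_WHILE_MOVING
 * for slots 2, 3 and 4 before the single MOVE_KEYS record is inserted.
 */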

static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
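
/*
 * Example (sketch): with records for one block at seqs {4, 7, 9} and
 * min_seq == 5, tree_mod_log_search_oldest() returns the seq 7 record
 * and tree_mod_log_search() the seq 9 record; the seq 4 record predates
 * min_seq and is ignored by both.
 */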

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Return the tree mod log element for the oldest predecessor of the given
 * root. Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
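
/*
 * Worked example (sketch): if root A was replaced by B, and B by C, each
 * ROOT_REPLACE record is keyed by the *new* root's logical address.
 * Starting from C, the loop above finds C's ROOT_REPLACE record, follows
 * old_root.logical to B and then to A, and stops at the first block with
 * no record, or with a non-replace record: the oldest version of the root.
 */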

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A becomes empty
			 * and B is promoted to the new root. In the mod log,
			 * we'll have a root-replace operation for B, a tree
			 * block that is not a root. We simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
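
/*
 * Caller sketch (assumed, modeled on btrfs_search_old_slot() in ctree.c):
 *
 *	eb = btrfs_tree_mod_log_rewind(fs_info, path, eb, time_seq);
 *	if (!eb)
 *		return -ENOMEM;	// input eb was already unlocked and freed
 */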

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there are
 * no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}
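
/*
 * Caller sketch (assumed, modeled on the delayed ref code): an operation
 * tagged with sequence number @seq must stay deferred while any tree mod
 * log user may still need the state at or before @seq:
 *
 *	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (min_seq != 0 && seq >= min_seq)
 *		// keep it queued, a tree mod log user may depend on it
 */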
v6.9.4
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include "messages.h"
   4#include "tree-mod-log.h"
   5#include "disk-io.h"
   6#include "fs.h"
   7#include "accessors.h"
   8#include "tree-checker.h"
   9
  10struct tree_mod_root {
  11	u64 logical;
  12	u8 level;
  13};
  14
  15struct tree_mod_elem {
  16	struct rb_node node;
  17	u64 logical;
  18	u64 seq;
  19	enum btrfs_mod_log_op op;
  20
  21	/*
  22	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
  23	 * operations.
  24	 */
  25	int slot;
  26
  27	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
  28	u64 generation;
  29
  30	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
  31	struct btrfs_disk_key key;
  32	u64 blockptr;
  33
  34	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
  35	struct {
  36		int dst_slot;
  37		int nr_items;
  38	} move;
  39
  40	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
  41	struct tree_mod_root old_root;
  42};
  43
  44/*
  45 * Pull a new tree mod seq number for our operation.
  46 */
  47static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
  48{
  49	return atomic64_inc_return(&fs_info->tree_mod_seq);
  50}
  51
  52/*
  53 * This adds a new blocker to the tree mod log's blocker list if the @elem
  54 * passed does not already have a sequence number set. So when a caller expects
  55 * to record tree modifications, it should ensure to set elem->seq to zero
  56 * before calling btrfs_get_tree_mod_seq.
  57 * Returns a fresh, unused tree log modification sequence number, even if no new
  58 * blocker was added.
  59 */
  60u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
  61			   struct btrfs_seq_list *elem)
  62{
  63	write_lock(&fs_info->tree_mod_log_lock);
  64	if (!elem->seq) {
  65		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
  66		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
  67		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
  68	}
  69	write_unlock(&fs_info->tree_mod_log_lock);
  70
  71	return elem->seq;
  72}
  73
  74void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
  75			    struct btrfs_seq_list *elem)
  76{
  77	struct rb_root *tm_root;
  78	struct rb_node *node;
  79	struct rb_node *next;
  80	struct tree_mod_elem *tm;
  81	u64 min_seq = BTRFS_SEQ_LAST;
  82	u64 seq_putting = elem->seq;
  83
  84	if (!seq_putting)
  85		return;
  86
  87	write_lock(&fs_info->tree_mod_log_lock);
  88	list_del(&elem->list);
  89	elem->seq = 0;
  90
  91	if (list_empty(&fs_info->tree_mod_seq_list)) {
  92		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
  93	} else {
  94		struct btrfs_seq_list *first;
  95
  96		first = list_first_entry(&fs_info->tree_mod_seq_list,
  97					 struct btrfs_seq_list, list);
  98		if (seq_putting > first->seq) {
  99			/*
 100			 * Blocker with lower sequence number exists, we cannot
 101			 * remove anything from the log.
 102			 */
 103			write_unlock(&fs_info->tree_mod_log_lock);
 104			return;
 105		}
 106		min_seq = first->seq;
 107	}
 108
 109	/*
 110	 * Anything that's lower than the lowest existing (read: blocked)
 111	 * sequence number can be removed from the tree.
 112	 */
 113	tm_root = &fs_info->tree_mod_log;
 114	for (node = rb_first(tm_root); node; node = next) {
 115		next = rb_next(node);
 116		tm = rb_entry(node, struct tree_mod_elem, node);
 117		if (tm->seq >= min_seq)
 118			continue;
 119		rb_erase(node, tm_root);
 120		kfree(tm);
 121	}
 122	write_unlock(&fs_info->tree_mod_log_lock);
 123}
 124
 125/*
 126 * Key order of the log:
 127 *       node/leaf start address -> sequence
 128 *
 129 * The 'start address' is the logical address of the *new* root node for root
 130 * replace operations, or the logical address of the affected block for all
 131 * other operations.
 132 */
 133static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
 134					struct tree_mod_elem *tm)
 135{
 136	struct rb_root *tm_root;
 137	struct rb_node **new;
 138	struct rb_node *parent = NULL;
 139	struct tree_mod_elem *cur;
 140
 141	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
 142
 143	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 144
 145	tm_root = &fs_info->tree_mod_log;
 146	new = &tm_root->rb_node;
 147	while (*new) {
 148		cur = rb_entry(*new, struct tree_mod_elem, node);
 149		parent = *new;
 150		if (cur->logical < tm->logical)
 151			new = &((*new)->rb_left);
 152		else if (cur->logical > tm->logical)
 153			new = &((*new)->rb_right);
 154		else if (cur->seq < tm->seq)
 155			new = &((*new)->rb_left);
 156		else if (cur->seq > tm->seq)
 157			new = &((*new)->rb_right);
 158		else
 159			return -EEXIST;
 160	}
 161
 162	rb_link_node(&tm->node, parent, new);
 163	rb_insert_color(&tm->node, tm_root);
 164	return 0;
 165}
 166
 167/*
 168 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 169 * returns false with the tree_mod_log_lock acquired. The caller must hold
 170 * this until all tree mod log insertions are recorded in the rb tree and then
 171 * write unlock fs_info::tree_mod_log_lock.
 172 */
 173static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 
 174{
 175	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
 176		return true;
 177	if (eb && btrfs_header_level(eb) == 0)
 178		return true;
 179
 180	write_lock(&fs_info->tree_mod_log_lock);
 181	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 182		write_unlock(&fs_info->tree_mod_log_lock);
 183		return true;
 184	}
 185
 186	return false;
 187}
 188
 189/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 190static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 191				    struct extent_buffer *eb)
 192{
 193	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
 194		return false;
 195	if (eb && btrfs_header_level(eb) == 0)
 196		return false;
 197
 198	return true;
 199}
 200
 201static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
 202						 int slot,
 203						 enum btrfs_mod_log_op op)
 204{
 205	struct tree_mod_elem *tm;
 206
 207	tm = kzalloc(sizeof(*tm), GFP_NOFS);
 208	if (!tm)
 209		return NULL;
 210
 211	tm->logical = eb->start;
 212	if (op != BTRFS_MOD_LOG_KEY_ADD) {
 213		btrfs_node_key(eb, &tm->key, slot);
 214		tm->blockptr = btrfs_node_blockptr(eb, slot);
 215	}
 216	tm->op = op;
 217	tm->slot = slot;
 218	tm->generation = btrfs_node_ptr_generation(eb, slot);
 219	RB_CLEAR_NODE(&tm->node);
 220
 221	return tm;
 222}
 223
 224int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
 225				  enum btrfs_mod_log_op op)
 226{
 227	struct tree_mod_elem *tm;
 228	int ret = 0;
 229
 230	if (!tree_mod_need_log(eb->fs_info, eb))
 231		return 0;
 232
 233	tm = alloc_tree_mod_elem(eb, slot, op);
 234	if (!tm)
 235		ret = -ENOMEM;
 236
 237	if (tree_mod_dont_log(eb->fs_info, eb)) {
 238		kfree(tm);
 239		/*
 240		 * Don't error if we failed to allocate memory because we don't
 241		 * need to log.
 242		 */
 243		return 0;
 244	} else if (ret != 0) {
 245		/*
 246		 * We previously failed to allocate memory and we need to log,
 247		 * so we have to fail.
 248		 */
 249		goto out_unlock;
 250	}
 251
 252	ret = tree_mod_log_insert(eb->fs_info, tm);
 253out_unlock:
 254	write_unlock(&eb->fs_info->tree_mod_log_lock);
 255	if (ret)
 256		kfree(tm);
 257
 258	return ret;
 259}
 260
 261static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
 262						     int dst_slot, int src_slot,
 263						     int nr_items)
 264{
 265	struct tree_mod_elem *tm;
 266
 267	tm = kzalloc(sizeof(*tm), GFP_NOFS);
 268	if (!tm)
 269		return ERR_PTR(-ENOMEM);
 270
 271	tm->logical = eb->start;
 272	tm->slot = src_slot;
 273	tm->move.dst_slot = dst_slot;
 274	tm->move.nr_items = nr_items;
 275	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
 276	RB_CLEAR_NODE(&tm->node);
 277
 278	return tm;
 279}
 280
 281int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
 282				   int dst_slot, int src_slot,
 283				   int nr_items)
 284{
 285	struct tree_mod_elem *tm = NULL;
 286	struct tree_mod_elem **tm_list = NULL;
 287	int ret = 0;
 288	int i;
 289	bool locked = false;
 290
 291	if (!tree_mod_need_log(eb->fs_info, eb))
 292		return 0;
 293
 294	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
 295	if (!tm_list) {
 296		ret = -ENOMEM;
 297		goto lock;
 298	}
 299
 300	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
 301	if (IS_ERR(tm)) {
 302		ret = PTR_ERR(tm);
 303		tm = NULL;
 304		goto lock;
 305	}
 306
 307	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 308		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
 309				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
 310		if (!tm_list[i]) {
 311			ret = -ENOMEM;
 312			goto lock;
 313		}
 314	}
 315
 316lock:
 317	if (tree_mod_dont_log(eb->fs_info, eb)) {
 318		/*
 319		 * Don't error if we failed to allocate memory because we don't
 320		 * need to log.
 321		 */
 322		ret = 0;
 323		goto free_tms;
 324	}
 325	locked = true;
 326
 327	/*
 328	 * We previously failed to allocate memory and we need to log, so we
 329	 * have to fail.
 330	 */
 331	if (ret != 0)
 332		goto free_tms;
 333
 334	/*
 335	 * When we override something during the move, we log these removals.
 336	 * This can only happen when we move towards the beginning of the
 337	 * buffer, i.e. dst_slot < src_slot.
 338	 */
 339	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 340		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
 341		if (ret)
 342			goto free_tms;
 343	}
 344
 345	ret = tree_mod_log_insert(eb->fs_info, tm);
 346	if (ret)
 347		goto free_tms;
 348	write_unlock(&eb->fs_info->tree_mod_log_lock);
 349	kfree(tm_list);
 350
 351	return 0;
 352
 353free_tms:
 354	if (tm_list) {
 355		for (i = 0; i < nr_items; i++) {
 356			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 357				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
 358			kfree(tm_list[i]);
 359		}
 360	}
 361	if (locked)
 362		write_unlock(&eb->fs_info->tree_mod_log_lock);
 363	kfree(tm_list);
 364	kfree(tm);
 365
 366	return ret;
 367}
 368
 369static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 370				struct tree_mod_elem **tm_list,
 371				int nritems)
 372{
 373	int i, j;
 374	int ret;
 375
 376	for (i = nritems - 1; i >= 0; i--) {
 377		ret = tree_mod_log_insert(fs_info, tm_list[i]);
 378		if (ret) {
 379			for (j = nritems - 1; j > i; j--)
 380				rb_erase(&tm_list[j]->node,
 381					 &fs_info->tree_mod_log);
 382			return ret;
 383		}
 384	}
 385
 386	return 0;
 387}
 388
 389int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
 390				   struct extent_buffer *new_root,
 391				   bool log_removal)
 392{
 393	struct btrfs_fs_info *fs_info = old_root->fs_info;
 394	struct tree_mod_elem *tm = NULL;
 395	struct tree_mod_elem **tm_list = NULL;
 396	int nritems = 0;
 397	int ret = 0;
 398	int i;
 399
 400	if (!tree_mod_need_log(fs_info, NULL))
 401		return 0;
 402
 403	if (log_removal && btrfs_header_level(old_root) > 0) {
 404		nritems = btrfs_header_nritems(old_root);
 405		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 406				  GFP_NOFS);
 407		if (!tm_list) {
 408			ret = -ENOMEM;
 409			goto lock;
 410		}
 411		for (i = 0; i < nritems; i++) {
 412			tm_list[i] = alloc_tree_mod_elem(old_root, i,
 413			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
 414			if (!tm_list[i]) {
 415				ret = -ENOMEM;
 416				goto lock;
 417			}
 418		}
 419	}
 420
 421	tm = kzalloc(sizeof(*tm), GFP_NOFS);
 422	if (!tm) {
 423		ret = -ENOMEM;
 424		goto lock;
 425	}
 426
 427	tm->logical = new_root->start;
 428	tm->old_root.logical = old_root->start;
 429	tm->old_root.level = btrfs_header_level(old_root);
 430	tm->generation = btrfs_header_generation(old_root);
 431	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
 432
 433lock:
 434	if (tree_mod_dont_log(fs_info, NULL)) {
 435		/*
 436		 * Don't error if we failed to allocate memory because we don't
 437		 * need to log.
 438		 */
 439		ret = 0;
 440		goto free_tms;
 441	} else if (ret != 0) {
 442		/*
 443		 * We previously failed to allocate memory and we need to log,
 444		 * so we have to fail.
 445		 */
 446		goto out_unlock;
 447	}
 448
 449	if (tm_list)
 450		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
 451	if (!ret)
 452		ret = tree_mod_log_insert(fs_info, tm);
 453
 454out_unlock:
 455	write_unlock(&fs_info->tree_mod_log_lock);
 456	if (ret)
 457		goto free_tms;
 458	kfree(tm_list);
 459
 460	return ret;
 461
 462free_tms:
 463	if (tm_list) {
 464		for (i = 0; i < nritems; i++)
 465			kfree(tm_list[i]);
 466		kfree(tm_list);
 467	}
 468	kfree(tm);
 469
 470	return ret;
 471}
 472
 473static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
 474						   u64 start, u64 min_seq,
 475						   bool smallest)
 476{
 477	struct rb_root *tm_root;
 478	struct rb_node *node;
 479	struct tree_mod_elem *cur = NULL;
 480	struct tree_mod_elem *found = NULL;
 481
 482	read_lock(&fs_info->tree_mod_log_lock);
 483	tm_root = &fs_info->tree_mod_log;
 484	node = tm_root->rb_node;
 485	while (node) {
 486		cur = rb_entry(node, struct tree_mod_elem, node);
 487		if (cur->logical < start) {
 488			node = node->rb_left;
 489		} else if (cur->logical > start) {
 490			node = node->rb_right;
 491		} else if (cur->seq < min_seq) {
 492			node = node->rb_left;
 493		} else if (!smallest) {
 494			/* We want the node with the highest seq */
 495			if (found)
 496				BUG_ON(found->seq > cur->seq);
 497			found = cur;
 498			node = node->rb_left;
 499		} else if (cur->seq > min_seq) {
 500			/* We want the node with the smallest seq */
 501			if (found)
 502				BUG_ON(found->seq < cur->seq);
 503			found = cur;
 504			node = node->rb_right;
 505		} else {
 506			found = cur;
 507			break;
 508		}
 509	}
 510	read_unlock(&fs_info->tree_mod_log_lock);
 511
 512	return found;
 513}
 514
 515/*
 516 * This returns the element from the log with the smallest time sequence
 517 * value that's in the log (the oldest log item). Any element with a time
 518 * sequence lower than min_seq will be ignored.
 519 */
 520static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
 521							u64 start, u64 min_seq)
 522{
 523	return __tree_mod_log_search(fs_info, start, min_seq, true);
 524}
 525
 526/*
 527 * This returns the element from the log with the largest time sequence
 528 * value that's in the log (the most recent log item). Any element with
 529 * a time sequence lower than min_seq will be ignored.
 530 */
 531static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
 532						 u64 start, u64 min_seq)
 533{
 534	return __tree_mod_log_search(fs_info, start, min_seq, false);
 535}
 536
 537int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
 538			       struct extent_buffer *src,
 539			       unsigned long dst_offset,
 540			       unsigned long src_offset,
 541			       int nr_items)
 542{
 543	struct btrfs_fs_info *fs_info = dst->fs_info;
 544	int ret = 0;
 545	struct tree_mod_elem **tm_list = NULL;
 546	struct tree_mod_elem **tm_list_add = NULL;
 547	struct tree_mod_elem **tm_list_rem = NULL;
 548	int i;
 549	bool locked = false;
 550	struct tree_mod_elem *dst_move_tm = NULL;
 551	struct tree_mod_elem *src_move_tm = NULL;
 552	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
 553	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);
 554
 555	if (!tree_mod_need_log(fs_info, NULL))
 556		return 0;
 557
 558	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 559		return 0;
 560
 561	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 562			  GFP_NOFS);
 563	if (!tm_list) {
 564		ret = -ENOMEM;
 565		goto lock;
 566	}
 567
 568	if (dst_move_nr_items) {
 569		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
 570						      dst_offset, dst_move_nr_items);
 571		if (IS_ERR(dst_move_tm)) {
 572			ret = PTR_ERR(dst_move_tm);
 573			dst_move_tm = NULL;
 574			goto lock;
 575		}
 576	}
 577	if (src_move_nr_items) {
 578		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
 579						      src_offset + nr_items,
 580						      src_move_nr_items);
 581		if (IS_ERR(src_move_tm)) {
 582			ret = PTR_ERR(src_move_tm);
 583			src_move_tm = NULL;
 584			goto lock;
 585		}
 586	}
 587
 588	tm_list_add = tm_list;
 589	tm_list_rem = tm_list + nr_items;
 590	for (i = 0; i < nr_items; i++) {
 591		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
 592						     BTRFS_MOD_LOG_KEY_REMOVE);
 593		if (!tm_list_rem[i]) {
 594			ret = -ENOMEM;
 595			goto lock;
 596		}
 597
 598		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
 599						     BTRFS_MOD_LOG_KEY_ADD);
 600		if (!tm_list_add[i]) {
 601			ret = -ENOMEM;
 602			goto lock;
 603		}
 604	}
 605
 606lock:
 607	if (tree_mod_dont_log(fs_info, NULL)) {
 608		/*
 609		 * Don't error if we failed to allocate memory because we don't
 610		 * need to log.
 611		 */
 612		ret = 0;
 613		goto free_tms;
 614	}
 615	locked = true;
 616
 617	/*
 618	 * We previously failed to allocate memory and we need to log, so we
 619	 * have to fail.
 620	 */
 621	if (ret != 0)
 622		goto free_tms;
 623
 624	if (dst_move_tm) {
 625		ret = tree_mod_log_insert(fs_info, dst_move_tm);
 626		if (ret)
 627			goto free_tms;
 628	}
 629	for (i = 0; i < nr_items; i++) {
 630		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
 631		if (ret)
 632			goto free_tms;
 633		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
 634		if (ret)
 635			goto free_tms;
 636	}
 637	if (src_move_tm) {
 638		ret = tree_mod_log_insert(fs_info, src_move_tm);
 639		if (ret)
 640			goto free_tms;
 641	}
 642
 643	write_unlock(&fs_info->tree_mod_log_lock);
 644	kfree(tm_list);
 645
 646	return 0;
 647
 648free_tms:
 649	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
 650		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
 651	kfree(dst_move_tm);
 652	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
 653		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
 654	kfree(src_move_tm);
 655	if (tm_list) {
 656		for (i = 0; i < nr_items * 2; i++) {
 657			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 658				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
 659			kfree(tm_list[i]);
 660		}
 661	}
 662	if (locked)
 663		write_unlock(&fs_info->tree_mod_log_lock);
 664	kfree(tm_list);
 665
 666	return ret;
 667}
 668
 669int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
 670{
 671	struct tree_mod_elem **tm_list = NULL;
 672	int nritems = 0;
 673	int i;
 674	int ret = 0;
 675
 676	if (!tree_mod_need_log(eb->fs_info, eb))
 677		return 0;
 678
 679	nritems = btrfs_header_nritems(eb);
 680	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 681	if (!tm_list) {
 682		ret = -ENOMEM;
 683		goto lock;
 684	}
 685
 686	for (i = 0; i < nritems; i++) {
 687		tm_list[i] = alloc_tree_mod_elem(eb, i,
 688				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
 689		if (!tm_list[i]) {
 690			ret = -ENOMEM;
 691			goto lock;
 692		}
 693	}
 694
 695lock:
 696	if (tree_mod_dont_log(eb->fs_info, eb)) {
 697		/*
 698		 * Don't error if we failed to allocate memory because we don't
 699		 * need to log.
 700		 */
 701		ret = 0;
 702		goto free_tms;
 703	} else if (ret != 0) {
 704		/*
 705		 * We previously failed to allocate memory and we need to log,
 706		 * so we have to fail.
 707		 */
 708		goto out_unlock;
 709	}
 710
 711	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
 712out_unlock:
 713	write_unlock(&eb->fs_info->tree_mod_log_lock);
 714	if (ret)
 715		goto free_tms;
 716	kfree(tm_list);
 717
 718	return 0;
 719
 720free_tms:
 721	if (tm_list) {
 722		for (i = 0; i < nritems; i++)
 723			kfree(tm_list[i]);
 724		kfree(tm_list);
 725	}
 726
 727	return ret;
 728}
 729
 730/*
 731 * Returns the logical address of the oldest predecessor of the given root.
 732 * Entries older than time_seq are ignored.
 733 */
 734static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
 735						      u64 time_seq)
 736{
 737	struct tree_mod_elem *tm;
 738	struct tree_mod_elem *found = NULL;
 739	u64 root_logical = eb_root->start;
 740	bool looped = false;
 741
 742	if (!time_seq)
 743		return NULL;
 744
 745	/*
 746	 * The very last operation that's logged for a root is the replacement
 747	 * operation (if it is replaced at all). This has the logical address
 748	 * of the *new* root, making it the very first operation that's logged
 749	 * for this root.
 750	 */
 751	while (1) {
 752		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
 753						time_seq);
 754		if (!looped && !tm)
 755			return NULL;
 756		/*
 757		 * If there are no tree operation for the oldest root, we simply
 758		 * return it. This should only happen if that (old) root is at
 759		 * level 0.
 760		 */
 761		if (!tm)
 762			break;
 763
 764		/*
 765		 * If there's an operation that's not a root replacement, we
 766		 * found the oldest version of our root. Normally, we'll find a
 767		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
 768		 */
 769		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
 770			break;
 771
 772		found = tm;
 773		root_logical = tm->old_root.logical;
 774		looped = true;
 775	}
 776
 777	/* If there's no old root to return, return what we found instead */
 778	if (!found)
 779		found = tm;
 780
 781	return found;
 782}

/*
 * Rewind @eb to its state at @time_seq. @first_tm points at the first logged
 * operation to rewind; from there, every operation recorded for this buffer
 * with a sequence number of at least @time_seq is undone.
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the eb being rewound at
	 * every step of the rewind. This is in contrast with 'n', which
	 * eventually matches the number of items, but can be wrong during
	 * moves or if removes overlap on already valid slots (which is
	 * probably separately a bug). We do this to validate the offsets of
	 * memmoves for rewinding moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A becomes empty
			 * and B is promoted to be the new root. In the mod
			 * log, we'll have a root-replace operation for B, a
			 * tree block that is not a root. We simply ignore
			 * that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
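
/*
 * Illustrative sketch (plain userspace C, not part of the kernel build): the
 * core of the rewind above is applying the inverse of every logged operation
 * with seq >= time_seq. This demo only shows how the item count 'n' is
 * recovered; key/blockptr restoration and MOVE rewinds are omitted, and all
 * demo_* names are hypothetical.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

enum demo_op { DEMO_KEY_ADD, DEMO_KEY_REMOVE, DEMO_KEY_REPLACE };

struct demo_mod {
	uint64_t seq;
	enum demo_op op;
};

/* @log is ordered by ascending seq; undo every op with seq >= time_seq. */
static uint32_t demo_rewind_nritems(const struct demo_mod *log, size_t n_ops,
				    uint32_t nritems, uint64_t time_seq)
{
	for (size_t i = n_ops; i-- > 0; ) {
		if (log[i].seq < time_seq)
			break;
		switch (log[i].op) {
		case DEMO_KEY_ADD:	/* undoing an add drops an item */
			nritems--;
			break;
		case DEMO_KEY_REMOVE:	/* undoing a remove restores an item */
			nritems++;
			break;
		case DEMO_KEY_REPLACE:	/* in-place update, count unchanged */
			break;
		}
	}
	return nritems;
}
#endif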

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
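
/*
 * Hedged usage sketch (hypothetical caller fragment, not code from the kernel
 * tree): ownership of the read lock and the reference moves to whatever
 * buffer is returned, so the caller must not touch the input buffer
 * afterwards unless it is what came back.
 */
#if 0
	eb = btrfs_tree_mod_log_rewind(fs_info, path, eb, time_seq);
	if (!eb)
		return -ENOMEM;	/* input buffer was already unlocked and freed */
	/* ... read from the rewound buffer, then drop it: */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);
#endif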

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read-locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So look up a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
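
/*
 * Hedged usage sketch (hypothetical caller fragment, not code from the kernel
 * tree): a sequence number pinned earlier with btrfs_get_tree_mod_seq() keeps
 * the relevant log entries alive; btrfs_get_old_root() then hands back a
 * read-locked snapshot of the root node as of that sequence.
 */
#if 0
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	struct extent_buffer *eb;
	u64 time_seq;

	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);

	/* ... concurrent tree modifications get logged here ... */

	eb = btrfs_get_old_root(root, time_seq);
	if (!eb) {
		btrfs_put_tree_mod_seq(fs_info, &elem);
		return -ENOMEM;
	}
	/* ... walk the old root, then release everything: */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);
	btrfs_put_tree_mod_seq(fs_info, &elem);
#endif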

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users, it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}
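
/*
 * Illustrative sketch (plain userspace C, not part of the kernel build):
 * users are queued in ascending sequence order, so the oldest user is always
 * at the head of the list, which is why a single list_first_entry() suffices
 * above. The demo_* name is hypothetical.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

/* @users holds the active sequence numbers, oldest first. */
static uint64_t demo_lowest_seq(const uint64_t *users, size_t n)
{
	return n ? users[0] : 0;
}
#endif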