Linux Audio

Check our new training course

Loading...
v4.17
 
   1/*
   2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_bit.h"
  25#include "xfs_mount.h"
  26#include "xfs_defer.h"
  27#include "xfs_inode.h"
  28#include "xfs_trans.h"
  29#include "xfs_inode_item.h"
  30#include "xfs_buf_item.h"
  31#include "xfs_btree.h"
  32#include "xfs_errortag.h"
  33#include "xfs_error.h"
  34#include "xfs_trace.h"
  35#include "xfs_cksum.h"
  36#include "xfs_alloc.h"
  37#include "xfs_log.h"
  38
  39/*
  40 * Cursor allocation zone.
  41 */
  42kmem_zone_t	*xfs_btree_cur_zone;
  43
  44/*
  45 * Btree magic numbers.
  46 */
  47static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  48	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  49	  XFS_FIBT_MAGIC, 0 },
  50	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  51	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  52	  XFS_REFC_CRC_MAGIC }
  53};
  54
  55uint32_t
  56xfs_btree_magic(
  57	int			crc,
  58	xfs_btnum_t		btnum)
  59{
  60	uint32_t		magic = xfs_magics[crc][btnum];
  61
  62	/* Ensure we asked for crc for crc-only magics. */
  63	ASSERT(magic != 0);
  64	return magic;
  65}
  66
  67/*
  68 * Check a long btree block header.  Return the address of the failing check,
  69 * or NULL if everything is ok.
  70 */
  71xfs_failaddr_t
  72__xfs_btree_check_lblock(
  73	struct xfs_btree_cur	*cur,
  74	struct xfs_btree_block	*block,
  75	int			level,
  76	struct xfs_buf		*bp)
  77{
  78	struct xfs_mount	*mp = cur->bc_mp;
  79	xfs_btnum_t		btnum = cur->bc_btnum;
  80	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
  81
  82	if (crc) {
  83		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  84			return __this_address;
  85		if (block->bb_u.l.bb_blkno !=
  86		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
  87			return __this_address;
  88		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  89			return __this_address;
  90	}
  91
  92	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  93		return __this_address;
  94	if (be16_to_cpu(block->bb_level) != level)
  95		return __this_address;
  96	if (be16_to_cpu(block->bb_numrecs) >
  97	    cur->bc_ops->get_maxrecs(cur, level))
  98		return __this_address;
  99	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
 100	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
 101			level + 1))
 102		return __this_address;
 103	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
 104	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
 105			level + 1))
 106		return __this_address;
 107
 108	return NULL;
 109}
 110
 111/* Check a long btree block header. */
 112static int
 113xfs_btree_check_lblock(
 114	struct xfs_btree_cur	*cur,
 115	struct xfs_btree_block	*block,
 116	int			level,
 117	struct xfs_buf		*bp)
 118{
 119	struct xfs_mount	*mp = cur->bc_mp;
 120	xfs_failaddr_t		fa;
 121
 122	fa = __xfs_btree_check_lblock(cur, block, level, bp);
 123	if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 124			XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
 125		if (bp)
 126			trace_xfs_btree_corrupt(bp, _RET_IP_);
 127		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 128		return -EFSCORRUPTED;
 129	}
 130	return 0;
 131}
 132
 133/*
 134 * Check a short btree block header.  Return the address of the failing check,
 135 * or NULL if everything is ok.
 136 */
 137xfs_failaddr_t
 138__xfs_btree_check_sblock(
 139	struct xfs_btree_cur	*cur,
 140	struct xfs_btree_block	*block,
 141	int			level,
 142	struct xfs_buf		*bp)
 143{
 144	struct xfs_mount	*mp = cur->bc_mp;
 145	xfs_btnum_t		btnum = cur->bc_btnum;
 146	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
 147
 148	if (crc) {
 149		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 150			return __this_address;
 151		if (block->bb_u.s.bb_blkno !=
 152		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
 153			return __this_address;
 154	}
 155
 156	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 157		return __this_address;
 158	if (be16_to_cpu(block->bb_level) != level)
 159		return __this_address;
 160	if (be16_to_cpu(block->bb_numrecs) >
 161	    cur->bc_ops->get_maxrecs(cur, level))
 162		return __this_address;
 163	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 164	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 165			level + 1))
 166		return __this_address;
 167	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 168	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 169			level + 1))
 170		return __this_address;
 171
 172	return NULL;
 173}
 174
 175/* Check a short btree block header. */
 176STATIC int
 177xfs_btree_check_sblock(
 178	struct xfs_btree_cur	*cur,
 179	struct xfs_btree_block	*block,
 180	int			level,
 181	struct xfs_buf		*bp)
 182{
 183	struct xfs_mount	*mp = cur->bc_mp;
 184	xfs_failaddr_t		fa;
 185
 186	fa = __xfs_btree_check_sblock(cur, block, level, bp);
 187	if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 188			XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
 189		if (bp)
 190			trace_xfs_btree_corrupt(bp, _RET_IP_);
 191		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 192		return -EFSCORRUPTED;
 193	}
 194	return 0;
 195}
 196
 197/*
 198 * Debug routine: check that block header is ok.
 199 */
 200int
 201xfs_btree_check_block(
 202	struct xfs_btree_cur	*cur,	/* btree cursor */
 203	struct xfs_btree_block	*block,	/* generic btree block pointer */
 204	int			level,	/* level of the btree block */
 205	struct xfs_buf		*bp)	/* buffer containing block, if any */
 206{
 207	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 208		return xfs_btree_check_lblock(cur, block, level, bp);
 209	else
 210		return xfs_btree_check_sblock(cur, block, level, bp);
 211}
 212
 213/* Check that this long pointer is valid and points within the fs. */
 214bool
 215xfs_btree_check_lptr(
 216	struct xfs_btree_cur	*cur,
 217	xfs_fsblock_t		fsbno,
 218	int			level)
 219{
 220	if (level <= 0)
 221		return false;
 222	return xfs_verify_fsbno(cur->bc_mp, fsbno);
 223}
 224
 225/* Check that this short pointer is valid and points within the AG. */
 226bool
 227xfs_btree_check_sptr(
 228	struct xfs_btree_cur	*cur,
 229	xfs_agblock_t		agbno,
 230	int			level)
 231{
 232	if (level <= 0)
 233		return false;
 234	return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
 235}
 236
 237#ifdef DEBUG
 238/*
 239 * Check that a given (indexed) btree pointer at a certain level of a
 240 * btree is valid and doesn't point past where it should.
 241 */
 242static int
 243xfs_btree_check_ptr(
 244	struct xfs_btree_cur	*cur,
 245	union xfs_btree_ptr	*ptr,
 246	int			index,
 247	int			level)
 248{
 249	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 250		XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 251				xfs_btree_check_lptr(cur,
 252					be64_to_cpu((&ptr->l)[index]), level));
 
 
 
 
 
 253	} else {
 254		XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 255				xfs_btree_check_sptr(cur,
 256					be32_to_cpu((&ptr->s)[index]), level));
 
 
 
 
 257	}
 258
 259	return 0;
 260}
 
 
 
 
 
 261#endif
 262
 263/*
 264 * Calculate CRC on the whole btree block and stuff it into the
 265 * long-form btree header.
 266 *
 267 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 268 * it into the buffer so recovery knows what the last modification was that made
 269 * it to disk.
 270 */
 271void
 272xfs_btree_lblock_calc_crc(
 273	struct xfs_buf		*bp)
 274{
 275	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 276	struct xfs_buf_log_item	*bip = bp->b_log_item;
 277
 278	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 279		return;
 280	if (bip)
 281		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 282	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 283}
 284
 285bool
 286xfs_btree_lblock_verify_crc(
 287	struct xfs_buf		*bp)
 288{
 289	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 290	struct xfs_mount	*mp = bp->b_target->bt_mount;
 291
 292	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 293		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 294			return false;
 295		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 296	}
 297
 298	return true;
 299}
 300
 301/*
 302 * Calculate CRC on the whole btree block and stuff it into the
 303 * short-form btree header.
 304 *
 305 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 306 * it into the buffer so recovery knows what the last modification was that made
 307 * it to disk.
 308 */
 309void
 310xfs_btree_sblock_calc_crc(
 311	struct xfs_buf		*bp)
 312{
 313	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 314	struct xfs_buf_log_item	*bip = bp->b_log_item;
 315
 316	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 317		return;
 318	if (bip)
 319		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 320	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 321}
 322
 323bool
 324xfs_btree_sblock_verify_crc(
 325	struct xfs_buf		*bp)
 326{
 327	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 328	struct xfs_mount	*mp = bp->b_target->bt_mount;
 329
 330	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 331		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 332			return __this_address;
 333		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 334	}
 335
 336	return true;
 337}
 338
 339static int
 340xfs_btree_free_block(
 341	struct xfs_btree_cur	*cur,
 342	struct xfs_buf		*bp)
 343{
 344	int			error;
 345
 346	error = cur->bc_ops->free_block(cur, bp);
 347	if (!error) {
 348		xfs_trans_binval(cur->bc_tp, bp);
 349		XFS_BTREE_STATS_INC(cur, free);
 350	}
 351	return error;
 352}
 353
 354/*
 355 * Delete the btree cursor.
 356 */
 357void
 358xfs_btree_del_cursor(
 359	xfs_btree_cur_t	*cur,		/* btree cursor */
 360	int		error)		/* del because of error */
 361{
 362	int		i;		/* btree level */
 363
 364	/*
 365	 * Clear the buffer pointers, and release the buffers.
 366	 * If we're doing this in the face of an error, we
 367	 * need to make sure to inspect all of the entries
 368	 * in the bc_bufs array for buffers to be unlocked.
 369	 * This is because some of the btree code works from
 370	 * level n down to 0, and if we get an error along
 371	 * the way we won't have initialized all the entries
 372	 * down to 0.
 373	 */
 374	for (i = 0; i < cur->bc_nlevels; i++) {
 375		if (cur->bc_bufs[i])
 376			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 377		else if (!error)
 378			break;
 379	}
 380	/*
 381	 * Can't free a bmap cursor without having dealt with the
 382	 * allocated indirect blocks' accounting.
 383	 */
 384	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 385	       cur->bc_private.b.allocated == 0);
 386	/*
 387	 * Free the cursor.
 388	 */
 389	kmem_zone_free(xfs_btree_cur_zone, cur);
 390}
 391
 392/*
 393 * Duplicate the btree cursor.
 394 * Allocate a new one, copy the record, re-get the buffers.
 395 */
 396int					/* error */
 397xfs_btree_dup_cursor(
 398	xfs_btree_cur_t	*cur,		/* input cursor */
 399	xfs_btree_cur_t	**ncur)		/* output cursor */
 400{
 401	xfs_buf_t	*bp;		/* btree block's buffer pointer */
 402	int		error;		/* error return value */
 403	int		i;		/* level number of btree block */
 404	xfs_mount_t	*mp;		/* mount structure for filesystem */
 405	xfs_btree_cur_t	*new;		/* new cursor value */
 406	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
 407
 408	tp = cur->bc_tp;
 409	mp = cur->bc_mp;
 410
 411	/*
 412	 * Allocate a new cursor like the old one.
 413	 */
 414	new = cur->bc_ops->dup_cursor(cur);
 415
 416	/*
 417	 * Copy the record currently in the cursor.
 418	 */
 419	new->bc_rec = cur->bc_rec;
 420
 421	/*
 422	 * For each level current, re-get the buffer and copy the ptr value.
 423	 */
 424	for (i = 0; i < new->bc_nlevels; i++) {
 425		new->bc_ptrs[i] = cur->bc_ptrs[i];
 426		new->bc_ra[i] = cur->bc_ra[i];
 427		bp = cur->bc_bufs[i];
 428		if (bp) {
 429			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 430						   XFS_BUF_ADDR(bp), mp->m_bsize,
 431						   0, &bp,
 432						   cur->bc_ops->buf_ops);
 433			if (error) {
 434				xfs_btree_del_cursor(new, error);
 435				*ncur = NULL;
 436				return error;
 437			}
 438		}
 439		new->bc_bufs[i] = bp;
 440	}
 441	*ncur = new;
 442	return 0;
 443}
 444
 445/*
 446 * XFS btree block layout and addressing:
 447 *
 448 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 449 *
 450 * The leaf record start with a header then followed by records containing
 451 * the values.  A non-leaf block also starts with the same header, and
 452 * then first contains lookup keys followed by an equal number of pointers
 453 * to the btree blocks at the previous level.
 454 *
 455 *		+--------+-------+-------+-------+-------+-------+-------+
 456 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 457 *		+--------+-------+-------+-------+-------+-------+-------+
 458 *
 459 *		+--------+-------+-------+-------+-------+-------+-------+
 460 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 461 *		+--------+-------+-------+-------+-------+-------+-------+
 462 *
 463 * The header is called struct xfs_btree_block for reasons better left unknown
 464 * and comes in different versions for short (32bit) and long (64bit) block
 465 * pointers.  The record and key structures are defined by the btree instances
 466 * and opaque to the btree core.  The block pointers are simple disk endian
 467 * integers, available in a short (32bit) and long (64bit) variant.
 468 *
 469 * The helpers below calculate the offset of a given record, key or pointer
 470 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 471 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 472 * inside the btree block is done using indices starting at one, not zero!
 473 *
 474 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 475 * overlapping intervals.  In such a tree, records are still sorted lowest to
 476 * highest and indexed by the smallest key value that refers to the record.
 477 * However, nodes are different: each pointer has two associated keys -- one
 478 * indexing the lowest key available in the block(s) below (the same behavior
 479 * as the key in a regular btree) and another indexing the highest key
 480 * available in the block(s) below.  Because records are /not/ sorted by the
 481 * highest key, all leaf block updates require us to compute the highest key
 482 * that matches any record in the leaf and to recursively update the high keys
 483 * in the nodes going further up in the tree, if necessary.  Nodes look like
 484 * this:
 485 *
 486 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 487 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 488 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 489 *
 490 * To perform an interval query on an overlapped tree, perform the usual
 491 * depth-first search and use the low and high keys to decide if we can skip
 492 * that particular node.  If a leaf node is reached, return the records that
 493 * intersect the interval.  Note that an interval query may return numerous
 494 * entries.  For a non-overlapped tree, simply search for the record associated
 495 * with the lowest key and iterate forward until a non-matching record is
 496 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 497 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 498 * more detail.
 499 *
 500 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 501 * reverse mapping records on a reflink filesystem:
 502 *
 503 * 1: +- file A startblock B offset C length D -----------+
 504 * 2:      +- file E startblock F offset G length H --------------+
 505 * 3:      +- file I startblock F offset J length K --+
 506 * 4:                                                        +- file L... --+
 507 *
 508 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 509 * we'd simply increment the length of record 1.  But how do we find the record
 510 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 511 * record 3 because the keys are ordered first by startblock.  An interval
 512 * query would return records 1 and 2 because they both overlap (B+D-1), and
 513 * from that we can pick out record 1 as the appropriate left neighbor.
 514 *
 515 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 516 * because a record's interval must end before the next record.
 517 */
 518
 519/*
 520 * Return size of the btree block header for this btree instance.
 521 */
 522static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 523{
 524	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 525		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 526			return XFS_BTREE_LBLOCK_CRC_LEN;
 527		return XFS_BTREE_LBLOCK_LEN;
 528	}
 529	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 530		return XFS_BTREE_SBLOCK_CRC_LEN;
 531	return XFS_BTREE_SBLOCK_LEN;
 532}
 533
 534/*
 535 * Return size of btree block pointers for this btree instance.
 536 */
 537static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 538{
 539	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 540		sizeof(__be64) : sizeof(__be32);
 541}
 542
 543/*
 544 * Calculate offset of the n-th record in a btree block.
 545 */
 546STATIC size_t
 547xfs_btree_rec_offset(
 548	struct xfs_btree_cur	*cur,
 549	int			n)
 550{
 551	return xfs_btree_block_len(cur) +
 552		(n - 1) * cur->bc_ops->rec_len;
 553}
 554
 555/*
 556 * Calculate offset of the n-th key in a btree block.
 557 */
 558STATIC size_t
 559xfs_btree_key_offset(
 560	struct xfs_btree_cur	*cur,
 561	int			n)
 562{
 563	return xfs_btree_block_len(cur) +
 564		(n - 1) * cur->bc_ops->key_len;
 565}
 566
 567/*
 568 * Calculate offset of the n-th high key in a btree block.
 569 */
 570STATIC size_t
 571xfs_btree_high_key_offset(
 572	struct xfs_btree_cur	*cur,
 573	int			n)
 574{
 575	return xfs_btree_block_len(cur) +
 576		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 577}
 578
 579/*
 580 * Calculate offset of the n-th block pointer in a btree block.
 581 */
 582STATIC size_t
 583xfs_btree_ptr_offset(
 584	struct xfs_btree_cur	*cur,
 585	int			n,
 586	int			level)
 587{
 588	return xfs_btree_block_len(cur) +
 589		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 590		(n - 1) * xfs_btree_ptr_len(cur);
 591}
 592
 593/*
 594 * Return a pointer to the n-th record in the btree block.
 595 */
 596union xfs_btree_rec *
 597xfs_btree_rec_addr(
 598	struct xfs_btree_cur	*cur,
 599	int			n,
 600	struct xfs_btree_block	*block)
 601{
 602	return (union xfs_btree_rec *)
 603		((char *)block + xfs_btree_rec_offset(cur, n));
 604}
 605
 606/*
 607 * Return a pointer to the n-th key in the btree block.
 608 */
 609union xfs_btree_key *
 610xfs_btree_key_addr(
 611	struct xfs_btree_cur	*cur,
 612	int			n,
 613	struct xfs_btree_block	*block)
 614{
 615	return (union xfs_btree_key *)
 616		((char *)block + xfs_btree_key_offset(cur, n));
 617}
 618
 619/*
 620 * Return a pointer to the n-th high key in the btree block.
 621 */
 622union xfs_btree_key *
 623xfs_btree_high_key_addr(
 624	struct xfs_btree_cur	*cur,
 625	int			n,
 626	struct xfs_btree_block	*block)
 627{
 628	return (union xfs_btree_key *)
 629		((char *)block + xfs_btree_high_key_offset(cur, n));
 630}
 631
 632/*
 633 * Return a pointer to the n-th block pointer in the btree block.
 634 */
 635union xfs_btree_ptr *
 636xfs_btree_ptr_addr(
 637	struct xfs_btree_cur	*cur,
 638	int			n,
 639	struct xfs_btree_block	*block)
 640{
 641	int			level = xfs_btree_get_level(block);
 642
 643	ASSERT(block->bb_level != 0);
 644
 645	return (union xfs_btree_ptr *)
 646		((char *)block + xfs_btree_ptr_offset(cur, n, level));
 647}
 648
 649/*
 650 * Get the root block which is stored in the inode.
 651 *
 652 * For now this btree implementation assumes the btree root is always
 653 * stored in the if_broot field of an inode fork.
 654 */
 655STATIC struct xfs_btree_block *
 656xfs_btree_get_iroot(
 657	struct xfs_btree_cur	*cur)
 658{
 659	struct xfs_ifork	*ifp;
 660
 661	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 662	return (struct xfs_btree_block *)ifp->if_broot;
 663}
 664
 665/*
 666 * Retrieve the block pointer from the cursor at the given level.
 667 * This may be an inode btree root or from a buffer.
 668 */
 669struct xfs_btree_block *		/* generic btree block pointer */
 670xfs_btree_get_block(
 671	struct xfs_btree_cur	*cur,	/* btree cursor */
 672	int			level,	/* level in btree */
 673	struct xfs_buf		**bpp)	/* buffer containing the block */
 674{
 675	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 676	    (level == cur->bc_nlevels - 1)) {
 677		*bpp = NULL;
 678		return xfs_btree_get_iroot(cur);
 679	}
 680
 681	*bpp = cur->bc_bufs[level];
 682	return XFS_BUF_TO_BLOCK(*bpp);
 683}
 684
 685/*
 686 * Get a buffer for the block, return it with no data read.
 687 * Long-form addressing.
 688 */
 689xfs_buf_t *				/* buffer for fsbno */
 690xfs_btree_get_bufl(
 691	xfs_mount_t	*mp,		/* file system mount point */
 692	xfs_trans_t	*tp,		/* transaction pointer */
 693	xfs_fsblock_t	fsbno,		/* file system block number */
 694	uint		lock)		/* lock flags for get_buf */
 695{
 696	xfs_daddr_t		d;		/* real disk block address */
 697
 698	ASSERT(fsbno != NULLFSBLOCK);
 699	d = XFS_FSB_TO_DADDR(mp, fsbno);
 700	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 701}
 702
 703/*
 704 * Get a buffer for the block, return it with no data read.
 705 * Short-form addressing.
 706 */
 707xfs_buf_t *				/* buffer for agno/agbno */
 708xfs_btree_get_bufs(
 709	xfs_mount_t	*mp,		/* file system mount point */
 710	xfs_trans_t	*tp,		/* transaction pointer */
 711	xfs_agnumber_t	agno,		/* allocation group number */
 712	xfs_agblock_t	agbno,		/* allocation group block number */
 713	uint		lock)		/* lock flags for get_buf */
 714{
 715	xfs_daddr_t		d;		/* real disk block address */
 716
 717	ASSERT(agno != NULLAGNUMBER);
 718	ASSERT(agbno != NULLAGBLOCK);
 719	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 720	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 721}
 722
 723/*
 724 * Check for the cursor referring to the last block at the given level.
 725 */
 726int					/* 1=is last block, 0=not last block */
 727xfs_btree_islastblock(
 728	xfs_btree_cur_t		*cur,	/* btree cursor */
 729	int			level)	/* level to check */
 730{
 731	struct xfs_btree_block	*block;	/* generic btree block pointer */
 732	xfs_buf_t		*bp;	/* buffer containing block */
 733
 734	block = xfs_btree_get_block(cur, level, &bp);
 735	xfs_btree_check_block(cur, block, level, bp);
 736	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 737		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 738	else
 739		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 740}
 741
 742/*
 743 * Change the cursor to point to the first record at the given level.
 744 * Other levels are unaffected.
 745 */
 746STATIC int				/* success=1, failure=0 */
 747xfs_btree_firstrec(
 748	xfs_btree_cur_t		*cur,	/* btree cursor */
 749	int			level)	/* level to change */
 750{
 751	struct xfs_btree_block	*block;	/* generic btree block pointer */
 752	xfs_buf_t		*bp;	/* buffer containing block */
 753
 754	/*
 755	 * Get the block pointer for this level.
 756	 */
 757	block = xfs_btree_get_block(cur, level, &bp);
 758	if (xfs_btree_check_block(cur, block, level, bp))
 759		return 0;
 760	/*
 761	 * It's empty, there is no such record.
 762	 */
 763	if (!block->bb_numrecs)
 764		return 0;
 765	/*
 766	 * Set the ptr value to 1, that's the first record/key.
 767	 */
 768	cur->bc_ptrs[level] = 1;
 769	return 1;
 770}
 771
 772/*
 773 * Change the cursor to point to the last record in the current block
 774 * at the given level.  Other levels are unaffected.
 775 */
 776STATIC int				/* success=1, failure=0 */
 777xfs_btree_lastrec(
 778	xfs_btree_cur_t		*cur,	/* btree cursor */
 779	int			level)	/* level to change */
 780{
 781	struct xfs_btree_block	*block;	/* generic btree block pointer */
 782	xfs_buf_t		*bp;	/* buffer containing block */
 783
 784	/*
 785	 * Get the block pointer for this level.
 786	 */
 787	block = xfs_btree_get_block(cur, level, &bp);
 788	if (xfs_btree_check_block(cur, block, level, bp))
 789		return 0;
 790	/*
 791	 * It's empty, there is no such record.
 792	 */
 793	if (!block->bb_numrecs)
 794		return 0;
 795	/*
 796	 * Set the ptr value to numrecs, that's the last record/key.
 797	 */
 798	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 799	return 1;
 800}
 801
 802/*
 803 * Compute first and last byte offsets for the fields given.
 804 * Interprets the offsets table, which contains struct field offsets.
 805 */
 806void
 807xfs_btree_offsets(
 808	int64_t		fields,		/* bitmask of fields */
 809	const short	*offsets,	/* table of field offsets */
 810	int		nbits,		/* number of bits to inspect */
 811	int		*first,		/* output: first byte offset */
 812	int		*last)		/* output: last byte offset */
 813{
 814	int		i;		/* current bit number */
 815	int64_t		imask;		/* mask for current bit number */
 816
 817	ASSERT(fields != 0);
 818	/*
 819	 * Find the lowest bit, so the first byte offset.
 820	 */
 821	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 822		if (imask & fields) {
 823			*first = offsets[i];
 824			break;
 825		}
 826	}
 827	/*
 828	 * Find the highest bit, so the last byte offset.
 829	 */
 830	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 831		if (imask & fields) {
 832			*last = offsets[i + 1] - 1;
 833			break;
 834		}
 835	}
 836}
 837
 838/*
 839 * Get a buffer for the block, return it read in.
 840 * Long-form addressing.
 841 */
 842int
 843xfs_btree_read_bufl(
 844	struct xfs_mount	*mp,		/* file system mount point */
 845	struct xfs_trans	*tp,		/* transaction pointer */
 846	xfs_fsblock_t		fsbno,		/* file system block number */
 847	uint			lock,		/* lock flags for read_buf */
 848	struct xfs_buf		**bpp,		/* buffer for fsbno */
 849	int			refval,		/* ref count value for buffer */
 850	const struct xfs_buf_ops *ops)
 851{
 852	struct xfs_buf		*bp;		/* return value */
 853	xfs_daddr_t		d;		/* real disk block address */
 854	int			error;
 855
 856	if (!xfs_verify_fsbno(mp, fsbno))
 857		return -EFSCORRUPTED;
 858	d = XFS_FSB_TO_DADDR(mp, fsbno);
 859	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 860				   mp->m_bsize, lock, &bp, ops);
 861	if (error)
 862		return error;
 863	if (bp)
 864		xfs_buf_set_ref(bp, refval);
 865	*bpp = bp;
 866	return 0;
 867}
 868
 869/*
 870 * Read-ahead the block, don't wait for it, don't return a buffer.
 871 * Long-form addressing.
 872 */
 873/* ARGSUSED */
 874void
 875xfs_btree_reada_bufl(
 876	struct xfs_mount	*mp,		/* file system mount point */
 877	xfs_fsblock_t		fsbno,		/* file system block number */
 878	xfs_extlen_t		count,		/* count of filesystem blocks */
 879	const struct xfs_buf_ops *ops)
 880{
 881	xfs_daddr_t		d;
 882
 883	ASSERT(fsbno != NULLFSBLOCK);
 884	d = XFS_FSB_TO_DADDR(mp, fsbno);
 885	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 886}
 887
 888/*
 889 * Read-ahead the block, don't wait for it, don't return a buffer.
 890 * Short-form addressing.
 891 */
 892/* ARGSUSED */
 893void
 894xfs_btree_reada_bufs(
 895	struct xfs_mount	*mp,		/* file system mount point */
 896	xfs_agnumber_t		agno,		/* allocation group number */
 897	xfs_agblock_t		agbno,		/* allocation group block number */
 898	xfs_extlen_t		count,		/* count of filesystem blocks */
 899	const struct xfs_buf_ops *ops)
 900{
 901	xfs_daddr_t		d;
 902
 903	ASSERT(agno != NULLAGNUMBER);
 904	ASSERT(agbno != NULLAGBLOCK);
 905	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 906	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 907}
 908
 909STATIC int
 910xfs_btree_readahead_lblock(
 911	struct xfs_btree_cur	*cur,
 912	int			lr,
 913	struct xfs_btree_block	*block)
 914{
 915	int			rval = 0;
 916	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 917	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 918
 919	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 920		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 921				     cur->bc_ops->buf_ops);
 922		rval++;
 923	}
 924
 925	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 926		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 927				     cur->bc_ops->buf_ops);
 928		rval++;
 929	}
 930
 931	return rval;
 932}
 933
 934STATIC int
 935xfs_btree_readahead_sblock(
 936	struct xfs_btree_cur	*cur,
 937	int			lr,
 938	struct xfs_btree_block *block)
 939{
 940	int			rval = 0;
 941	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 942	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 943
 944
 945	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 946		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 947				     left, 1, cur->bc_ops->buf_ops);
 948		rval++;
 949	}
 950
 951	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 952		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 953				     right, 1, cur->bc_ops->buf_ops);
 954		rval++;
 955	}
 956
 957	return rval;
 958}
 959
 960/*
 961 * Read-ahead btree blocks, at the given level.
 962 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 963 */
 964STATIC int
 965xfs_btree_readahead(
 966	struct xfs_btree_cur	*cur,		/* btree cursor */
 967	int			lev,		/* level in btree */
 968	int			lr)		/* left/right bits */
 969{
 970	struct xfs_btree_block	*block;
 971
 972	/*
 973	 * No readahead needed if we are at the root level and the
 974	 * btree root is stored in the inode.
 975	 */
 976	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 977	    (lev == cur->bc_nlevels - 1))
 978		return 0;
 979
 980	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 981		return 0;
 982
 983	cur->bc_ra[lev] |= lr;
 984	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 985
 986	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 987		return xfs_btree_readahead_lblock(cur, lr, block);
 988	return xfs_btree_readahead_sblock(cur, lr, block);
 989}
 990
 991STATIC xfs_daddr_t
 992xfs_btree_ptr_to_daddr(
 993	struct xfs_btree_cur	*cur,
 994	union xfs_btree_ptr	*ptr)
 
 995{
 996	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 997		ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
 
 998
 999		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1000	} else {
1001		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
1002		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
1003
1004		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1005					be32_to_cpu(ptr->s));
 
 
 
 
 
1006	}
 
 
1007}
1008
1009/*
1010 * Readahead @count btree blocks at the given @ptr location.
1011 *
1012 * We don't need to care about long or short form btrees here as we have a
1013 * method of converting the ptr directly to a daddr available to us.
1014 */
1015STATIC void
1016xfs_btree_readahead_ptr(
1017	struct xfs_btree_cur	*cur,
1018	union xfs_btree_ptr	*ptr,
1019	xfs_extlen_t		count)
1020{
1021	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
1022			  xfs_btree_ptr_to_daddr(cur, ptr),
 
 
 
1023			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1024}
1025
1026/*
1027 * Set the buffer for level "lev" in the cursor to bp, releasing
1028 * any previous buffer.
1029 */
1030STATIC void
1031xfs_btree_setbuf(
1032	xfs_btree_cur_t		*cur,	/* btree cursor */
1033	int			lev,	/* level in btree */
1034	xfs_buf_t		*bp)	/* new buffer to set */
1035{
1036	struct xfs_btree_block	*b;	/* btree block */
1037
1038	if (cur->bc_bufs[lev])
1039		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1040	cur->bc_bufs[lev] = bp;
1041	cur->bc_ra[lev] = 0;
1042
1043	b = XFS_BUF_TO_BLOCK(bp);
1044	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1045		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1046			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1047		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1048			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1049	} else {
1050		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1051			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1052		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1053			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1054	}
1055}
1056
1057bool
1058xfs_btree_ptr_is_null(
1059	struct xfs_btree_cur	*cur,
1060	union xfs_btree_ptr	*ptr)
1061{
1062	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1063		return ptr->l == cpu_to_be64(NULLFSBLOCK);
1064	else
1065		return ptr->s == cpu_to_be32(NULLAGBLOCK);
1066}
1067
1068STATIC void
1069xfs_btree_set_ptr_null(
1070	struct xfs_btree_cur	*cur,
1071	union xfs_btree_ptr	*ptr)
1072{
1073	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1074		ptr->l = cpu_to_be64(NULLFSBLOCK);
1075	else
1076		ptr->s = cpu_to_be32(NULLAGBLOCK);
1077}
1078
1079/*
1080 * Get/set/init sibling pointers
1081 */
1082void
1083xfs_btree_get_sibling(
1084	struct xfs_btree_cur	*cur,
1085	struct xfs_btree_block	*block,
1086	union xfs_btree_ptr	*ptr,
1087	int			lr)
1088{
1089	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1090
1091	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1092		if (lr == XFS_BB_RIGHTSIB)
1093			ptr->l = block->bb_u.l.bb_rightsib;
1094		else
1095			ptr->l = block->bb_u.l.bb_leftsib;
1096	} else {
1097		if (lr == XFS_BB_RIGHTSIB)
1098			ptr->s = block->bb_u.s.bb_rightsib;
1099		else
1100			ptr->s = block->bb_u.s.bb_leftsib;
1101	}
1102}
1103
1104STATIC void
1105xfs_btree_set_sibling(
1106	struct xfs_btree_cur	*cur,
1107	struct xfs_btree_block	*block,
1108	union xfs_btree_ptr	*ptr,
1109	int			lr)
1110{
1111	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1112
1113	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1114		if (lr == XFS_BB_RIGHTSIB)
1115			block->bb_u.l.bb_rightsib = ptr->l;
1116		else
1117			block->bb_u.l.bb_leftsib = ptr->l;
1118	} else {
1119		if (lr == XFS_BB_RIGHTSIB)
1120			block->bb_u.s.bb_rightsib = ptr->s;
1121		else
1122			block->bb_u.s.bb_leftsib = ptr->s;
1123	}
1124}
1125
1126void
1127xfs_btree_init_block_int(
1128	struct xfs_mount	*mp,
1129	struct xfs_btree_block	*buf,
1130	xfs_daddr_t		blkno,
1131	xfs_btnum_t		btnum,
1132	__u16			level,
1133	__u16			numrecs,
1134	__u64			owner,
1135	unsigned int		flags)
1136{
1137	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
1138	__u32			magic = xfs_btree_magic(crc, btnum);
1139
1140	buf->bb_magic = cpu_to_be32(magic);
1141	buf->bb_level = cpu_to_be16(level);
1142	buf->bb_numrecs = cpu_to_be16(numrecs);
1143
1144	if (flags & XFS_BTREE_LONG_PTRS) {
1145		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1146		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1147		if (crc) {
1148			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1149			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1150			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1151			buf->bb_u.l.bb_pad = 0;
1152			buf->bb_u.l.bb_lsn = 0;
1153		}
1154	} else {
1155		/* owner is a 32 bit value on short blocks */
1156		__u32 __owner = (__u32)owner;
1157
1158		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1159		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1160		if (crc) {
1161			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1162			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1163			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1164			buf->bb_u.s.bb_lsn = 0;
1165		}
1166	}
1167}
1168
1169void
1170xfs_btree_init_block(
1171	struct xfs_mount *mp,
1172	struct xfs_buf	*bp,
1173	xfs_btnum_t	btnum,
1174	__u16		level,
1175	__u16		numrecs,
1176	__u64		owner,
1177	unsigned int	flags)
1178{
1179	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1180				 btnum, level, numrecs, owner, flags);
1181}
1182
1183STATIC void
1184xfs_btree_init_block_cur(
1185	struct xfs_btree_cur	*cur,
1186	struct xfs_buf		*bp,
1187	int			level,
1188	int			numrecs)
1189{
1190	__u64			owner;
1191
1192	/*
1193	 * we can pull the owner from the cursor right now as the different
1194	 * owners align directly with the pointer size of the btree. This may
1195	 * change in future, but is safe for current users of the generic btree
1196	 * code.
1197	 */
1198	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1199		owner = cur->bc_private.b.ip->i_ino;
1200	else
1201		owner = cur->bc_private.a.agno;
1202
1203	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1204				 cur->bc_btnum, level, numrecs,
1205				 owner, cur->bc_flags);
1206}
1207
1208/*
1209 * Return true if ptr is the last record in the btree and
1210 * we need to track updates to this record.  The decision
1211 * will be further refined in the update_lastrec method.
1212 */
1213STATIC int
1214xfs_btree_is_lastrec(
1215	struct xfs_btree_cur	*cur,
1216	struct xfs_btree_block	*block,
1217	int			level)
1218{
1219	union xfs_btree_ptr	ptr;
1220
1221	if (level > 0)
1222		return 0;
1223	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1224		return 0;
1225
1226	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1227	if (!xfs_btree_ptr_is_null(cur, &ptr))
1228		return 0;
1229	return 1;
1230}
1231
1232STATIC void
1233xfs_btree_buf_to_ptr(
1234	struct xfs_btree_cur	*cur,
1235	struct xfs_buf		*bp,
1236	union xfs_btree_ptr	*ptr)
1237{
1238	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1239		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1240					XFS_BUF_ADDR(bp)));
1241	else {
1242		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1243					XFS_BUF_ADDR(bp)));
1244	}
1245}
1246
1247STATIC void
1248xfs_btree_set_refs(
1249	struct xfs_btree_cur	*cur,
1250	struct xfs_buf		*bp)
1251{
1252	switch (cur->bc_btnum) {
1253	case XFS_BTNUM_BNO:
1254	case XFS_BTNUM_CNT:
1255		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1256		break;
1257	case XFS_BTNUM_INO:
1258	case XFS_BTNUM_FINO:
1259		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1260		break;
1261	case XFS_BTNUM_BMAP:
1262		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1263		break;
1264	case XFS_BTNUM_RMAP:
1265		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1266		break;
1267	case XFS_BTNUM_REFC:
1268		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1269		break;
1270	default:
1271		ASSERT(0);
1272	}
1273}
1274
1275STATIC int
1276xfs_btree_get_buf_block(
1277	struct xfs_btree_cur	*cur,
1278	union xfs_btree_ptr	*ptr,
1279	int			flags,
1280	struct xfs_btree_block	**block,
1281	struct xfs_buf		**bpp)
1282{
1283	struct xfs_mount	*mp = cur->bc_mp;
1284	xfs_daddr_t		d;
 
1285
1286	/* need to sort out how callers deal with failures first */
1287	ASSERT(!(flags & XBF_TRYLOCK));
1288
1289	d = xfs_btree_ptr_to_daddr(cur, ptr);
1290	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1291				 mp->m_bsize, flags);
1292
1293	if (!*bpp)
1294		return -ENOMEM;
1295
1296	(*bpp)->b_ops = cur->bc_ops->buf_ops;
1297	*block = XFS_BUF_TO_BLOCK(*bpp);
1298	return 0;
1299}
1300
1301/*
1302 * Read in the buffer at the given ptr and return the buffer and
1303 * the block pointer within the buffer.
1304 */
1305STATIC int
1306xfs_btree_read_buf_block(
1307	struct xfs_btree_cur	*cur,
1308	union xfs_btree_ptr	*ptr,
1309	int			flags,
1310	struct xfs_btree_block	**block,
1311	struct xfs_buf		**bpp)
1312{
1313	struct xfs_mount	*mp = cur->bc_mp;
1314	xfs_daddr_t		d;
1315	int			error;
1316
1317	/* need to sort out how callers deal with failures first */
1318	ASSERT(!(flags & XBF_TRYLOCK));
1319
1320	d = xfs_btree_ptr_to_daddr(cur, ptr);
 
 
1321	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1322				   mp->m_bsize, flags, bpp,
1323				   cur->bc_ops->buf_ops);
1324	if (error)
1325		return error;
1326
1327	xfs_btree_set_refs(cur, *bpp);
1328	*block = XFS_BUF_TO_BLOCK(*bpp);
1329	return 0;
1330}
1331
1332/*
1333 * Copy keys from one btree block to another.
1334 */
1335STATIC void
1336xfs_btree_copy_keys(
1337	struct xfs_btree_cur	*cur,
1338	union xfs_btree_key	*dst_key,
1339	union xfs_btree_key	*src_key,
1340	int			numkeys)
1341{
1342	ASSERT(numkeys >= 0);
1343	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1344}
1345
1346/*
1347 * Copy records from one btree block to another.
1348 */
1349STATIC void
1350xfs_btree_copy_recs(
1351	struct xfs_btree_cur	*cur,
1352	union xfs_btree_rec	*dst_rec,
1353	union xfs_btree_rec	*src_rec,
1354	int			numrecs)
1355{
1356	ASSERT(numrecs >= 0);
1357	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1358}
1359
1360/*
1361 * Copy block pointers from one btree block to another.
1362 */
1363STATIC void
1364xfs_btree_copy_ptrs(
1365	struct xfs_btree_cur	*cur,
1366	union xfs_btree_ptr	*dst_ptr,
1367	union xfs_btree_ptr	*src_ptr,
1368	int			numptrs)
1369{
1370	ASSERT(numptrs >= 0);
1371	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1372}
1373
1374/*
1375 * Shift keys one index left/right inside a single btree block.
1376 */
1377STATIC void
1378xfs_btree_shift_keys(
1379	struct xfs_btree_cur	*cur,
1380	union xfs_btree_key	*key,
1381	int			dir,
1382	int			numkeys)
1383{
1384	char			*dst_key;
1385
1386	ASSERT(numkeys >= 0);
1387	ASSERT(dir == 1 || dir == -1);
1388
1389	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1390	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1391}
1392
1393/*
1394 * Shift records one index left/right inside a single btree block.
1395 */
1396STATIC void
1397xfs_btree_shift_recs(
1398	struct xfs_btree_cur	*cur,
1399	union xfs_btree_rec	*rec,
1400	int			dir,
1401	int			numrecs)
1402{
1403	char			*dst_rec;
1404
1405	ASSERT(numrecs >= 0);
1406	ASSERT(dir == 1 || dir == -1);
1407
1408	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1409	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1410}
1411
1412/*
1413 * Shift block pointers one index left/right inside a single btree block.
1414 */
1415STATIC void
1416xfs_btree_shift_ptrs(
1417	struct xfs_btree_cur	*cur,
1418	union xfs_btree_ptr	*ptr,
1419	int			dir,
1420	int			numptrs)
1421{
1422	char			*dst_ptr;
1423
1424	ASSERT(numptrs >= 0);
1425	ASSERT(dir == 1 || dir == -1);
1426
1427	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1428	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1429}
1430
1431/*
1432 * Log key values from the btree block.
1433 */
1434STATIC void
1435xfs_btree_log_keys(
1436	struct xfs_btree_cur	*cur,
1437	struct xfs_buf		*bp,
1438	int			first,
1439	int			last)
1440{
1441
1442	if (bp) {
1443		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1444		xfs_trans_log_buf(cur->bc_tp, bp,
1445				  xfs_btree_key_offset(cur, first),
1446				  xfs_btree_key_offset(cur, last + 1) - 1);
1447	} else {
1448		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1449				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1450	}
1451}
1452
1453/*
1454 * Log record values from the btree block.
1455 */
1456void
1457xfs_btree_log_recs(
1458	struct xfs_btree_cur	*cur,
1459	struct xfs_buf		*bp,
1460	int			first,
1461	int			last)
1462{
1463
1464	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1465	xfs_trans_log_buf(cur->bc_tp, bp,
1466			  xfs_btree_rec_offset(cur, first),
1467			  xfs_btree_rec_offset(cur, last + 1) - 1);
1468
1469}
1470
1471/*
1472 * Log block pointer fields from a btree block (nonleaf).
1473 */
1474STATIC void
1475xfs_btree_log_ptrs(
1476	struct xfs_btree_cur	*cur,	/* btree cursor */
1477	struct xfs_buf		*bp,	/* buffer containing btree block */
1478	int			first,	/* index of first pointer to log */
1479	int			last)	/* index of last pointer to log */
1480{
1481
1482	if (bp) {
1483		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
1484		int			level = xfs_btree_get_level(block);
1485
1486		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1487		xfs_trans_log_buf(cur->bc_tp, bp,
1488				xfs_btree_ptr_offset(cur, first, level),
1489				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1490	} else {
1491		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1492			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1493	}
1494
1495}
1496
1497/*
1498 * Log fields from a btree block header.
1499 */
1500void
1501xfs_btree_log_block(
1502	struct xfs_btree_cur	*cur,	/* btree cursor */
1503	struct xfs_buf		*bp,	/* buffer containing btree block */
1504	int			fields)	/* mask of fields: XFS_BB_... */
1505{
1506	int			first;	/* first byte offset logged */
1507	int			last;	/* last byte offset logged */
1508	static const short	soffsets[] = {	/* table of offsets (short) */
1509		offsetof(struct xfs_btree_block, bb_magic),
1510		offsetof(struct xfs_btree_block, bb_level),
1511		offsetof(struct xfs_btree_block, bb_numrecs),
1512		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1513		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1514		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1515		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1516		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1517		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1518		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1519		XFS_BTREE_SBLOCK_CRC_LEN
1520	};
1521	static const short	loffsets[] = {	/* table of offsets (long) */
1522		offsetof(struct xfs_btree_block, bb_magic),
1523		offsetof(struct xfs_btree_block, bb_level),
1524		offsetof(struct xfs_btree_block, bb_numrecs),
1525		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1526		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1527		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1528		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1529		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1530		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1531		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1532		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1533		XFS_BTREE_LBLOCK_CRC_LEN
1534	};
1535
1536	if (bp) {
1537		int nbits;
1538
1539		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1540			/*
1541			 * We don't log the CRC when updating a btree
1542			 * block but instead recreate it during log
1543			 * recovery.  As the log buffers have checksums
1544			 * of their own this is safe and avoids logging a crc
1545			 * update in a lot of places.
1546			 */
1547			if (fields == XFS_BB_ALL_BITS)
1548				fields = XFS_BB_ALL_BITS_CRC;
1549			nbits = XFS_BB_NUM_BITS_CRC;
1550		} else {
1551			nbits = XFS_BB_NUM_BITS;
1552		}
1553		xfs_btree_offsets(fields,
1554				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1555					loffsets : soffsets,
1556				  nbits, &first, &last);
1557		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1558		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1559	} else {
1560		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1561			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1562	}
1563}
1564
1565/*
1566 * Increment cursor by one record at the level.
1567 * For nonzero levels the leaf-ward information is untouched.
1568 */
1569int						/* error */
1570xfs_btree_increment(
1571	struct xfs_btree_cur	*cur,
1572	int			level,
1573	int			*stat)		/* success/failure */
1574{
1575	struct xfs_btree_block	*block;
1576	union xfs_btree_ptr	ptr;
1577	struct xfs_buf		*bp;
1578	int			error;		/* error return value */
1579	int			lev;
1580
1581	ASSERT(level < cur->bc_nlevels);
1582
1583	/* Read-ahead to the right at this level. */
1584	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1585
1586	/* Get a pointer to the btree block. */
1587	block = xfs_btree_get_block(cur, level, &bp);
1588
1589#ifdef DEBUG
1590	error = xfs_btree_check_block(cur, block, level, bp);
1591	if (error)
1592		goto error0;
1593#endif
1594
1595	/* We're done if we remain in the block after the increment. */
1596	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1597		goto out1;
1598
1599	/* Fail if we just went off the right edge of the tree. */
1600	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1601	if (xfs_btree_ptr_is_null(cur, &ptr))
1602		goto out0;
1603
1604	XFS_BTREE_STATS_INC(cur, increment);
1605
1606	/*
1607	 * March up the tree incrementing pointers.
1608	 * Stop when we don't go off the right edge of a block.
1609	 */
1610	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1611		block = xfs_btree_get_block(cur, lev, &bp);
1612
1613#ifdef DEBUG
1614		error = xfs_btree_check_block(cur, block, lev, bp);
1615		if (error)
1616			goto error0;
1617#endif
1618
1619		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1620			break;
1621
1622		/* Read-ahead the right block for the next loop. */
1623		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1624	}
1625
1626	/*
1627	 * If we went off the root then we are either seriously
1628	 * confused or have the tree root in an inode.
1629	 */
1630	if (lev == cur->bc_nlevels) {
1631		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1632			goto out0;
1633		ASSERT(0);
1634		error = -EFSCORRUPTED;
1635		goto error0;
1636	}
1637	ASSERT(lev < cur->bc_nlevels);
1638
1639	/*
1640	 * Now walk back down the tree, fixing up the cursor's buffer
1641	 * pointers and key numbers.
1642	 */
1643	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1644		union xfs_btree_ptr	*ptrp;
1645
1646		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1647		--lev;
1648		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1649		if (error)
1650			goto error0;
1651
1652		xfs_btree_setbuf(cur, lev, bp);
1653		cur->bc_ptrs[lev] = 1;
1654	}
1655out1:
1656	*stat = 1;
1657	return 0;
1658
1659out0:
1660	*stat = 0;
1661	return 0;
1662
1663error0:
1664	return error;
1665}
1666
1667/*
1668 * Decrement cursor by one record at the level.
1669 * For nonzero levels the leaf-ward information is untouched.
1670 */
1671int						/* error */
1672xfs_btree_decrement(
1673	struct xfs_btree_cur	*cur,
1674	int			level,
1675	int			*stat)		/* success/failure */
1676{
1677	struct xfs_btree_block	*block;
1678	xfs_buf_t		*bp;
1679	int			error;		/* error return value */
1680	int			lev;
1681	union xfs_btree_ptr	ptr;
1682
1683	ASSERT(level < cur->bc_nlevels);
1684
1685	/* Read-ahead to the left at this level. */
1686	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1687
1688	/* We're done if we remain in the block after the decrement. */
1689	if (--cur->bc_ptrs[level] > 0)
1690		goto out1;
1691
1692	/* Get a pointer to the btree block. */
1693	block = xfs_btree_get_block(cur, level, &bp);
1694
1695#ifdef DEBUG
1696	error = xfs_btree_check_block(cur, block, level, bp);
1697	if (error)
1698		goto error0;
1699#endif
1700
1701	/* Fail if we just went off the left edge of the tree. */
1702	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1703	if (xfs_btree_ptr_is_null(cur, &ptr))
1704		goto out0;
1705
1706	XFS_BTREE_STATS_INC(cur, decrement);
1707
1708	/*
1709	 * March up the tree decrementing pointers.
1710	 * Stop when we don't go off the left edge of a block.
1711	 */
1712	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1713		if (--cur->bc_ptrs[lev] > 0)
1714			break;
1715		/* Read-ahead the left block for the next loop. */
1716		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1717	}
1718
1719	/*
1720	 * If we went off the root then we are seriously confused.
1721	 * or the root of the tree is in an inode.
1722	 */
1723	if (lev == cur->bc_nlevels) {
1724		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1725			goto out0;
1726		ASSERT(0);
1727		error = -EFSCORRUPTED;
1728		goto error0;
1729	}
1730	ASSERT(lev < cur->bc_nlevels);
1731
1732	/*
1733	 * Now walk back down the tree, fixing up the cursor's buffer
1734	 * pointers and key numbers.
1735	 */
1736	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1737		union xfs_btree_ptr	*ptrp;
1738
1739		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1740		--lev;
1741		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1742		if (error)
1743			goto error0;
1744		xfs_btree_setbuf(cur, lev, bp);
1745		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1746	}
1747out1:
1748	*stat = 1;
1749	return 0;
1750
1751out0:
1752	*stat = 0;
1753	return 0;
1754
1755error0:
1756	return error;
1757}
1758
1759int
1760xfs_btree_lookup_get_block(
1761	struct xfs_btree_cur	*cur,	/* btree cursor */
1762	int			level,	/* level in the btree */
1763	union xfs_btree_ptr	*pp,	/* ptr to btree block */
1764	struct xfs_btree_block	**blkp) /* return btree block */
1765{
1766	struct xfs_buf		*bp;	/* buffer pointer for btree block */
 
1767	int			error = 0;
1768
1769	/* special case the root block if in an inode */
1770	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1771	    (level == cur->bc_nlevels - 1)) {
1772		*blkp = xfs_btree_get_iroot(cur);
1773		return 0;
1774	}
1775
1776	/*
1777	 * If the old buffer at this level for the disk address we are
1778	 * looking for re-use it.
1779	 *
1780	 * Otherwise throw it away and get a new one.
1781	 */
1782	bp = cur->bc_bufs[level];
1783	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
 
 
 
1784		*blkp = XFS_BUF_TO_BLOCK(bp);
1785		return 0;
1786	}
1787
1788	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1789	if (error)
1790		return error;
1791
1792	/* Check the inode owner since the verifiers don't. */
1793	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1794	    !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1795	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1796	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1797			cur->bc_private.b.ip->i_ino)
1798		goto out_bad;
1799
1800	/* Did we get the level we were looking for? */
1801	if (be16_to_cpu((*blkp)->bb_level) != level)
1802		goto out_bad;
1803
1804	/* Check that internal nodes have at least one record. */
1805	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1806		goto out_bad;
1807
1808	xfs_btree_setbuf(cur, level, bp);
1809	return 0;
1810
1811out_bad:
1812	*blkp = NULL;
1813	xfs_trans_brelse(cur->bc_tp, bp);
1814	return -EFSCORRUPTED;
1815}
1816
1817/*
1818 * Get current search key.  For level 0 we don't actually have a key
1819 * structure so we make one up from the record.  For all other levels
1820 * we just return the right key.
1821 */
1822STATIC union xfs_btree_key *
1823xfs_lookup_get_search_key(
1824	struct xfs_btree_cur	*cur,
1825	int			level,
1826	int			keyno,
1827	struct xfs_btree_block	*block,
1828	union xfs_btree_key	*kp)
1829{
1830	if (level == 0) {
1831		cur->bc_ops->init_key_from_rec(kp,
1832				xfs_btree_rec_addr(cur, keyno, block));
1833		return kp;
1834	}
1835
1836	return xfs_btree_key_addr(cur, keyno, block);
1837}
1838
1839/*
1840 * Lookup the record.  The cursor is made to point to it, based on dir.
1841 * stat is set to 0 if can't find any such record, 1 for success.
1842 */
1843int					/* error */
1844xfs_btree_lookup(
1845	struct xfs_btree_cur	*cur,	/* btree cursor */
1846	xfs_lookup_t		dir,	/* <=, ==, or >= */
1847	int			*stat)	/* success/failure */
1848{
1849	struct xfs_btree_block	*block;	/* current btree block */
1850	int64_t			diff;	/* difference for the current key */
1851	int			error;	/* error return value */
1852	int			keyno;	/* current key number */
1853	int			level;	/* level in the btree */
1854	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1855	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1856
1857	XFS_BTREE_STATS_INC(cur, lookup);
1858
1859	/* No such thing as a zero-level tree. */
1860	if (cur->bc_nlevels == 0)
1861		return -EFSCORRUPTED;
1862
1863	block = NULL;
1864	keyno = 0;
1865
1866	/* initialise start pointer from cursor */
1867	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1868	pp = &ptr;
1869
1870	/*
1871	 * Iterate over each level in the btree, starting at the root.
1872	 * For each level above the leaves, find the key we need, based
1873	 * on the lookup record, then follow the corresponding block
1874	 * pointer down to the next level.
1875	 */
1876	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1877		/* Get the block we need to do the lookup on. */
1878		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1879		if (error)
1880			goto error0;
1881
1882		if (diff == 0) {
1883			/*
1884			 * If we already had a key match at a higher level, we
1885			 * know we need to use the first entry in this block.
1886			 */
1887			keyno = 1;
1888		} else {
1889			/* Otherwise search this block. Do a binary search. */
1890
1891			int	high;	/* high entry number */
1892			int	low;	/* low entry number */
1893
1894			/* Set low and high entry numbers, 1-based. */
1895			low = 1;
1896			high = xfs_btree_get_numrecs(block);
1897			if (!high) {
1898				/* Block is empty, must be an empty leaf. */
1899				ASSERT(level == 0 && cur->bc_nlevels == 1);
 
 
 
 
 
 
1900
1901				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1902				*stat = 0;
1903				return 0;
1904			}
1905
1906			/* Binary search the block. */
1907			while (low <= high) {
1908				union xfs_btree_key	key;
1909				union xfs_btree_key	*kp;
1910
1911				XFS_BTREE_STATS_INC(cur, compare);
1912
1913				/* keyno is average of low and high. */
1914				keyno = (low + high) >> 1;
1915
1916				/* Get current search key */
1917				kp = xfs_lookup_get_search_key(cur, level,
1918						keyno, block, &key);
1919
1920				/*
1921				 * Compute difference to get next direction:
1922				 *  - less than, move right
1923				 *  - greater than, move left
1924				 *  - equal, we're done
1925				 */
1926				diff = cur->bc_ops->key_diff(cur, kp);
1927				if (diff < 0)
1928					low = keyno + 1;
1929				else if (diff > 0)
1930					high = keyno - 1;
1931				else
1932					break;
1933			}
1934		}
1935
1936		/*
1937		 * If there are more levels, set up for the next level
1938		 * by getting the block number and filling in the cursor.
1939		 */
1940		if (level > 0) {
1941			/*
1942			 * If we moved left, need the previous key number,
1943			 * unless there isn't one.
1944			 */
1945			if (diff > 0 && --keyno < 1)
1946				keyno = 1;
1947			pp = xfs_btree_ptr_addr(cur, keyno, block);
1948
1949#ifdef DEBUG
1950			error = xfs_btree_check_ptr(cur, pp, 0, level);
1951			if (error)
1952				goto error0;
1953#endif
1954			cur->bc_ptrs[level] = keyno;
1955		}
1956	}
1957
1958	/* Done with the search. See if we need to adjust the results. */
1959	if (dir != XFS_LOOKUP_LE && diff < 0) {
1960		keyno++;
1961		/*
1962		 * If ge search and we went off the end of the block, but it's
1963		 * not the last block, we're in the wrong block.
1964		 */
1965		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1966		if (dir == XFS_LOOKUP_GE &&
1967		    keyno > xfs_btree_get_numrecs(block) &&
1968		    !xfs_btree_ptr_is_null(cur, &ptr)) {
1969			int	i;
1970
1971			cur->bc_ptrs[0] = keyno;
1972			error = xfs_btree_increment(cur, 0, &i);
1973			if (error)
1974				goto error0;
1975			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1976			*stat = 1;
1977			return 0;
1978		}
1979	} else if (dir == XFS_LOOKUP_LE && diff > 0)
1980		keyno--;
1981	cur->bc_ptrs[0] = keyno;
1982
1983	/* Return if we succeeded or not. */
1984	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1985		*stat = 0;
1986	else if (dir != XFS_LOOKUP_EQ || diff == 0)
1987		*stat = 1;
1988	else
1989		*stat = 0;
1990	return 0;
1991
1992error0:
1993	return error;
1994}
1995
1996/* Find the high key storage area from a regular key. */
1997union xfs_btree_key *
1998xfs_btree_high_key_from_key(
1999	struct xfs_btree_cur	*cur,
2000	union xfs_btree_key	*key)
2001{
2002	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2003	return (union xfs_btree_key *)((char *)key +
2004			(cur->bc_ops->key_len / 2));
2005}
2006
2007/* Determine the low (and high if overlapped) keys of a leaf block */
2008STATIC void
2009xfs_btree_get_leaf_keys(
2010	struct xfs_btree_cur	*cur,
2011	struct xfs_btree_block	*block,
2012	union xfs_btree_key	*key)
2013{
2014	union xfs_btree_key	max_hkey;
2015	union xfs_btree_key	hkey;
2016	union xfs_btree_rec	*rec;
2017	union xfs_btree_key	*high;
2018	int			n;
2019
2020	rec = xfs_btree_rec_addr(cur, 1, block);
2021	cur->bc_ops->init_key_from_rec(key, rec);
2022
2023	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2024
2025		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2026		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2027			rec = xfs_btree_rec_addr(cur, n, block);
2028			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2029			if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2030					> 0)
2031				max_hkey = hkey;
2032		}
2033
2034		high = xfs_btree_high_key_from_key(cur, key);
2035		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2036	}
2037}
2038
2039/* Determine the low (and high if overlapped) keys of a node block */
2040STATIC void
2041xfs_btree_get_node_keys(
2042	struct xfs_btree_cur	*cur,
2043	struct xfs_btree_block	*block,
2044	union xfs_btree_key	*key)
2045{
2046	union xfs_btree_key	*hkey;
2047	union xfs_btree_key	*max_hkey;
2048	union xfs_btree_key	*high;
2049	int			n;
2050
2051	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2052		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2053				cur->bc_ops->key_len / 2);
2054
2055		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2056		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2057			hkey = xfs_btree_high_key_addr(cur, n, block);
2058			if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2059				max_hkey = hkey;
2060		}
2061
2062		high = xfs_btree_high_key_from_key(cur, key);
2063		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2064	} else {
2065		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2066				cur->bc_ops->key_len);
2067	}
2068}
2069
2070/* Derive the keys for any btree block. */
2071void
2072xfs_btree_get_keys(
2073	struct xfs_btree_cur	*cur,
2074	struct xfs_btree_block	*block,
2075	union xfs_btree_key	*key)
2076{
2077	if (be16_to_cpu(block->bb_level) == 0)
2078		xfs_btree_get_leaf_keys(cur, block, key);
2079	else
2080		xfs_btree_get_node_keys(cur, block, key);
2081}
2082
2083/*
2084 * Decide if we need to update the parent keys of a btree block.  For
2085 * a standard btree this is only necessary if we're updating the first
2086 * record/key.  For an overlapping btree, we must always update the
2087 * keys because the highest key can be in any of the records or keys
2088 * in the block.
2089 */
2090static inline bool
2091xfs_btree_needs_key_update(
2092	struct xfs_btree_cur	*cur,
2093	int			ptr)
2094{
2095	return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2096}
2097
2098/*
2099 * Update the low and high parent keys of the given level, progressing
2100 * towards the root.  If force_all is false, stop if the keys for a given
2101 * level do not need updating.
2102 */
2103STATIC int
2104__xfs_btree_updkeys(
2105	struct xfs_btree_cur	*cur,
2106	int			level,
2107	struct xfs_btree_block	*block,
2108	struct xfs_buf		*bp0,
2109	bool			force_all)
2110{
2111	union xfs_btree_key	key;	/* keys from current level */
2112	union xfs_btree_key	*lkey;	/* keys from the next level up */
2113	union xfs_btree_key	*hkey;
2114	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2115	union xfs_btree_key	*nhkey;
2116	struct xfs_buf		*bp;
2117	int			ptr;
2118
2119	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2120
2121	/* Exit if there aren't any parent levels to update. */
2122	if (level + 1 >= cur->bc_nlevels)
2123		return 0;
2124
2125	trace_xfs_btree_updkeys(cur, level, bp0);
2126
2127	lkey = &key;
2128	hkey = xfs_btree_high_key_from_key(cur, lkey);
2129	xfs_btree_get_keys(cur, block, lkey);
2130	for (level++; level < cur->bc_nlevels; level++) {
2131#ifdef DEBUG
2132		int		error;
2133#endif
2134		block = xfs_btree_get_block(cur, level, &bp);
2135		trace_xfs_btree_updkeys(cur, level, bp);
2136#ifdef DEBUG
2137		error = xfs_btree_check_block(cur, block, level, bp);
2138		if (error)
2139			return error;
2140#endif
2141		ptr = cur->bc_ptrs[level];
2142		nlkey = xfs_btree_key_addr(cur, ptr, block);
2143		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2144		if (!force_all &&
2145		    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2146		      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2147			break;
2148		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2149		xfs_btree_log_keys(cur, bp, ptr, ptr);
2150		if (level + 1 >= cur->bc_nlevels)
2151			break;
2152		xfs_btree_get_node_keys(cur, block, lkey);
2153	}
2154
2155	return 0;
2156}
2157
2158/* Update all the keys from some level in cursor back to the root. */
2159STATIC int
2160xfs_btree_updkeys_force(
2161	struct xfs_btree_cur	*cur,
2162	int			level)
2163{
2164	struct xfs_buf		*bp;
2165	struct xfs_btree_block	*block;
2166
2167	block = xfs_btree_get_block(cur, level, &bp);
2168	return __xfs_btree_updkeys(cur, level, block, bp, true);
2169}
2170
2171/*
2172 * Update the parent keys of the given level, progressing towards the root.
2173 */
2174STATIC int
2175xfs_btree_update_keys(
2176	struct xfs_btree_cur	*cur,
2177	int			level)
2178{
2179	struct xfs_btree_block	*block;
2180	struct xfs_buf		*bp;
2181	union xfs_btree_key	*kp;
2182	union xfs_btree_key	key;
2183	int			ptr;
2184
2185	ASSERT(level >= 0);
2186
2187	block = xfs_btree_get_block(cur, level, &bp);
2188	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2189		return __xfs_btree_updkeys(cur, level, block, bp, false);
2190
2191	/*
2192	 * Go up the tree from this level toward the root.
2193	 * At each level, update the key value to the value input.
2194	 * Stop when we reach a level where the cursor isn't pointing
2195	 * at the first entry in the block.
2196	 */
2197	xfs_btree_get_keys(cur, block, &key);
2198	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2199#ifdef DEBUG
2200		int		error;
2201#endif
2202		block = xfs_btree_get_block(cur, level, &bp);
2203#ifdef DEBUG
2204		error = xfs_btree_check_block(cur, block, level, bp);
2205		if (error)
2206			return error;
2207#endif
2208		ptr = cur->bc_ptrs[level];
2209		kp = xfs_btree_key_addr(cur, ptr, block);
2210		xfs_btree_copy_keys(cur, kp, &key, 1);
2211		xfs_btree_log_keys(cur, bp, ptr, ptr);
2212	}
2213
2214	return 0;
2215}
2216
2217/*
2218 * Update the record referred to by cur to the value in the
2219 * given record. This either works (return 0) or gets an
2220 * EFSCORRUPTED error.
2221 */
2222int
2223xfs_btree_update(
2224	struct xfs_btree_cur	*cur,
2225	union xfs_btree_rec	*rec)
2226{
2227	struct xfs_btree_block	*block;
2228	struct xfs_buf		*bp;
2229	int			error;
2230	int			ptr;
2231	union xfs_btree_rec	*rp;
2232
2233	/* Pick up the current block. */
2234	block = xfs_btree_get_block(cur, 0, &bp);
2235
2236#ifdef DEBUG
2237	error = xfs_btree_check_block(cur, block, 0, bp);
2238	if (error)
2239		goto error0;
2240#endif
2241	/* Get the address of the rec to be updated. */
2242	ptr = cur->bc_ptrs[0];
2243	rp = xfs_btree_rec_addr(cur, ptr, block);
2244
2245	/* Fill in the new contents and log them. */
2246	xfs_btree_copy_recs(cur, rp, rec, 1);
2247	xfs_btree_log_recs(cur, bp, ptr, ptr);
2248
2249	/*
2250	 * If we are tracking the last record in the tree and
2251	 * we are at the far right edge of the tree, update it.
2252	 */
2253	if (xfs_btree_is_lastrec(cur, block, 0)) {
2254		cur->bc_ops->update_lastrec(cur, block, rec,
2255					    ptr, LASTREC_UPDATE);
2256	}
2257
2258	/* Pass new key value up to our parent. */
2259	if (xfs_btree_needs_key_update(cur, ptr)) {
2260		error = xfs_btree_update_keys(cur, 0);
2261		if (error)
2262			goto error0;
2263	}
2264
2265	return 0;
2266
2267error0:
2268	return error;
2269}
2270
2271/*
2272 * Move 1 record left from cur/level if possible.
2273 * Update cur to reflect the new path.
2274 */
2275STATIC int					/* error */
2276xfs_btree_lshift(
2277	struct xfs_btree_cur	*cur,
2278	int			level,
2279	int			*stat)		/* success/failure */
2280{
2281	struct xfs_buf		*lbp;		/* left buffer pointer */
2282	struct xfs_btree_block	*left;		/* left btree block */
2283	int			lrecs;		/* left record count */
2284	struct xfs_buf		*rbp;		/* right buffer pointer */
2285	struct xfs_btree_block	*right;		/* right btree block */
2286	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2287	int			rrecs;		/* right record count */
2288	union xfs_btree_ptr	lptr;		/* left btree pointer */
2289	union xfs_btree_key	*rkp = NULL;	/* right btree key */
2290	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
2291	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
2292	int			error;		/* error return value */
2293	int			i;
2294
2295	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2296	    level == cur->bc_nlevels - 1)
2297		goto out0;
2298
2299	/* Set up variables for this block as "right". */
2300	right = xfs_btree_get_block(cur, level, &rbp);
2301
2302#ifdef DEBUG
2303	error = xfs_btree_check_block(cur, right, level, rbp);
2304	if (error)
2305		goto error0;
2306#endif
2307
2308	/* If we've got no left sibling then we can't shift an entry left. */
2309	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2310	if (xfs_btree_ptr_is_null(cur, &lptr))
2311		goto out0;
2312
2313	/*
2314	 * If the cursor entry is the one that would be moved, don't
2315	 * do it... it's too complicated.
2316	 */
2317	if (cur->bc_ptrs[level] <= 1)
2318		goto out0;
2319
2320	/* Set up the left neighbor as "left". */
2321	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2322	if (error)
2323		goto error0;
2324
2325	/* If it's full, it can't take another entry. */
2326	lrecs = xfs_btree_get_numrecs(left);
2327	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2328		goto out0;
2329
2330	rrecs = xfs_btree_get_numrecs(right);
2331
2332	/*
2333	 * We add one entry to the left side and remove one for the right side.
2334	 * Account for it here, the changes will be updated on disk and logged
2335	 * later.
2336	 */
2337	lrecs++;
2338	rrecs--;
2339
2340	XFS_BTREE_STATS_INC(cur, lshift);
2341	XFS_BTREE_STATS_ADD(cur, moves, 1);
2342
2343	/*
2344	 * If non-leaf, copy a key and a ptr to the left block.
2345	 * Log the changes to the left block.
2346	 */
2347	if (level > 0) {
2348		/* It's a non-leaf.  Move keys and pointers. */
2349		union xfs_btree_key	*lkp;	/* left btree key */
2350		union xfs_btree_ptr	*lpp;	/* left address pointer */
2351
2352		lkp = xfs_btree_key_addr(cur, lrecs, left);
2353		rkp = xfs_btree_key_addr(cur, 1, right);
2354
2355		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2356		rpp = xfs_btree_ptr_addr(cur, 1, right);
2357#ifdef DEBUG
2358		error = xfs_btree_check_ptr(cur, rpp, 0, level);
2359		if (error)
2360			goto error0;
2361#endif
2362		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2363		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2364
2365		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2366		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2367
2368		ASSERT(cur->bc_ops->keys_inorder(cur,
2369			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2370	} else {
2371		/* It's a leaf.  Move records.  */
2372		union xfs_btree_rec	*lrp;	/* left record pointer */
2373
2374		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2375		rrp = xfs_btree_rec_addr(cur, 1, right);
2376
2377		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2378		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2379
2380		ASSERT(cur->bc_ops->recs_inorder(cur,
2381			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2382	}
2383
2384	xfs_btree_set_numrecs(left, lrecs);
2385	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2386
2387	xfs_btree_set_numrecs(right, rrecs);
2388	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2389
2390	/*
2391	 * Slide the contents of right down one entry.
2392	 */
2393	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2394	if (level > 0) {
2395		/* It's a nonleaf. operate on keys and ptrs */
2396#ifdef DEBUG
2397		int			i;		/* loop index */
2398
2399		for (i = 0; i < rrecs; i++) {
2400			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2401			if (error)
2402				goto error0;
2403		}
2404#endif
2405		xfs_btree_shift_keys(cur,
2406				xfs_btree_key_addr(cur, 2, right),
2407				-1, rrecs);
2408		xfs_btree_shift_ptrs(cur,
2409				xfs_btree_ptr_addr(cur, 2, right),
2410				-1, rrecs);
2411
2412		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2413		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2414	} else {
2415		/* It's a leaf. operate on records */
2416		xfs_btree_shift_recs(cur,
2417			xfs_btree_rec_addr(cur, 2, right),
2418			-1, rrecs);
2419		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2420	}
2421
2422	/*
2423	 * Using a temporary cursor, update the parent key values of the
2424	 * block on the left.
2425	 */
2426	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2427		error = xfs_btree_dup_cursor(cur, &tcur);
2428		if (error)
2429			goto error0;
2430		i = xfs_btree_firstrec(tcur, level);
2431		XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2432
2433		error = xfs_btree_decrement(tcur, level, &i);
2434		if (error)
2435			goto error1;
2436
2437		/* Update the parent high keys of the left block, if needed. */
2438		error = xfs_btree_update_keys(tcur, level);
2439		if (error)
2440			goto error1;
2441
2442		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2443	}
2444
2445	/* Update the parent keys of the right block. */
2446	error = xfs_btree_update_keys(cur, level);
2447	if (error)
2448		goto error0;
2449
2450	/* Slide the cursor value left one. */
2451	cur->bc_ptrs[level]--;
2452
2453	*stat = 1;
2454	return 0;
2455
2456out0:
2457	*stat = 0;
2458	return 0;
2459
2460error0:
2461	return error;
2462
2463error1:
2464	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2465	return error;
2466}
2467
2468/*
2469 * Move 1 record right from cur/level if possible.
2470 * Update cur to reflect the new path.
2471 */
2472STATIC int					/* error */
2473xfs_btree_rshift(
2474	struct xfs_btree_cur	*cur,
2475	int			level,
2476	int			*stat)		/* success/failure */
2477{
2478	struct xfs_buf		*lbp;		/* left buffer pointer */
2479	struct xfs_btree_block	*left;		/* left btree block */
2480	struct xfs_buf		*rbp;		/* right buffer pointer */
2481	struct xfs_btree_block	*right;		/* right btree block */
2482	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2483	union xfs_btree_ptr	rptr;		/* right block pointer */
2484	union xfs_btree_key	*rkp;		/* right btree key */
2485	int			rrecs;		/* right record count */
2486	int			lrecs;		/* left record count */
2487	int			error;		/* error return value */
2488	int			i;		/* loop counter */
2489
2490	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2491	    (level == cur->bc_nlevels - 1))
2492		goto out0;
2493
2494	/* Set up variables for this block as "left". */
2495	left = xfs_btree_get_block(cur, level, &lbp);
2496
2497#ifdef DEBUG
2498	error = xfs_btree_check_block(cur, left, level, lbp);
2499	if (error)
2500		goto error0;
2501#endif
2502
2503	/* If we've got no right sibling then we can't shift an entry right. */
2504	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2505	if (xfs_btree_ptr_is_null(cur, &rptr))
2506		goto out0;
2507
2508	/*
2509	 * If the cursor entry is the one that would be moved, don't
2510	 * do it... it's too complicated.
2511	 */
2512	lrecs = xfs_btree_get_numrecs(left);
2513	if (cur->bc_ptrs[level] >= lrecs)
2514		goto out0;
2515
2516	/* Set up the right neighbor as "right". */
2517	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2518	if (error)
2519		goto error0;
2520
2521	/* If it's full, it can't take another entry. */
2522	rrecs = xfs_btree_get_numrecs(right);
2523	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2524		goto out0;
2525
2526	XFS_BTREE_STATS_INC(cur, rshift);
2527	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2528
2529	/*
2530	 * Make a hole at the start of the right neighbor block, then
2531	 * copy the last left block entry to the hole.
2532	 */
2533	if (level > 0) {
2534		/* It's a nonleaf. make a hole in the keys and ptrs */
2535		union xfs_btree_key	*lkp;
2536		union xfs_btree_ptr	*lpp;
2537		union xfs_btree_ptr	*rpp;
2538
2539		lkp = xfs_btree_key_addr(cur, lrecs, left);
2540		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2541		rkp = xfs_btree_key_addr(cur, 1, right);
2542		rpp = xfs_btree_ptr_addr(cur, 1, right);
2543
2544#ifdef DEBUG
2545		for (i = rrecs - 1; i >= 0; i--) {
2546			error = xfs_btree_check_ptr(cur, rpp, i, level);
2547			if (error)
2548				goto error0;
2549		}
2550#endif
2551
2552		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2553		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2554
2555#ifdef DEBUG
2556		error = xfs_btree_check_ptr(cur, lpp, 0, level);
2557		if (error)
2558			goto error0;
2559#endif
2560
2561		/* Now put the new data in, and log it. */
2562		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2563		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2564
2565		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2566		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2567
2568		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2569			xfs_btree_key_addr(cur, 2, right)));
2570	} else {
2571		/* It's a leaf. make a hole in the records */
2572		union xfs_btree_rec	*lrp;
2573		union xfs_btree_rec	*rrp;
2574
2575		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2576		rrp = xfs_btree_rec_addr(cur, 1, right);
2577
2578		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2579
2580		/* Now put the new data in, and log it. */
2581		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2582		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2583	}
2584
2585	/*
2586	 * Decrement and log left's numrecs, bump and log right's numrecs.
2587	 */
2588	xfs_btree_set_numrecs(left, --lrecs);
2589	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2590
2591	xfs_btree_set_numrecs(right, ++rrecs);
2592	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2593
2594	/*
2595	 * Using a temporary cursor, update the parent key values of the
2596	 * block on the right.
2597	 */
2598	error = xfs_btree_dup_cursor(cur, &tcur);
2599	if (error)
2600		goto error0;
2601	i = xfs_btree_lastrec(tcur, level);
2602	XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2603
2604	error = xfs_btree_increment(tcur, level, &i);
2605	if (error)
2606		goto error1;
2607
2608	/* Update the parent high keys of the left block, if needed. */
2609	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2610		error = xfs_btree_update_keys(cur, level);
2611		if (error)
2612			goto error1;
2613	}
2614
2615	/* Update the parent keys of the right block. */
2616	error = xfs_btree_update_keys(tcur, level);
2617	if (error)
2618		goto error1;
2619
2620	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2621
2622	*stat = 1;
2623	return 0;
2624
2625out0:
2626	*stat = 0;
2627	return 0;
2628
2629error0:
2630	return error;
2631
2632error1:
2633	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2634	return error;
2635}
2636
2637/*
2638 * Split cur/level block in half.
2639 * Return new block number and the key to its first
2640 * record (to be inserted into parent).
2641 */
2642STATIC int					/* error */
2643__xfs_btree_split(
2644	struct xfs_btree_cur	*cur,
2645	int			level,
2646	union xfs_btree_ptr	*ptrp,
2647	union xfs_btree_key	*key,
2648	struct xfs_btree_cur	**curp,
2649	int			*stat)		/* success/failure */
2650{
2651	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2652	struct xfs_buf		*lbp;		/* left buffer pointer */
2653	struct xfs_btree_block	*left;		/* left btree block */
2654	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2655	struct xfs_buf		*rbp;		/* right buffer pointer */
2656	struct xfs_btree_block	*right;		/* right btree block */
2657	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2658	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2659	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2660	int			lrecs;
2661	int			rrecs;
2662	int			src_index;
2663	int			error;		/* error return value */
2664#ifdef DEBUG
2665	int			i;
2666#endif
2667
2668	XFS_BTREE_STATS_INC(cur, split);
2669
2670	/* Set up left block (current one). */
2671	left = xfs_btree_get_block(cur, level, &lbp);
2672
2673#ifdef DEBUG
2674	error = xfs_btree_check_block(cur, left, level, lbp);
2675	if (error)
2676		goto error0;
2677#endif
2678
2679	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2680
2681	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2682	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2683	if (error)
2684		goto error0;
2685	if (*stat == 0)
2686		goto out0;
2687	XFS_BTREE_STATS_INC(cur, alloc);
2688
2689	/* Set up the new block as "right". */
2690	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2691	if (error)
2692		goto error0;
2693
2694	/* Fill in the btree header for the new right block. */
2695	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2696
2697	/*
2698	 * Split the entries between the old and the new block evenly.
2699	 * Make sure that if there's an odd number of entries now, that
2700	 * each new block will have the same number of entries.
2701	 */
2702	lrecs = xfs_btree_get_numrecs(left);
2703	rrecs = lrecs / 2;
2704	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2705		rrecs++;
2706	src_index = (lrecs - rrecs + 1);
2707
2708	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2709
2710	/* Adjust numrecs for the later get_*_keys() calls. */
2711	lrecs -= rrecs;
2712	xfs_btree_set_numrecs(left, lrecs);
2713	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2714
2715	/*
2716	 * Copy btree block entries from the left block over to the
2717	 * new block, the right. Update the right block and log the
2718	 * changes.
2719	 */
2720	if (level > 0) {
2721		/* It's a non-leaf.  Move keys and pointers. */
2722		union xfs_btree_key	*lkp;	/* left btree key */
2723		union xfs_btree_ptr	*lpp;	/* left address pointer */
2724		union xfs_btree_key	*rkp;	/* right btree key */
2725		union xfs_btree_ptr	*rpp;	/* right address pointer */
2726
2727		lkp = xfs_btree_key_addr(cur, src_index, left);
2728		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2729		rkp = xfs_btree_key_addr(cur, 1, right);
2730		rpp = xfs_btree_ptr_addr(cur, 1, right);
2731
2732#ifdef DEBUG
2733		for (i = src_index; i < rrecs; i++) {
2734			error = xfs_btree_check_ptr(cur, lpp, i, level);
2735			if (error)
2736				goto error0;
2737		}
2738#endif
2739
2740		/* Copy the keys & pointers to the new block. */
2741		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2742		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2743
2744		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2745		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2746
2747		/* Stash the keys of the new block for later insertion. */
2748		xfs_btree_get_node_keys(cur, right, key);
2749	} else {
2750		/* It's a leaf.  Move records.  */
2751		union xfs_btree_rec	*lrp;	/* left record pointer */
2752		union xfs_btree_rec	*rrp;	/* right record pointer */
2753
2754		lrp = xfs_btree_rec_addr(cur, src_index, left);
2755		rrp = xfs_btree_rec_addr(cur, 1, right);
2756
2757		/* Copy records to the new block. */
2758		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2759		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2760
2761		/* Stash the keys of the new block for later insertion. */
2762		xfs_btree_get_leaf_keys(cur, right, key);
2763	}
2764
2765	/*
2766	 * Find the left block number by looking in the buffer.
2767	 * Adjust sibling pointers.
2768	 */
2769	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2770	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2771	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2772	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2773
2774	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2775	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2776
2777	/*
2778	 * If there's a block to the new block's right, make that block
2779	 * point back to right instead of to left.
2780	 */
2781	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2782		error = xfs_btree_read_buf_block(cur, &rrptr,
2783							0, &rrblock, &rrbp);
2784		if (error)
2785			goto error0;
2786		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2787		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2788	}
2789
2790	/* Update the parent high keys of the left block, if needed. */
2791	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2792		error = xfs_btree_update_keys(cur, level);
2793		if (error)
2794			goto error0;
2795	}
2796
2797	/*
2798	 * If the cursor is really in the right block, move it there.
2799	 * If it's just pointing past the last entry in left, then we'll
2800	 * insert there, so don't change anything in that case.
2801	 */
2802	if (cur->bc_ptrs[level] > lrecs + 1) {
2803		xfs_btree_setbuf(cur, level, rbp);
2804		cur->bc_ptrs[level] -= lrecs;
2805	}
2806	/*
2807	 * If there are more levels, we'll need another cursor which refers
2808	 * the right block, no matter where this cursor was.
2809	 */
2810	if (level + 1 < cur->bc_nlevels) {
2811		error = xfs_btree_dup_cursor(cur, curp);
2812		if (error)
2813			goto error0;
2814		(*curp)->bc_ptrs[level + 1]++;
2815	}
2816	*ptrp = rptr;
2817	*stat = 1;
2818	return 0;
2819out0:
2820	*stat = 0;
2821	return 0;
2822
2823error0:
2824	return error;
2825}
2826
2827struct xfs_btree_split_args {
2828	struct xfs_btree_cur	*cur;
2829	int			level;
2830	union xfs_btree_ptr	*ptrp;
2831	union xfs_btree_key	*key;
2832	struct xfs_btree_cur	**curp;
2833	int			*stat;		/* success/failure */
2834	int			result;
2835	bool			kswapd;	/* allocation in kswapd context */
2836	struct completion	*done;
2837	struct work_struct	work;
2838};
2839
2840/*
2841 * Stack switching interfaces for allocation
2842 */
2843static void
2844xfs_btree_split_worker(
2845	struct work_struct	*work)
2846{
2847	struct xfs_btree_split_args	*args = container_of(work,
2848						struct xfs_btree_split_args, work);
2849	unsigned long		pflags;
2850	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
2851
2852	/*
2853	 * we are in a transaction context here, but may also be doing work
2854	 * in kswapd context, and hence we may need to inherit that state
2855	 * temporarily to ensure that we don't block waiting for memory reclaim
2856	 * in any way.
2857	 */
2858	if (args->kswapd)
2859		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2860
2861	current_set_flags_nested(&pflags, new_pflags);
2862
2863	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2864					 args->key, args->curp, args->stat);
2865	complete(args->done);
2866
2867	current_restore_flags_nested(&pflags, new_pflags);
2868}
2869
2870/*
2871 * BMBT split requests often come in with little stack to work on. Push
2872 * them off to a worker thread so there is lots of stack to use. For the other
2873 * btree types, just call directly to avoid the context switch overhead here.
2874 */
2875STATIC int					/* error */
2876xfs_btree_split(
2877	struct xfs_btree_cur	*cur,
2878	int			level,
2879	union xfs_btree_ptr	*ptrp,
2880	union xfs_btree_key	*key,
2881	struct xfs_btree_cur	**curp,
2882	int			*stat)		/* success/failure */
2883{
2884	struct xfs_btree_split_args	args;
2885	DECLARE_COMPLETION_ONSTACK(done);
2886
2887	if (cur->bc_btnum != XFS_BTNUM_BMAP)
2888		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2889
2890	args.cur = cur;
2891	args.level = level;
2892	args.ptrp = ptrp;
2893	args.key = key;
2894	args.curp = curp;
2895	args.stat = stat;
2896	args.done = &done;
2897	args.kswapd = current_is_kswapd();
2898	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2899	queue_work(xfs_alloc_wq, &args.work);
2900	wait_for_completion(&done);
2901	destroy_work_on_stack(&args.work);
2902	return args.result;
2903}
2904
2905
2906/*
2907 * Copy the old inode root contents into a real block and make the
2908 * broot point to it.
2909 */
2910int						/* error */
2911xfs_btree_new_iroot(
2912	struct xfs_btree_cur	*cur,		/* btree cursor */
2913	int			*logflags,	/* logging flags for inode */
2914	int			*stat)		/* return status - 0 fail */
2915{
2916	struct xfs_buf		*cbp;		/* buffer for cblock */
2917	struct xfs_btree_block	*block;		/* btree block */
2918	struct xfs_btree_block	*cblock;	/* child btree block */
2919	union xfs_btree_key	*ckp;		/* child key pointer */
2920	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
2921	union xfs_btree_key	*kp;		/* pointer to btree key */
2922	union xfs_btree_ptr	*pp;		/* pointer to block addr */
2923	union xfs_btree_ptr	nptr;		/* new block addr */
2924	int			level;		/* btree level */
2925	int			error;		/* error return code */
2926#ifdef DEBUG
2927	int			i;		/* loop counter */
2928#endif
2929
2930	XFS_BTREE_STATS_INC(cur, newroot);
2931
2932	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2933
2934	level = cur->bc_nlevels - 1;
2935
2936	block = xfs_btree_get_iroot(cur);
2937	pp = xfs_btree_ptr_addr(cur, 1, block);
2938
2939	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2940	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2941	if (error)
2942		goto error0;
2943	if (*stat == 0)
2944		return 0;
2945
2946	XFS_BTREE_STATS_INC(cur, alloc);
2947
2948	/* Copy the root into a real block. */
2949	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2950	if (error)
2951		goto error0;
2952
2953	/*
2954	 * we can't just memcpy() the root in for CRC enabled btree blocks.
2955	 * In that case have to also ensure the blkno remains correct
2956	 */
2957	memcpy(cblock, block, xfs_btree_block_len(cur));
2958	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2959		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2960			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2961		else
2962			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2963	}
2964
2965	be16_add_cpu(&block->bb_level, 1);
2966	xfs_btree_set_numrecs(block, 1);
2967	cur->bc_nlevels++;
2968	cur->bc_ptrs[level + 1] = 1;
2969
2970	kp = xfs_btree_key_addr(cur, 1, block);
2971	ckp = xfs_btree_key_addr(cur, 1, cblock);
2972	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2973
2974	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2975#ifdef DEBUG
2976	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2977		error = xfs_btree_check_ptr(cur, pp, i, level);
2978		if (error)
2979			goto error0;
2980	}
2981#endif
2982	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2983
2984#ifdef DEBUG
2985	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2986	if (error)
2987		goto error0;
2988#endif
2989	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2990
2991	xfs_iroot_realloc(cur->bc_private.b.ip,
2992			  1 - xfs_btree_get_numrecs(cblock),
2993			  cur->bc_private.b.whichfork);
2994
2995	xfs_btree_setbuf(cur, level, cbp);
2996
2997	/*
2998	 * Do all this logging at the end so that
2999	 * the root is at the right level.
3000	 */
3001	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3002	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3003	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3004
3005	*logflags |=
3006		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3007	*stat = 1;
3008	return 0;
3009error0:
3010	return error;
3011}
3012
3013/*
3014 * Allocate a new root block, fill it in.
3015 */
3016STATIC int				/* error */
3017xfs_btree_new_root(
3018	struct xfs_btree_cur	*cur,	/* btree cursor */
3019	int			*stat)	/* success/failure */
3020{
3021	struct xfs_btree_block	*block;	/* one half of the old root block */
3022	struct xfs_buf		*bp;	/* buffer containing block */
3023	int			error;	/* error return value */
3024	struct xfs_buf		*lbp;	/* left buffer pointer */
3025	struct xfs_btree_block	*left;	/* left btree block */
3026	struct xfs_buf		*nbp;	/* new (root) buffer */
3027	struct xfs_btree_block	*new;	/* new (root) btree block */
3028	int			nptr;	/* new value for key index, 1 or 2 */
3029	struct xfs_buf		*rbp;	/* right buffer pointer */
3030	struct xfs_btree_block	*right;	/* right btree block */
3031	union xfs_btree_ptr	rptr;
3032	union xfs_btree_ptr	lptr;
3033
3034	XFS_BTREE_STATS_INC(cur, newroot);
3035
3036	/* initialise our start point from the cursor */
3037	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3038
3039	/* Allocate the new block. If we can't do it, we're toast. Give up. */
3040	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3041	if (error)
3042		goto error0;
3043	if (*stat == 0)
3044		goto out0;
3045	XFS_BTREE_STATS_INC(cur, alloc);
3046
3047	/* Set up the new block. */
3048	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
3049	if (error)
3050		goto error0;
3051
3052	/* Set the root in the holding structure  increasing the level by 1. */
3053	cur->bc_ops->set_root(cur, &lptr, 1);
3054
3055	/*
3056	 * At the previous root level there are now two blocks: the old root,
3057	 * and the new block generated when it was split.  We don't know which
3058	 * one the cursor is pointing at, so we set up variables "left" and
3059	 * "right" for each case.
3060	 */
3061	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3062
3063#ifdef DEBUG
3064	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3065	if (error)
3066		goto error0;
3067#endif
3068
3069	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3070	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3071		/* Our block is left, pick up the right block. */
3072		lbp = bp;
3073		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3074		left = block;
3075		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3076		if (error)
3077			goto error0;
3078		bp = rbp;
3079		nptr = 1;
3080	} else {
3081		/* Our block is right, pick up the left block. */
3082		rbp = bp;
3083		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3084		right = block;
3085		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3086		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3087		if (error)
3088			goto error0;
3089		bp = lbp;
3090		nptr = 2;
3091	}
3092
3093	/* Fill in the new block's btree header and log it. */
3094	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3095	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3096	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3097			!xfs_btree_ptr_is_null(cur, &rptr));
3098
3099	/* Fill in the key data in the new root. */
3100	if (xfs_btree_get_level(left) > 0) {
3101		/*
3102		 * Get the keys for the left block's keys and put them directly
3103		 * in the parent block.  Do the same for the right block.
3104		 */
3105		xfs_btree_get_node_keys(cur, left,
3106				xfs_btree_key_addr(cur, 1, new));
3107		xfs_btree_get_node_keys(cur, right,
3108				xfs_btree_key_addr(cur, 2, new));
3109	} else {
3110		/*
3111		 * Get the keys for the left block's records and put them
3112		 * directly in the parent block.  Do the same for the right
3113		 * block.
3114		 */
3115		xfs_btree_get_leaf_keys(cur, left,
3116			xfs_btree_key_addr(cur, 1, new));
3117		xfs_btree_get_leaf_keys(cur, right,
3118			xfs_btree_key_addr(cur, 2, new));
3119	}
3120	xfs_btree_log_keys(cur, nbp, 1, 2);
3121
3122	/* Fill in the pointer data in the new root. */
3123	xfs_btree_copy_ptrs(cur,
3124		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3125	xfs_btree_copy_ptrs(cur,
3126		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3127	xfs_btree_log_ptrs(cur, nbp, 1, 2);
3128
3129	/* Fix up the cursor. */
3130	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3131	cur->bc_ptrs[cur->bc_nlevels] = nptr;
3132	cur->bc_nlevels++;
3133	*stat = 1;
3134	return 0;
3135error0:
3136	return error;
3137out0:
3138	*stat = 0;
3139	return 0;
3140}
3141
3142STATIC int
3143xfs_btree_make_block_unfull(
3144	struct xfs_btree_cur	*cur,	/* btree cursor */
3145	int			level,	/* btree level */
3146	int			numrecs,/* # of recs in block */
3147	int			*oindex,/* old tree index */
3148	int			*index,	/* new tree index */
3149	union xfs_btree_ptr	*nptr,	/* new btree ptr */
3150	struct xfs_btree_cur	**ncur,	/* new btree cursor */
3151	union xfs_btree_key	*key,	/* key of new block */
3152	int			*stat)
3153{
3154	int			error = 0;
3155
3156	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3157	    level == cur->bc_nlevels - 1) {
3158		struct xfs_inode *ip = cur->bc_private.b.ip;
3159
3160		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3161			/* A root block that can be made bigger. */
3162			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3163			*stat = 1;
3164		} else {
3165			/* A root block that needs replacing */
3166			int	logflags = 0;
3167
3168			error = xfs_btree_new_iroot(cur, &logflags, stat);
3169			if (error || *stat == 0)
3170				return error;
3171
3172			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3173		}
3174
3175		return 0;
3176	}
3177
3178	/* First, try shifting an entry to the right neighbor. */
3179	error = xfs_btree_rshift(cur, level, stat);
3180	if (error || *stat)
3181		return error;
3182
3183	/* Next, try shifting an entry to the left neighbor. */
3184	error = xfs_btree_lshift(cur, level, stat);
3185	if (error)
3186		return error;
3187
3188	if (*stat) {
3189		*oindex = *index = cur->bc_ptrs[level];
3190		return 0;
3191	}
3192
3193	/*
3194	 * Next, try splitting the current block in half.
3195	 *
3196	 * If this works we have to re-set our variables because we
3197	 * could be in a different block now.
3198	 */
3199	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3200	if (error || *stat == 0)
3201		return error;
3202
3203
3204	*index = cur->bc_ptrs[level];
3205	return 0;
3206}
3207
3208/*
3209 * Insert one record/level.  Return information to the caller
3210 * allowing the next level up to proceed if necessary.
3211 */
3212STATIC int
3213xfs_btree_insrec(
3214	struct xfs_btree_cur	*cur,	/* btree cursor */
3215	int			level,	/* level to insert record at */
3216	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
3217	union xfs_btree_rec	*rec,	/* record to insert */
3218	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
3219	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
3220	int			*stat)	/* success/failure */
3221{
3222	struct xfs_btree_block	*block;	/* btree block */
3223	struct xfs_buf		*bp;	/* buffer for block */
3224	union xfs_btree_ptr	nptr;	/* new block ptr */
3225	struct xfs_btree_cur	*ncur;	/* new btree cursor */
3226	union xfs_btree_key	nkey;	/* new block key */
3227	union xfs_btree_key	*lkey;
3228	int			optr;	/* old key/record index */
3229	int			ptr;	/* key/record index */
3230	int			numrecs;/* number of records */
3231	int			error;	/* error return value */
3232#ifdef DEBUG
3233	int			i;
3234#endif
3235	xfs_daddr_t		old_bn;
3236
3237	ncur = NULL;
3238	lkey = &nkey;
3239
3240	/*
3241	 * If we have an external root pointer, and we've made it to the
3242	 * root level, allocate a new root block and we're done.
3243	 */
3244	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3245	    (level >= cur->bc_nlevels)) {
3246		error = xfs_btree_new_root(cur, stat);
3247		xfs_btree_set_ptr_null(cur, ptrp);
3248
3249		return error;
3250	}
3251
3252	/* If we're off the left edge, return failure. */
3253	ptr = cur->bc_ptrs[level];
3254	if (ptr == 0) {
3255		*stat = 0;
3256		return 0;
3257	}
3258
3259	optr = ptr;
3260
3261	XFS_BTREE_STATS_INC(cur, insrec);
3262
3263	/* Get pointers to the btree buffer and block. */
3264	block = xfs_btree_get_block(cur, level, &bp);
3265	old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3266	numrecs = xfs_btree_get_numrecs(block);
3267
3268#ifdef DEBUG
3269	error = xfs_btree_check_block(cur, block, level, bp);
3270	if (error)
3271		goto error0;
3272
3273	/* Check that the new entry is being inserted in the right place. */
3274	if (ptr <= numrecs) {
3275		if (level == 0) {
3276			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3277				xfs_btree_rec_addr(cur, ptr, block)));
3278		} else {
3279			ASSERT(cur->bc_ops->keys_inorder(cur, key,
3280				xfs_btree_key_addr(cur, ptr, block)));
3281		}
3282	}
3283#endif
3284
3285	/*
3286	 * If the block is full, we can't insert the new entry until we
3287	 * make the block un-full.
3288	 */
3289	xfs_btree_set_ptr_null(cur, &nptr);
3290	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3291		error = xfs_btree_make_block_unfull(cur, level, numrecs,
3292					&optr, &ptr, &nptr, &ncur, lkey, stat);
3293		if (error || *stat == 0)
3294			goto error0;
3295	}
3296
3297	/*
3298	 * The current block may have changed if the block was
3299	 * previously full and we have just made space in it.
3300	 */
3301	block = xfs_btree_get_block(cur, level, &bp);
3302	numrecs = xfs_btree_get_numrecs(block);
3303
3304#ifdef DEBUG
3305	error = xfs_btree_check_block(cur, block, level, bp);
3306	if (error)
3307		return error;
3308#endif
3309
3310	/*
3311	 * At this point we know there's room for our new entry in the block
3312	 * we're pointing at.
3313	 */
3314	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3315
3316	if (level > 0) {
3317		/* It's a nonleaf. make a hole in the keys and ptrs */
3318		union xfs_btree_key	*kp;
3319		union xfs_btree_ptr	*pp;
3320
3321		kp = xfs_btree_key_addr(cur, ptr, block);
3322		pp = xfs_btree_ptr_addr(cur, ptr, block);
3323
3324#ifdef DEBUG
3325		for (i = numrecs - ptr; i >= 0; i--) {
3326			error = xfs_btree_check_ptr(cur, pp, i, level);
3327			if (error)
3328				return error;
3329		}
3330#endif
3331
3332		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3333		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3334
3335#ifdef DEBUG
3336		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3337		if (error)
3338			goto error0;
3339#endif
3340
3341		/* Now put the new data in, bump numrecs and log it. */
3342		xfs_btree_copy_keys(cur, kp, key, 1);
3343		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3344		numrecs++;
3345		xfs_btree_set_numrecs(block, numrecs);
3346		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3347		xfs_btree_log_keys(cur, bp, ptr, numrecs);
3348#ifdef DEBUG
3349		if (ptr < numrecs) {
3350			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3351				xfs_btree_key_addr(cur, ptr + 1, block)));
3352		}
3353#endif
3354	} else {
3355		/* It's a leaf. make a hole in the records */
3356		union xfs_btree_rec             *rp;
3357
3358		rp = xfs_btree_rec_addr(cur, ptr, block);
3359
3360		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3361
3362		/* Now put the new data in, bump numrecs and log it. */
3363		xfs_btree_copy_recs(cur, rp, rec, 1);
3364		xfs_btree_set_numrecs(block, ++numrecs);
3365		xfs_btree_log_recs(cur, bp, ptr, numrecs);
3366#ifdef DEBUG
3367		if (ptr < numrecs) {
3368			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3369				xfs_btree_rec_addr(cur, ptr + 1, block)));
3370		}
3371#endif
3372	}
3373
3374	/* Log the new number of records in the btree header. */
3375	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3376
3377	/*
3378	 * If we just inserted into a new tree block, we have to
3379	 * recalculate nkey here because nkey is out of date.
3380	 *
3381	 * Otherwise we're just updating an existing block (having shoved
3382	 * some records into the new tree block), so use the regular key
3383	 * update mechanism.
3384	 */
3385	if (bp && bp->b_bn != old_bn) {
3386		xfs_btree_get_keys(cur, block, lkey);
3387	} else if (xfs_btree_needs_key_update(cur, optr)) {
3388		error = xfs_btree_update_keys(cur, level);
3389		if (error)
3390			goto error0;
3391	}
3392
3393	/*
3394	 * If we are tracking the last record in the tree and
3395	 * we are at the far right edge of the tree, update it.
3396	 */
3397	if (xfs_btree_is_lastrec(cur, block, level)) {
3398		cur->bc_ops->update_lastrec(cur, block, rec,
3399					    ptr, LASTREC_INSREC);
3400	}
3401
3402	/*
3403	 * Return the new block number, if any.
3404	 * If there is one, give back a record value and a cursor too.
3405	 */
3406	*ptrp = nptr;
3407	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3408		xfs_btree_copy_keys(cur, key, lkey, 1);
3409		*curp = ncur;
3410	}
3411
3412	*stat = 1;
3413	return 0;
3414
3415error0:
3416	return error;
3417}
3418
3419/*
3420 * Insert the record at the point referenced by cur.
3421 *
3422 * A multi-level split of the tree on insert will invalidate the original
3423 * cursor.  All callers of this function should assume that the cursor is
3424 * no longer valid and revalidate it.
3425 */
3426int
3427xfs_btree_insert(
3428	struct xfs_btree_cur	*cur,
3429	int			*stat)
3430{
3431	int			error;	/* error return value */
3432	int			i;	/* result value, 0 for failure */
3433	int			level;	/* current level number in btree */
3434	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3435	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3436	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3437	union xfs_btree_key	bkey;	/* key of block to insert */
3438	union xfs_btree_key	*key;
3439	union xfs_btree_rec	rec;	/* record to insert */
3440
3441	level = 0;
3442	ncur = NULL;
3443	pcur = cur;
3444	key = &bkey;
3445
3446	xfs_btree_set_ptr_null(cur, &nptr);
3447
3448	/* Make a key out of the record data to be inserted, and save it. */
3449	cur->bc_ops->init_rec_from_cur(cur, &rec);
3450	cur->bc_ops->init_key_from_rec(key, &rec);
3451
3452	/*
3453	 * Loop going up the tree, starting at the leaf level.
3454	 * Stop when we don't get a split block, that must mean that
3455	 * the insert is finished with this level.
3456	 */
3457	do {
3458		/*
3459		 * Insert nrec/nptr into this level of the tree.
3460		 * Note if we fail, nptr will be null.
3461		 */
3462		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3463				&ncur, &i);
3464		if (error) {
3465			if (pcur != cur)
3466				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3467			goto error0;
3468		}
3469
3470		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3471		level++;
3472
3473		/*
3474		 * See if the cursor we just used is trash.
3475		 * Can't trash the caller's cursor, but otherwise we should
3476		 * if ncur is a new cursor or we're about to be done.
3477		 */
3478		if (pcur != cur &&
3479		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3480			/* Save the state from the cursor before we trash it */
3481			if (cur->bc_ops->update_cursor)
3482				cur->bc_ops->update_cursor(pcur, cur);
3483			cur->bc_nlevels = pcur->bc_nlevels;
3484			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3485		}
3486		/* If we got a new cursor, switch to it. */
3487		if (ncur) {
3488			pcur = ncur;
3489			ncur = NULL;
3490		}
3491	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3492
3493	*stat = i;
3494	return 0;
3495error0:
3496	return error;
3497}
3498
3499/*
3500 * Try to merge a non-leaf block back into the inode root.
3501 *
3502 * Note: the killroot names comes from the fact that we're effectively
3503 * killing the old root block.  But because we can't just delete the
3504 * inode we have to copy the single block it was pointing to into the
3505 * inode.
3506 */
3507STATIC int
3508xfs_btree_kill_iroot(
3509	struct xfs_btree_cur	*cur)
3510{
3511	int			whichfork = cur->bc_private.b.whichfork;
3512	struct xfs_inode	*ip = cur->bc_private.b.ip;
3513	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
3514	struct xfs_btree_block	*block;
3515	struct xfs_btree_block	*cblock;
3516	union xfs_btree_key	*kp;
3517	union xfs_btree_key	*ckp;
3518	union xfs_btree_ptr	*pp;
3519	union xfs_btree_ptr	*cpp;
3520	struct xfs_buf		*cbp;
3521	int			level;
3522	int			index;
3523	int			numrecs;
3524	int			error;
3525#ifdef DEBUG
3526	union xfs_btree_ptr	ptr;
3527	int			i;
3528#endif
 
3529
3530	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3531	ASSERT(cur->bc_nlevels > 1);
3532
3533	/*
3534	 * Don't deal with the root block needs to be a leaf case.
3535	 * We're just going to turn the thing back into extents anyway.
3536	 */
3537	level = cur->bc_nlevels - 1;
3538	if (level == 1)
3539		goto out0;
3540
3541	/*
3542	 * Give up if the root has multiple children.
3543	 */
3544	block = xfs_btree_get_iroot(cur);
3545	if (xfs_btree_get_numrecs(block) != 1)
3546		goto out0;
3547
3548	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3549	numrecs = xfs_btree_get_numrecs(cblock);
3550
3551	/*
3552	 * Only do this if the next level will fit.
3553	 * Then the data must be copied up to the inode,
3554	 * instead of freeing the root you free the next level.
3555	 */
3556	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3557		goto out0;
3558
3559	XFS_BTREE_STATS_INC(cur, killroot);
3560
3561#ifdef DEBUG
3562	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3563	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3564	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3565	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3566#endif
3567
3568	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3569	if (index) {
3570		xfs_iroot_realloc(cur->bc_private.b.ip, index,
3571				  cur->bc_private.b.whichfork);
3572		block = ifp->if_broot;
3573	}
3574
3575	be16_add_cpu(&block->bb_numrecs, index);
3576	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3577
3578	kp = xfs_btree_key_addr(cur, 1, block);
3579	ckp = xfs_btree_key_addr(cur, 1, cblock);
3580	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3581
3582	pp = xfs_btree_ptr_addr(cur, 1, block);
3583	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3584#ifdef DEBUG
3585	for (i = 0; i < numrecs; i++) {
3586		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3587		if (error)
3588			return error;
3589	}
3590#endif
3591	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3592
3593	error = xfs_btree_free_block(cur, cbp);
3594	if (error)
3595		return error;
3596
3597	cur->bc_bufs[level - 1] = NULL;
3598	be16_add_cpu(&block->bb_level, -1);
3599	xfs_trans_log_inode(cur->bc_tp, ip,
3600		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3601	cur->bc_nlevels--;
3602out0:
3603	return 0;
3604}
3605
3606/*
3607 * Kill the current root node, and replace it with it's only child node.
3608 */
3609STATIC int
3610xfs_btree_kill_root(
3611	struct xfs_btree_cur	*cur,
3612	struct xfs_buf		*bp,
3613	int			level,
3614	union xfs_btree_ptr	*newroot)
3615{
3616	int			error;
3617
3618	XFS_BTREE_STATS_INC(cur, killroot);
3619
3620	/*
3621	 * Update the root pointer, decreasing the level by 1 and then
3622	 * free the old root.
3623	 */
3624	cur->bc_ops->set_root(cur, newroot, -1);
3625
3626	error = xfs_btree_free_block(cur, bp);
3627	if (error)
3628		return error;
3629
3630	cur->bc_bufs[level] = NULL;
3631	cur->bc_ra[level] = 0;
3632	cur->bc_nlevels--;
3633
3634	return 0;
3635}
3636
3637STATIC int
3638xfs_btree_dec_cursor(
3639	struct xfs_btree_cur	*cur,
3640	int			level,
3641	int			*stat)
3642{
3643	int			error;
3644	int			i;
3645
3646	if (level > 0) {
3647		error = xfs_btree_decrement(cur, level, &i);
3648		if (error)
3649			return error;
3650	}
3651
3652	*stat = 1;
3653	return 0;
3654}
3655
3656/*
3657 * Single level of the btree record deletion routine.
3658 * Delete record pointed to by cur/level.
3659 * Remove the record from its block then rebalance the tree.
3660 * Return 0 for error, 1 for done, 2 to go on to the next level.
3661 */
3662STATIC int					/* error */
3663xfs_btree_delrec(
3664	struct xfs_btree_cur	*cur,		/* btree cursor */
3665	int			level,		/* level removing record from */
3666	int			*stat)		/* fail/done/go-on */
3667{
3668	struct xfs_btree_block	*block;		/* btree block */
3669	union xfs_btree_ptr	cptr;		/* current block ptr */
3670	struct xfs_buf		*bp;		/* buffer for block */
3671	int			error;		/* error return value */
3672	int			i;		/* loop counter */
3673	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
3674	struct xfs_buf		*lbp;		/* left buffer pointer */
3675	struct xfs_btree_block	*left;		/* left btree block */
3676	int			lrecs = 0;	/* left record count */
3677	int			ptr;		/* key/record index */
3678	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
3679	struct xfs_buf		*rbp;		/* right buffer pointer */
3680	struct xfs_btree_block	*right;		/* right btree block */
3681	struct xfs_btree_block	*rrblock;	/* right-right btree block */
3682	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
3683	int			rrecs = 0;	/* right record count */
3684	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
3685	int			numrecs;	/* temporary numrec count */
3686
3687	tcur = NULL;
3688
3689	/* Get the index of the entry being deleted, check for nothing there. */
3690	ptr = cur->bc_ptrs[level];
3691	if (ptr == 0) {
3692		*stat = 0;
3693		return 0;
3694	}
3695
3696	/* Get the buffer & block containing the record or key/ptr. */
3697	block = xfs_btree_get_block(cur, level, &bp);
3698	numrecs = xfs_btree_get_numrecs(block);
3699
3700#ifdef DEBUG
3701	error = xfs_btree_check_block(cur, block, level, bp);
3702	if (error)
3703		goto error0;
3704#endif
3705
3706	/* Fail if we're off the end of the block. */
3707	if (ptr > numrecs) {
3708		*stat = 0;
3709		return 0;
3710	}
3711
3712	XFS_BTREE_STATS_INC(cur, delrec);
3713	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3714
3715	/* Excise the entries being deleted. */
3716	if (level > 0) {
3717		/* It's a nonleaf. operate on keys and ptrs */
3718		union xfs_btree_key	*lkp;
3719		union xfs_btree_ptr	*lpp;
3720
3721		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3722		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3723
3724#ifdef DEBUG
3725		for (i = 0; i < numrecs - ptr; i++) {
3726			error = xfs_btree_check_ptr(cur, lpp, i, level);
3727			if (error)
3728				goto error0;
3729		}
3730#endif
3731
3732		if (ptr < numrecs) {
3733			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3734			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3735			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3736			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3737		}
3738	} else {
3739		/* It's a leaf. operate on records */
3740		if (ptr < numrecs) {
3741			xfs_btree_shift_recs(cur,
3742				xfs_btree_rec_addr(cur, ptr + 1, block),
3743				-1, numrecs - ptr);
3744			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3745		}
3746	}
3747
3748	/*
3749	 * Decrement and log the number of entries in the block.
3750	 */
3751	xfs_btree_set_numrecs(block, --numrecs);
3752	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3753
3754	/*
3755	 * If we are tracking the last record in the tree and
3756	 * we are at the far right edge of the tree, update it.
3757	 */
3758	if (xfs_btree_is_lastrec(cur, block, level)) {
3759		cur->bc_ops->update_lastrec(cur, block, NULL,
3760					    ptr, LASTREC_DELREC);
3761	}
3762
3763	/*
3764	 * We're at the root level.  First, shrink the root block in-memory.
3765	 * Try to get rid of the next level down.  If we can't then there's
3766	 * nothing left to do.
3767	 */
3768	if (level == cur->bc_nlevels - 1) {
3769		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3770			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3771					  cur->bc_private.b.whichfork);
3772
3773			error = xfs_btree_kill_iroot(cur);
3774			if (error)
3775				goto error0;
3776
3777			error = xfs_btree_dec_cursor(cur, level, stat);
3778			if (error)
3779				goto error0;
3780			*stat = 1;
3781			return 0;
3782		}
3783
3784		/*
3785		 * If this is the root level, and there's only one entry left,
3786		 * and it's NOT the leaf level, then we can get rid of this
3787		 * level.
3788		 */
3789		if (numrecs == 1 && level > 0) {
3790			union xfs_btree_ptr	*pp;
3791			/*
3792			 * pp is still set to the first pointer in the block.
3793			 * Make it the new root of the btree.
3794			 */
3795			pp = xfs_btree_ptr_addr(cur, 1, block);
3796			error = xfs_btree_kill_root(cur, bp, level, pp);
3797			if (error)
3798				goto error0;
3799		} else if (level > 0) {
3800			error = xfs_btree_dec_cursor(cur, level, stat);
3801			if (error)
3802				goto error0;
3803		}
3804		*stat = 1;
3805		return 0;
3806	}
3807
3808	/*
3809	 * If we deleted the leftmost entry in the block, update the
3810	 * key values above us in the tree.
3811	 */
3812	if (xfs_btree_needs_key_update(cur, ptr)) {
3813		error = xfs_btree_update_keys(cur, level);
3814		if (error)
3815			goto error0;
3816	}
3817
3818	/*
3819	 * If the number of records remaining in the block is at least
3820	 * the minimum, we're done.
3821	 */
3822	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3823		error = xfs_btree_dec_cursor(cur, level, stat);
3824		if (error)
3825			goto error0;
3826		return 0;
3827	}
3828
3829	/*
3830	 * Otherwise, we have to move some records around to keep the
3831	 * tree balanced.  Look at the left and right sibling blocks to
3832	 * see if we can re-balance by moving only one record.
3833	 */
3834	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3835	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3836
3837	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3838		/*
3839		 * One child of root, need to get a chance to copy its contents
3840		 * into the root and delete it. Can't go up to next level,
3841		 * there's nothing to delete there.
3842		 */
3843		if (xfs_btree_ptr_is_null(cur, &rptr) &&
3844		    xfs_btree_ptr_is_null(cur, &lptr) &&
3845		    level == cur->bc_nlevels - 2) {
3846			error = xfs_btree_kill_iroot(cur);
3847			if (!error)
3848				error = xfs_btree_dec_cursor(cur, level, stat);
3849			if (error)
3850				goto error0;
3851			return 0;
3852		}
3853	}
3854
3855	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3856	       !xfs_btree_ptr_is_null(cur, &lptr));
3857
3858	/*
3859	 * Duplicate the cursor so our btree manipulations here won't
3860	 * disrupt the next level up.
3861	 */
3862	error = xfs_btree_dup_cursor(cur, &tcur);
3863	if (error)
3864		goto error0;
3865
3866	/*
3867	 * If there's a right sibling, see if it's ok to shift an entry
3868	 * out of it.
3869	 */
3870	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3871		/*
3872		 * Move the temp cursor to the last entry in the next block.
3873		 * Actually any entry but the first would suffice.
3874		 */
3875		i = xfs_btree_lastrec(tcur, level);
3876		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3877
3878		error = xfs_btree_increment(tcur, level, &i);
3879		if (error)
3880			goto error0;
3881		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3882
3883		i = xfs_btree_lastrec(tcur, level);
3884		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3885
3886		/* Grab a pointer to the block. */
3887		right = xfs_btree_get_block(tcur, level, &rbp);
3888#ifdef DEBUG
3889		error = xfs_btree_check_block(tcur, right, level, rbp);
3890		if (error)
3891			goto error0;
3892#endif
3893		/* Grab the current block number, for future use. */
3894		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3895
3896		/*
3897		 * If right block is full enough so that removing one entry
3898		 * won't make it too empty, and left-shifting an entry out
3899		 * of right to us works, we're done.
3900		 */
3901		if (xfs_btree_get_numrecs(right) - 1 >=
3902		    cur->bc_ops->get_minrecs(tcur, level)) {
3903			error = xfs_btree_lshift(tcur, level, &i);
3904			if (error)
3905				goto error0;
3906			if (i) {
3907				ASSERT(xfs_btree_get_numrecs(block) >=
3908				       cur->bc_ops->get_minrecs(tcur, level));
3909
3910				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3911				tcur = NULL;
3912
3913				error = xfs_btree_dec_cursor(cur, level, stat);
3914				if (error)
3915					goto error0;
3916				return 0;
3917			}
3918		}
3919
3920		/*
3921		 * Otherwise, grab the number of records in right for
3922		 * future reference, and fix up the temp cursor to point
3923		 * to our block again (last record).
3924		 */
3925		rrecs = xfs_btree_get_numrecs(right);
3926		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3927			i = xfs_btree_firstrec(tcur, level);
3928			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3929
3930			error = xfs_btree_decrement(tcur, level, &i);
3931			if (error)
3932				goto error0;
3933			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3934		}
3935	}
3936
3937	/*
3938	 * If there's a left sibling, see if it's ok to shift an entry
3939	 * out of it.
3940	 */
3941	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3942		/*
3943		 * Move the temp cursor to the first entry in the
3944		 * previous block.
3945		 */
3946		i = xfs_btree_firstrec(tcur, level);
3947		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3948
3949		error = xfs_btree_decrement(tcur, level, &i);
3950		if (error)
3951			goto error0;
3952		i = xfs_btree_firstrec(tcur, level);
3953		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3954
3955		/* Grab a pointer to the block. */
3956		left = xfs_btree_get_block(tcur, level, &lbp);
3957#ifdef DEBUG
3958		error = xfs_btree_check_block(cur, left, level, lbp);
3959		if (error)
3960			goto error0;
3961#endif
3962		/* Grab the current block number, for future use. */
3963		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3964
3965		/*
3966		 * If left block is full enough so that removing one entry
3967		 * won't make it too empty, and right-shifting an entry out
3968		 * of left to us works, we're done.
3969		 */
3970		if (xfs_btree_get_numrecs(left) - 1 >=
3971		    cur->bc_ops->get_minrecs(tcur, level)) {
3972			error = xfs_btree_rshift(tcur, level, &i);
3973			if (error)
3974				goto error0;
3975			if (i) {
3976				ASSERT(xfs_btree_get_numrecs(block) >=
3977				       cur->bc_ops->get_minrecs(tcur, level));
3978				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3979				tcur = NULL;
3980				if (level == 0)
3981					cur->bc_ptrs[0]++;
3982
3983				*stat = 1;
3984				return 0;
3985			}
3986		}
3987
3988		/*
3989		 * Otherwise, grab the number of records in right for
3990		 * future reference.
3991		 */
3992		lrecs = xfs_btree_get_numrecs(left);
3993	}
3994
3995	/* Delete the temp cursor, we're done with it. */
3996	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3997	tcur = NULL;
3998
3999	/* If here, we need to do a join to keep the tree balanced. */
4000	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4001
4002	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4003	    lrecs + xfs_btree_get_numrecs(block) <=
4004			cur->bc_ops->get_maxrecs(cur, level)) {
4005		/*
4006		 * Set "right" to be the starting block,
4007		 * "left" to be the left neighbor.
4008		 */
4009		rptr = cptr;
4010		right = block;
4011		rbp = bp;
4012		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4013		if (error)
4014			goto error0;
4015
4016	/*
4017	 * If that won't work, see if we can join with the right neighbor block.
4018	 */
4019	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4020		   rrecs + xfs_btree_get_numrecs(block) <=
4021			cur->bc_ops->get_maxrecs(cur, level)) {
4022		/*
4023		 * Set "left" to be the starting block,
4024		 * "right" to be the right neighbor.
4025		 */
4026		lptr = cptr;
4027		left = block;
4028		lbp = bp;
4029		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4030		if (error)
4031			goto error0;
4032
4033	/*
4034	 * Otherwise, we can't fix the imbalance.
4035	 * Just return.  This is probably a logic error, but it's not fatal.
4036	 */
4037	} else {
4038		error = xfs_btree_dec_cursor(cur, level, stat);
4039		if (error)
4040			goto error0;
4041		return 0;
4042	}
4043
4044	rrecs = xfs_btree_get_numrecs(right);
4045	lrecs = xfs_btree_get_numrecs(left);
4046
4047	/*
4048	 * We're now going to join "left" and "right" by moving all the stuff
4049	 * in "right" to "left" and deleting "right".
4050	 */
4051	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4052	if (level > 0) {
4053		/* It's a non-leaf.  Move keys and pointers. */
4054		union xfs_btree_key	*lkp;	/* left btree key */
4055		union xfs_btree_ptr	*lpp;	/* left address pointer */
4056		union xfs_btree_key	*rkp;	/* right btree key */
4057		union xfs_btree_ptr	*rpp;	/* right address pointer */
4058
4059		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4060		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4061		rkp = xfs_btree_key_addr(cur, 1, right);
4062		rpp = xfs_btree_ptr_addr(cur, 1, right);
4063#ifdef DEBUG
4064		for (i = 1; i < rrecs; i++) {
4065			error = xfs_btree_check_ptr(cur, rpp, i, level);
4066			if (error)
4067				goto error0;
4068		}
4069#endif
4070		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4071		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4072
4073		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4074		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4075	} else {
4076		/* It's a leaf.  Move records.  */
4077		union xfs_btree_rec	*lrp;	/* left record pointer */
4078		union xfs_btree_rec	*rrp;	/* right record pointer */
4079
4080		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4081		rrp = xfs_btree_rec_addr(cur, 1, right);
4082
4083		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4084		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4085	}
4086
4087	XFS_BTREE_STATS_INC(cur, join);
4088
4089	/*
4090	 * Fix up the number of records and right block pointer in the
4091	 * surviving block, and log it.
4092	 */
4093	xfs_btree_set_numrecs(left, lrecs + rrecs);
4094	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4095	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4096	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4097
4098	/* If there is a right sibling, point it to the remaining block. */
4099	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4100	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4101		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4102		if (error)
4103			goto error0;
4104		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4105		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4106	}
4107
4108	/* Free the deleted block. */
4109	error = xfs_btree_free_block(cur, rbp);
4110	if (error)
4111		goto error0;
4112
4113	/*
4114	 * If we joined with the left neighbor, set the buffer in the
4115	 * cursor to the left block, and fix up the index.
4116	 */
4117	if (bp != lbp) {
4118		cur->bc_bufs[level] = lbp;
4119		cur->bc_ptrs[level] += lrecs;
4120		cur->bc_ra[level] = 0;
4121	}
4122	/*
4123	 * If we joined with the right neighbor and there's a level above
4124	 * us, increment the cursor at that level.
4125	 */
4126	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4127		   (level + 1 < cur->bc_nlevels)) {
4128		error = xfs_btree_increment(cur, level + 1, &i);
4129		if (error)
4130			goto error0;
4131	}
4132
4133	/*
4134	 * Readjust the ptr at this level if it's not a leaf, since it's
4135	 * still pointing at the deletion point, which makes the cursor
4136	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4137	 * We can't use decrement because it would change the next level up.
4138	 */
4139	if (level > 0)
4140		cur->bc_ptrs[level]--;
4141
4142	/*
4143	 * We combined blocks, so we have to update the parent keys if the
4144	 * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4145	 * points to the old block so that the caller knows which record to
4146	 * delete.  Therefore, the caller must be savvy enough to call updkeys
4147	 * for us if we return stat == 2.  The other exit points from this
4148	 * function don't require deletions further up the tree, so they can
4149	 * call updkeys directly.
4150	 */
4151
4152	/* Return value means the next level up has something to do. */
4153	*stat = 2;
4154	return 0;
4155
4156error0:
4157	if (tcur)
4158		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4159	return error;
4160}
4161
4162/*
4163 * Delete the record pointed to by cur.
4164 * The cursor refers to the place where the record was (could be inserted)
4165 * when the operation returns.
4166 */
4167int					/* error */
4168xfs_btree_delete(
4169	struct xfs_btree_cur	*cur,
4170	int			*stat)	/* success/failure */
4171{
4172	int			error;	/* error return value */
4173	int			level;
4174	int			i;
4175	bool			joined = false;
4176
4177	/*
4178	 * Go up the tree, starting at leaf level.
4179	 *
4180	 * If 2 is returned then a join was done; go to the next level.
4181	 * Otherwise we are done.
4182	 */
4183	for (level = 0, i = 2; i == 2; level++) {
4184		error = xfs_btree_delrec(cur, level, &i);
4185		if (error)
4186			goto error0;
4187		if (i == 2)
4188			joined = true;
4189	}
4190
4191	/*
4192	 * If we combined blocks as part of deleting the record, delrec won't
4193	 * have updated the parent high keys so we have to do that here.
4194	 */
4195	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4196		error = xfs_btree_updkeys_force(cur, 0);
4197		if (error)
4198			goto error0;
4199	}
4200
4201	if (i == 0) {
4202		for (level = 1; level < cur->bc_nlevels; level++) {
4203			if (cur->bc_ptrs[level] == 0) {
4204				error = xfs_btree_decrement(cur, level, &i);
4205				if (error)
4206					goto error0;
4207				break;
4208			}
4209		}
4210	}
4211
4212	*stat = i;
4213	return 0;
4214error0:
4215	return error;
4216}
4217
4218/*
4219 * Get the data from the pointed-to record.
4220 */
4221int					/* error */
4222xfs_btree_get_rec(
4223	struct xfs_btree_cur	*cur,	/* btree cursor */
4224	union xfs_btree_rec	**recp,	/* output: btree record */
4225	int			*stat)	/* output: success/failure */
4226{
4227	struct xfs_btree_block	*block;	/* btree block */
4228	struct xfs_buf		*bp;	/* buffer pointer */
4229	int			ptr;	/* record number */
4230#ifdef DEBUG
4231	int			error;	/* error return value */
4232#endif
4233
4234	ptr = cur->bc_ptrs[0];
4235	block = xfs_btree_get_block(cur, 0, &bp);
4236
4237#ifdef DEBUG
4238	error = xfs_btree_check_block(cur, block, 0, bp);
4239	if (error)
4240		return error;
4241#endif
4242
4243	/*
4244	 * Off the right end or left end, return failure.
4245	 */
4246	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4247		*stat = 0;
4248		return 0;
4249	}
4250
4251	/*
4252	 * Point to the record and extract its data.
4253	 */
4254	*recp = xfs_btree_rec_addr(cur, ptr, block);
4255	*stat = 1;
4256	return 0;
4257}
4258
4259/* Visit a block in a btree. */
4260STATIC int
4261xfs_btree_visit_block(
4262	struct xfs_btree_cur		*cur,
4263	int				level,
4264	xfs_btree_visit_blocks_fn	fn,
4265	void				*data)
4266{
4267	struct xfs_btree_block		*block;
4268	struct xfs_buf			*bp;
4269	union xfs_btree_ptr		rptr;
4270	int				error;
4271
4272	/* do right sibling readahead */
4273	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4274	block = xfs_btree_get_block(cur, level, &bp);
4275
4276	/* process the block */
4277	error = fn(cur, level, data);
4278	if (error)
4279		return error;
4280
4281	/* now read rh sibling block for next iteration */
4282	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4283	if (xfs_btree_ptr_is_null(cur, &rptr))
4284		return -ENOENT;
4285
4286	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4287}
4288
4289
4290/* Visit every block in a btree. */
4291int
4292xfs_btree_visit_blocks(
4293	struct xfs_btree_cur		*cur,
4294	xfs_btree_visit_blocks_fn	fn,
4295	void				*data)
4296{
4297	union xfs_btree_ptr		lptr;
4298	int				level;
4299	struct xfs_btree_block		*block = NULL;
4300	int				error = 0;
4301
4302	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4303
4304	/* for each level */
4305	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4306		/* grab the left hand block */
4307		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4308		if (error)
4309			return error;
4310
4311		/* readahead the left most block for the next level down */
4312		if (level > 0) {
4313			union xfs_btree_ptr     *ptr;
4314
4315			ptr = xfs_btree_ptr_addr(cur, 1, block);
4316			xfs_btree_readahead_ptr(cur, ptr, 1);
4317
4318			/* save for the next iteration of the loop */
4319			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4320		}
4321
4322		/* for each buffer in the level */
4323		do {
4324			error = xfs_btree_visit_block(cur, level, fn, data);
4325		} while (!error);
4326
4327		if (error != -ENOENT)
4328			return error;
4329	}
4330
4331	return 0;
4332}
4333
4334/*
4335 * Change the owner of a btree.
4336 *
4337 * The mechanism we use here is ordered buffer logging. Because we don't know
4338 * how many buffers were are going to need to modify, we don't really want to
4339 * have to make transaction reservations for the worst case of every buffer in a
4340 * full size btree as that may be more space that we can fit in the log....
4341 *
4342 * We do the btree walk in the most optimal manner possible - we have sibling
4343 * pointers so we can just walk all the blocks on each level from left to right
4344 * in a single pass, and then move to the next level and do the same. We can
4345 * also do readahead on the sibling pointers to get IO moving more quickly,
4346 * though for slow disks this is unlikely to make much difference to performance
4347 * as the amount of CPU work we have to do before moving to the next block is
4348 * relatively small.
4349 *
4350 * For each btree block that we load, modify the owner appropriately, set the
4351 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4352 * we mark the region we change dirty so that if the buffer is relogged in
4353 * a subsequent transaction the changes we make here as an ordered buffer are
4354 * correctly relogged in that transaction.  If we are in recovery context, then
4355 * just queue the modified buffer as delayed write buffer so the transaction
4356 * recovery completion writes the changes to disk.
4357 */
4358struct xfs_btree_block_change_owner_info {
4359	uint64_t		new_owner;
4360	struct list_head	*buffer_list;
4361};
4362
4363static int
4364xfs_btree_block_change_owner(
4365	struct xfs_btree_cur	*cur,
4366	int			level,
4367	void			*data)
4368{
4369	struct xfs_btree_block_change_owner_info	*bbcoi = data;
4370	struct xfs_btree_block	*block;
4371	struct xfs_buf		*bp;
4372
4373	/* modify the owner */
4374	block = xfs_btree_get_block(cur, level, &bp);
4375	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4376		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4377			return 0;
4378		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4379	} else {
4380		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4381			return 0;
4382		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4383	}
4384
4385	/*
4386	 * If the block is a root block hosted in an inode, we might not have a
4387	 * buffer pointer here and we shouldn't attempt to log the change as the
4388	 * information is already held in the inode and discarded when the root
4389	 * block is formatted into the on-disk inode fork. We still change it,
4390	 * though, so everything is consistent in memory.
4391	 */
4392	if (!bp) {
4393		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4394		ASSERT(level == cur->bc_nlevels - 1);
4395		return 0;
4396	}
4397
4398	if (cur->bc_tp) {
4399		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4400			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4401			return -EAGAIN;
4402		}
4403	} else {
4404		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4405	}
4406
4407	return 0;
4408}
4409
4410int
4411xfs_btree_change_owner(
4412	struct xfs_btree_cur	*cur,
4413	uint64_t		new_owner,
4414	struct list_head	*buffer_list)
4415{
4416	struct xfs_btree_block_change_owner_info	bbcoi;
4417
4418	bbcoi.new_owner = new_owner;
4419	bbcoi.buffer_list = buffer_list;
4420
4421	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4422			&bbcoi);
4423}
4424
4425/* Verify the v5 fields of a long-format btree block. */
4426xfs_failaddr_t
4427xfs_btree_lblock_v5hdr_verify(
4428	struct xfs_buf		*bp,
4429	uint64_t		owner)
4430{
4431	struct xfs_mount	*mp = bp->b_target->bt_mount;
4432	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4433
4434	if (!xfs_sb_version_hascrc(&mp->m_sb))
4435		return __this_address;
4436	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4437		return __this_address;
4438	if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4439		return __this_address;
4440	if (owner != XFS_RMAP_OWN_UNKNOWN &&
4441	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4442		return __this_address;
4443	return NULL;
4444}
4445
4446/* Verify a long-format btree block. */
4447xfs_failaddr_t
4448xfs_btree_lblock_verify(
4449	struct xfs_buf		*bp,
4450	unsigned int		max_recs)
4451{
4452	struct xfs_mount	*mp = bp->b_target->bt_mount;
4453	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4454
4455	/* numrecs verification */
4456	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4457		return __this_address;
4458
4459	/* sibling pointer verification */
4460	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4461	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4462		return __this_address;
4463	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4464	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4465		return __this_address;
4466
4467	return NULL;
4468}
4469
4470/**
4471 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4472 *				      btree block
4473 *
4474 * @bp: buffer containing the btree block
4475 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4476 * @pag_max_level: pointer to the per-ag max level field
4477 */
4478xfs_failaddr_t
4479xfs_btree_sblock_v5hdr_verify(
4480	struct xfs_buf		*bp)
4481{
4482	struct xfs_mount	*mp = bp->b_target->bt_mount;
4483	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4484	struct xfs_perag	*pag = bp->b_pag;
4485
4486	if (!xfs_sb_version_hascrc(&mp->m_sb))
4487		return __this_address;
4488	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4489		return __this_address;
4490	if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4491		return __this_address;
4492	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4493		return __this_address;
4494	return NULL;
4495}
4496
4497/**
4498 * xfs_btree_sblock_verify() -- verify a short-format btree block
4499 *
4500 * @bp: buffer containing the btree block
4501 * @max_recs: maximum records allowed in this btree node
4502 */
4503xfs_failaddr_t
4504xfs_btree_sblock_verify(
4505	struct xfs_buf		*bp,
4506	unsigned int		max_recs)
4507{
4508	struct xfs_mount	*mp = bp->b_target->bt_mount;
4509	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4510	xfs_agblock_t		agno;
4511
4512	/* numrecs verification */
4513	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4514		return __this_address;
4515
4516	/* sibling pointer verification */
4517	agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4518	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4519	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4520		return __this_address;
4521	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4522	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4523		return __this_address;
4524
4525	return NULL;
4526}
4527
4528/*
4529 * Calculate the number of btree levels needed to store a given number of
4530 * records in a short-format btree.
4531 */
4532uint
4533xfs_btree_compute_maxlevels(
4534	uint			*limits,
4535	unsigned long		len)
4536{
4537	uint			level;
4538	unsigned long		maxblocks;
4539
4540	maxblocks = (len + limits[0] - 1) / limits[0];
4541	for (level = 1; maxblocks > 1; level++)
4542		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4543	return level;
4544}
4545
4546/*
4547 * Query a regular btree for all records overlapping a given interval.
4548 * Start with a LE lookup of the key of low_rec and return all records
4549 * until we find a record with a key greater than the key of high_rec.
4550 */
4551STATIC int
4552xfs_btree_simple_query_range(
4553	struct xfs_btree_cur		*cur,
4554	union xfs_btree_key		*low_key,
4555	union xfs_btree_key		*high_key,
4556	xfs_btree_query_range_fn	fn,
4557	void				*priv)
4558{
4559	union xfs_btree_rec		*recp;
4560	union xfs_btree_key		rec_key;
4561	int64_t				diff;
4562	int				stat;
4563	bool				firstrec = true;
4564	int				error;
4565
4566	ASSERT(cur->bc_ops->init_high_key_from_rec);
4567	ASSERT(cur->bc_ops->diff_two_keys);
4568
4569	/*
4570	 * Find the leftmost record.  The btree cursor must be set
4571	 * to the low record used to generate low_key.
4572	 */
4573	stat = 0;
4574	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4575	if (error)
4576		goto out;
4577
4578	/* Nothing?  See if there's anything to the right. */
4579	if (!stat) {
4580		error = xfs_btree_increment(cur, 0, &stat);
4581		if (error)
4582			goto out;
4583	}
4584
4585	while (stat) {
4586		/* Find the record. */
4587		error = xfs_btree_get_rec(cur, &recp, &stat);
4588		if (error || !stat)
4589			break;
4590
4591		/* Skip if high_key(rec) < low_key. */
4592		if (firstrec) {
4593			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4594			firstrec = false;
4595			diff = cur->bc_ops->diff_two_keys(cur, low_key,
4596					&rec_key);
4597			if (diff > 0)
4598				goto advloop;
4599		}
4600
4601		/* Stop if high_key < low_key(rec). */
4602		cur->bc_ops->init_key_from_rec(&rec_key, recp);
4603		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4604		if (diff > 0)
4605			break;
4606
4607		/* Callback */
4608		error = fn(cur, recp, priv);
4609		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4610			break;
4611
4612advloop:
4613		/* Move on to the next record. */
4614		error = xfs_btree_increment(cur, 0, &stat);
4615		if (error)
4616			break;
4617	}
4618
4619out:
4620	return error;
4621}
4622
4623/*
4624 * Query an overlapped interval btree for all records overlapping a given
4625 * interval.  This function roughly follows the algorithm given in
4626 * "Interval Trees" of _Introduction to Algorithms_, which is section
4627 * 14.3 in the 2nd and 3rd editions.
4628 *
4629 * First, generate keys for the low and high records passed in.
4630 *
4631 * For any leaf node, generate the high and low keys for the record.
4632 * If the record keys overlap with the query low/high keys, pass the
4633 * record to the function iterator.
4634 *
4635 * For any internal node, compare the low and high keys of each
4636 * pointer against the query low/high keys.  If there's an overlap,
4637 * follow the pointer.
4638 *
4639 * As an optimization, we stop scanning a block when we find a low key
4640 * that is greater than the query's high key.
4641 */
4642STATIC int
4643xfs_btree_overlapped_query_range(
4644	struct xfs_btree_cur		*cur,
4645	union xfs_btree_key		*low_key,
4646	union xfs_btree_key		*high_key,
4647	xfs_btree_query_range_fn	fn,
4648	void				*priv)
4649{
4650	union xfs_btree_ptr		ptr;
4651	union xfs_btree_ptr		*pp;
4652	union xfs_btree_key		rec_key;
4653	union xfs_btree_key		rec_hkey;
4654	union xfs_btree_key		*lkp;
4655	union xfs_btree_key		*hkp;
4656	union xfs_btree_rec		*recp;
4657	struct xfs_btree_block		*block;
4658	int64_t				ldiff;
4659	int64_t				hdiff;
4660	int				level;
4661	struct xfs_buf			*bp;
4662	int				i;
4663	int				error;
4664
4665	/* Load the root of the btree. */
4666	level = cur->bc_nlevels - 1;
4667	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4668	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4669	if (error)
4670		return error;
4671	xfs_btree_get_block(cur, level, &bp);
4672	trace_xfs_btree_overlapped_query_range(cur, level, bp);
4673#ifdef DEBUG
4674	error = xfs_btree_check_block(cur, block, level, bp);
4675	if (error)
4676		goto out;
4677#endif
4678	cur->bc_ptrs[level] = 1;
4679
4680	while (level < cur->bc_nlevels) {
4681		block = xfs_btree_get_block(cur, level, &bp);
4682
4683		/* End of node, pop back towards the root. */
4684		if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4685pop_up:
4686			if (level < cur->bc_nlevels - 1)
4687				cur->bc_ptrs[level + 1]++;
4688			level++;
4689			continue;
4690		}
4691
4692		if (level == 0) {
4693			/* Handle a leaf node. */
4694			recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4695
4696			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4697			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4698					low_key);
4699
4700			cur->bc_ops->init_key_from_rec(&rec_key, recp);
4701			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4702					&rec_key);
4703
4704			/*
4705			 * If (record's high key >= query's low key) and
4706			 *    (query's high key >= record's low key), then
4707			 * this record overlaps the query range; callback.
4708			 */
4709			if (ldiff >= 0 && hdiff >= 0) {
4710				error = fn(cur, recp, priv);
4711				if (error < 0 ||
4712				    error == XFS_BTREE_QUERY_RANGE_ABORT)
4713					break;
4714			} else if (hdiff < 0) {
4715				/* Record is larger than high key; pop. */
4716				goto pop_up;
4717			}
4718			cur->bc_ptrs[level]++;
4719			continue;
4720		}
4721
4722		/* Handle an internal node. */
4723		lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4724		hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4725		pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4726
4727		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4728		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4729
4730		/*
4731		 * If (pointer's high key >= query's low key) and
4732		 *    (query's high key >= pointer's low key), then
4733		 * this record overlaps the query range; follow pointer.
4734		 */
4735		if (ldiff >= 0 && hdiff >= 0) {
4736			level--;
4737			error = xfs_btree_lookup_get_block(cur, level, pp,
4738					&block);
4739			if (error)
4740				goto out;
4741			xfs_btree_get_block(cur, level, &bp);
4742			trace_xfs_btree_overlapped_query_range(cur, level, bp);
4743#ifdef DEBUG
4744			error = xfs_btree_check_block(cur, block, level, bp);
4745			if (error)
4746				goto out;
4747#endif
4748			cur->bc_ptrs[level] = 1;
4749			continue;
4750		} else if (hdiff < 0) {
4751			/* The low key is larger than the upper range; pop. */
4752			goto pop_up;
4753		}
4754		cur->bc_ptrs[level]++;
4755	}
4756
4757out:
4758	/*
4759	 * If we don't end this function with the cursor pointing at a record
4760	 * block, a subsequent non-error cursor deletion will not release
4761	 * node-level buffers, causing a buffer leak.  This is quite possible
4762	 * with a zero-results range query, so release the buffers if we
4763	 * failed to return any results.
4764	 */
4765	if (cur->bc_bufs[0] == NULL) {
4766		for (i = 0; i < cur->bc_nlevels; i++) {
4767			if (cur->bc_bufs[i]) {
4768				xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4769				cur->bc_bufs[i] = NULL;
4770				cur->bc_ptrs[i] = 0;
4771				cur->bc_ra[i] = 0;
4772			}
4773		}
4774	}
4775
4776	return error;
4777}
4778
4779/*
4780 * Query a btree for all records overlapping a given interval of keys.  The
4781 * supplied function will be called with each record found; return one of the
4782 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4783 * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4784 * negative error code.
4785 */
4786int
4787xfs_btree_query_range(
4788	struct xfs_btree_cur		*cur,
4789	union xfs_btree_irec		*low_rec,
4790	union xfs_btree_irec		*high_rec,
4791	xfs_btree_query_range_fn	fn,
4792	void				*priv)
4793{
4794	union xfs_btree_rec		rec;
4795	union xfs_btree_key		low_key;
4796	union xfs_btree_key		high_key;
4797
4798	/* Find the keys of both ends of the interval. */
4799	cur->bc_rec = *high_rec;
4800	cur->bc_ops->init_rec_from_cur(cur, &rec);
4801	cur->bc_ops->init_key_from_rec(&high_key, &rec);
4802
4803	cur->bc_rec = *low_rec;
4804	cur->bc_ops->init_rec_from_cur(cur, &rec);
4805	cur->bc_ops->init_key_from_rec(&low_key, &rec);
4806
4807	/* Enforce low key < high key. */
4808	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4809		return -EINVAL;
4810
4811	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4812		return xfs_btree_simple_query_range(cur, &low_key,
4813				&high_key, fn, priv);
4814	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4815			fn, priv);
4816}
4817
4818/* Query a btree for all records. */
4819int
4820xfs_btree_query_all(
4821	struct xfs_btree_cur		*cur,
4822	xfs_btree_query_range_fn	fn,
4823	void				*priv)
4824{
4825	union xfs_btree_key		low_key;
4826	union xfs_btree_key		high_key;
4827
4828	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4829	memset(&low_key, 0, sizeof(low_key));
4830	memset(&high_key, 0xFF, sizeof(high_key));
4831
4832	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4833}
4834
4835/*
4836 * Calculate the number of blocks needed to store a given number of records
4837 * in a short-format (per-AG metadata) btree.
4838 */
4839xfs_extlen_t
4840xfs_btree_calc_size(
4841	uint			*limits,
4842	unsigned long long	len)
4843{
4844	int			level;
4845	int			maxrecs;
4846	xfs_extlen_t		rval;
4847
4848	maxrecs = limits[0];
4849	for (level = 0, rval = 0; len > 1; level++) {
4850		len += maxrecs - 1;
4851		do_div(len, maxrecs);
4852		maxrecs = limits[1];
4853		rval += len;
4854	}
4855	return rval;
4856}
4857
4858static int
4859xfs_btree_count_blocks_helper(
4860	struct xfs_btree_cur	*cur,
4861	int			level,
4862	void			*data)
4863{
4864	xfs_extlen_t		*blocks = data;
4865	(*blocks)++;
4866
4867	return 0;
4868}
4869
4870/* Count the blocks in a btree and return the result in *blocks. */
4871int
4872xfs_btree_count_blocks(
4873	struct xfs_btree_cur	*cur,
4874	xfs_extlen_t		*blocks)
4875{
4876	*blocks = 0;
4877	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4878			blocks);
4879}
4880
4881/* Compare two btree pointers. */
4882int64_t
4883xfs_btree_diff_two_ptrs(
4884	struct xfs_btree_cur		*cur,
4885	const union xfs_btree_ptr	*a,
4886	const union xfs_btree_ptr	*b)
4887{
4888	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4889		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4890	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4891}
4892
4893/* If there's an extent, we're done. */
4894STATIC int
4895xfs_btree_has_record_helper(
4896	struct xfs_btree_cur		*cur,
4897	union xfs_btree_rec		*rec,
4898	void				*priv)
4899{
4900	return XFS_BTREE_QUERY_RANGE_ABORT;
4901}
4902
4903/* Is there a record covering a given range of keys? */
4904int
4905xfs_btree_has_record(
4906	struct xfs_btree_cur	*cur,
4907	union xfs_btree_irec	*low,
4908	union xfs_btree_irec	*high,
4909	bool			*exists)
4910{
4911	int			error;
4912
4913	error = xfs_btree_query_range(cur, low, high,
4914			&xfs_btree_has_record_helper, NULL);
4915	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
4916		*exists = true;
4917		return 0;
4918	}
4919	*exists = false;
4920	return error;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4921}
v5.4
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
 
 
 
 
 
 
 
 
 
 
 
 
 
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
 
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
 
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
 
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23
  24/*
  25 * Cursor allocation zone.
  26 */
  27kmem_zone_t	*xfs_btree_cur_zone;
  28
  29/*
  30 * Btree magic numbers.
  31 */
  32static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  33	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  34	  XFS_FIBT_MAGIC, 0 },
  35	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  36	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  37	  XFS_REFC_CRC_MAGIC }
  38};
  39
  40uint32_t
  41xfs_btree_magic(
  42	int			crc,
  43	xfs_btnum_t		btnum)
  44{
  45	uint32_t		magic = xfs_magics[crc][btnum];
  46
  47	/* Ensure we asked for crc for crc-only magics. */
  48	ASSERT(magic != 0);
  49	return magic;
  50}
  51
  52/*
  53 * Check a long btree block header.  Return the address of the failing check,
  54 * or NULL if everything is ok.
  55 */
  56xfs_failaddr_t
  57__xfs_btree_check_lblock(
  58	struct xfs_btree_cur	*cur,
  59	struct xfs_btree_block	*block,
  60	int			level,
  61	struct xfs_buf		*bp)
  62{
  63	struct xfs_mount	*mp = cur->bc_mp;
  64	xfs_btnum_t		btnum = cur->bc_btnum;
  65	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
  66
  67	if (crc) {
  68		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  69			return __this_address;
  70		if (block->bb_u.l.bb_blkno !=
  71		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
  72			return __this_address;
  73		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  74			return __this_address;
  75	}
  76
  77	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  78		return __this_address;
  79	if (be16_to_cpu(block->bb_level) != level)
  80		return __this_address;
  81	if (be16_to_cpu(block->bb_numrecs) >
  82	    cur->bc_ops->get_maxrecs(cur, level))
  83		return __this_address;
  84	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  85	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  86			level + 1))
  87		return __this_address;
  88	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  89	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  90			level + 1))
  91		return __this_address;
  92
  93	return NULL;
  94}
  95
  96/* Check a long btree block header. */
  97static int
  98xfs_btree_check_lblock(
  99	struct xfs_btree_cur	*cur,
 100	struct xfs_btree_block	*block,
 101	int			level,
 102	struct xfs_buf		*bp)
 103{
 104	struct xfs_mount	*mp = cur->bc_mp;
 105	xfs_failaddr_t		fa;
 106
 107	fa = __xfs_btree_check_lblock(cur, block, level, bp);
 108	if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 109			XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
 110		if (bp)
 111			trace_xfs_btree_corrupt(bp, _RET_IP_);
 112		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 113		return -EFSCORRUPTED;
 114	}
 115	return 0;
 116}
 117
 118/*
 119 * Check a short btree block header.  Return the address of the failing check,
 120 * or NULL if everything is ok.
 121 */
 122xfs_failaddr_t
 123__xfs_btree_check_sblock(
 124	struct xfs_btree_cur	*cur,
 125	struct xfs_btree_block	*block,
 126	int			level,
 127	struct xfs_buf		*bp)
 128{
 129	struct xfs_mount	*mp = cur->bc_mp;
 130	xfs_btnum_t		btnum = cur->bc_btnum;
 131	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
 132
 133	if (crc) {
 134		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 135			return __this_address;
 136		if (block->bb_u.s.bb_blkno !=
 137		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
 138			return __this_address;
 139	}
 140
 141	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 142		return __this_address;
 143	if (be16_to_cpu(block->bb_level) != level)
 144		return __this_address;
 145	if (be16_to_cpu(block->bb_numrecs) >
 146	    cur->bc_ops->get_maxrecs(cur, level))
 147		return __this_address;
 148	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 149	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 150			level + 1))
 151		return __this_address;
 152	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 153	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 154			level + 1))
 155		return __this_address;
 156
 157	return NULL;
 158}
 159
 160/* Check a short btree block header. */
 161STATIC int
 162xfs_btree_check_sblock(
 163	struct xfs_btree_cur	*cur,
 164	struct xfs_btree_block	*block,
 165	int			level,
 166	struct xfs_buf		*bp)
 167{
 168	struct xfs_mount	*mp = cur->bc_mp;
 169	xfs_failaddr_t		fa;
 170
 171	fa = __xfs_btree_check_sblock(cur, block, level, bp);
 172	if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 173			XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
 174		if (bp)
 175			trace_xfs_btree_corrupt(bp, _RET_IP_);
 176		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 177		return -EFSCORRUPTED;
 178	}
 179	return 0;
 180}
 181
 182/*
 183 * Debug routine: check that block header is ok.
 184 */
 185int
 186xfs_btree_check_block(
 187	struct xfs_btree_cur	*cur,	/* btree cursor */
 188	struct xfs_btree_block	*block,	/* generic btree block pointer */
 189	int			level,	/* level of the btree block */
 190	struct xfs_buf		*bp)	/* buffer containing block, if any */
 191{
 192	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 193		return xfs_btree_check_lblock(cur, block, level, bp);
 194	else
 195		return xfs_btree_check_sblock(cur, block, level, bp);
 196}
 197
 198/* Check that this long pointer is valid and points within the fs. */
 199bool
 200xfs_btree_check_lptr(
 201	struct xfs_btree_cur	*cur,
 202	xfs_fsblock_t		fsbno,
 203	int			level)
 204{
 205	if (level <= 0)
 206		return false;
 207	return xfs_verify_fsbno(cur->bc_mp, fsbno);
 208}
 209
 210/* Check that this short pointer is valid and points within the AG. */
 211bool
 212xfs_btree_check_sptr(
 213	struct xfs_btree_cur	*cur,
 214	xfs_agblock_t		agbno,
 215	int			level)
 216{
 217	if (level <= 0)
 218		return false;
 219	return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
 220}
 221
 
 222/*
 223 * Check that a given (indexed) btree pointer at a certain level of a
 224 * btree is valid and doesn't point past where it should.
 225 */
 226static int
 227xfs_btree_check_ptr(
 228	struct xfs_btree_cur	*cur,
 229	union xfs_btree_ptr	*ptr,
 230	int			index,
 231	int			level)
 232{
 233	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 234		if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 235				level))
 236			return 0;
 237		xfs_err(cur->bc_mp,
 238"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 239				cur->bc_private.b.ip->i_ino,
 240				cur->bc_private.b.whichfork, cur->bc_btnum,
 241				level, index);
 242	} else {
 243		if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 244				level))
 245			return 0;
 246		xfs_err(cur->bc_mp,
 247"AG %u: Corrupt btree %d pointer at level %d index %d.",
 248				cur->bc_private.a.agno, cur->bc_btnum,
 249				level, index);
 250	}
 251
 252	return -EFSCORRUPTED;
 253}
 254
 255#ifdef DEBUG
 256# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
 257#else
 258# define xfs_btree_debug_check_ptr(...)	(0)
 259#endif
 260
 261/*
 262 * Calculate CRC on the whole btree block and stuff it into the
 263 * long-form btree header.
 264 *
 265 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 266 * it into the buffer so recovery knows what the last modification was that made
 267 * it to disk.
 268 */
 269void
 270xfs_btree_lblock_calc_crc(
 271	struct xfs_buf		*bp)
 272{
 273	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 274	struct xfs_buf_log_item	*bip = bp->b_log_item;
 275
 276	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 277		return;
 278	if (bip)
 279		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 280	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 281}
 282
 283bool
 284xfs_btree_lblock_verify_crc(
 285	struct xfs_buf		*bp)
 286{
 287	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 288	struct xfs_mount	*mp = bp->b_mount;
 289
 290	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 291		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 292			return false;
 293		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 294	}
 295
 296	return true;
 297}
 298
 299/*
 300 * Calculate CRC on the whole btree block and stuff it into the
 301 * short-form btree header.
 302 *
 303 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 304 * it into the buffer so recovery knows what the last modification was that made
 305 * it to disk.
 306 */
 307void
 308xfs_btree_sblock_calc_crc(
 309	struct xfs_buf		*bp)
 310{
 311	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 312	struct xfs_buf_log_item	*bip = bp->b_log_item;
 313
 314	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 315		return;
 316	if (bip)
 317		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 318	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 319}
 320
 321bool
 322xfs_btree_sblock_verify_crc(
 323	struct xfs_buf		*bp)
 324{
 325	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 326	struct xfs_mount	*mp = bp->b_mount;
 327
 328	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 329		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 330			return false;
 331		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 332	}
 333
 334	return true;
 335}
 336
 337static int
 338xfs_btree_free_block(
 339	struct xfs_btree_cur	*cur,
 340	struct xfs_buf		*bp)
 341{
 342	int			error;
 343
 344	error = cur->bc_ops->free_block(cur, bp);
 345	if (!error) {
 346		xfs_trans_binval(cur->bc_tp, bp);
 347		XFS_BTREE_STATS_INC(cur, free);
 348	}
 349	return error;
 350}
 351
 352/*
 353 * Delete the btree cursor.
 354 */
 355void
 356xfs_btree_del_cursor(
 357	xfs_btree_cur_t	*cur,		/* btree cursor */
 358	int		error)		/* del because of error */
 359{
 360	int		i;		/* btree level */
 361
 362	/*
 363	 * Clear the buffer pointers, and release the buffers.
 364	 * If we're doing this in the face of an error, we
 365	 * need to make sure to inspect all of the entries
 366	 * in the bc_bufs array for buffers to be unlocked.
 367	 * This is because some of the btree code works from
 368	 * level n down to 0, and if we get an error along
 369	 * the way we won't have initialized all the entries
 370	 * down to 0.
 371	 */
 372	for (i = 0; i < cur->bc_nlevels; i++) {
 373		if (cur->bc_bufs[i])
 374			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 375		else if (!error)
 376			break;
 377	}
 378	/*
 379	 * Can't free a bmap cursor without having dealt with the
 380	 * allocated indirect blocks' accounting.
 381	 */
 382	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 383	       cur->bc_private.b.allocated == 0);
 384	/*
 385	 * Free the cursor.
 386	 */
 387	kmem_zone_free(xfs_btree_cur_zone, cur);
 388}
 389
 390/*
 391 * Duplicate the btree cursor.
 392 * Allocate a new one, copy the record, re-get the buffers.
 393 */
 394int					/* error */
 395xfs_btree_dup_cursor(
 396	xfs_btree_cur_t	*cur,		/* input cursor */
 397	xfs_btree_cur_t	**ncur)		/* output cursor */
 398{
 399	xfs_buf_t	*bp;		/* btree block's buffer pointer */
 400	int		error;		/* error return value */
 401	int		i;		/* level number of btree block */
 402	xfs_mount_t	*mp;		/* mount structure for filesystem */
 403	xfs_btree_cur_t	*new;		/* new cursor value */
 404	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
 405
 406	tp = cur->bc_tp;
 407	mp = cur->bc_mp;
 408
 409	/*
 410	 * Allocate a new cursor like the old one.
 411	 */
 412	new = cur->bc_ops->dup_cursor(cur);
 413
 414	/*
 415	 * Copy the record currently in the cursor.
 416	 */
 417	new->bc_rec = cur->bc_rec;
 418
 419	/*
 420	 * For each level current, re-get the buffer and copy the ptr value.
 421	 */
 422	for (i = 0; i < new->bc_nlevels; i++) {
 423		new->bc_ptrs[i] = cur->bc_ptrs[i];
 424		new->bc_ra[i] = cur->bc_ra[i];
 425		bp = cur->bc_bufs[i];
 426		if (bp) {
 427			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 428						   XFS_BUF_ADDR(bp), mp->m_bsize,
 429						   0, &bp,
 430						   cur->bc_ops->buf_ops);
 431			if (error) {
 432				xfs_btree_del_cursor(new, error);
 433				*ncur = NULL;
 434				return error;
 435			}
 436		}
 437		new->bc_bufs[i] = bp;
 438	}
 439	*ncur = new;
 440	return 0;
 441}
 442
 443/*
 444 * XFS btree block layout and addressing:
 445 *
 446 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 447 *
 448 * The leaf record start with a header then followed by records containing
 449 * the values.  A non-leaf block also starts with the same header, and
 450 * then first contains lookup keys followed by an equal number of pointers
 451 * to the btree blocks at the previous level.
 452 *
 453 *		+--------+-------+-------+-------+-------+-------+-------+
 454 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 455 *		+--------+-------+-------+-------+-------+-------+-------+
 456 *
 457 *		+--------+-------+-------+-------+-------+-------+-------+
 458 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 459 *		+--------+-------+-------+-------+-------+-------+-------+
 460 *
 461 * The header is called struct xfs_btree_block for reasons better left unknown
 462 * and comes in different versions for short (32bit) and long (64bit) block
 463 * pointers.  The record and key structures are defined by the btree instances
 464 * and opaque to the btree core.  The block pointers are simple disk endian
 465 * integers, available in a short (32bit) and long (64bit) variant.
 466 *
 467 * The helpers below calculate the offset of a given record, key or pointer
 468 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 469 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 470 * inside the btree block is done using indices starting at one, not zero!
 471 *
 472 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 473 * overlapping intervals.  In such a tree, records are still sorted lowest to
 474 * highest and indexed by the smallest key value that refers to the record.
 475 * However, nodes are different: each pointer has two associated keys -- one
 476 * indexing the lowest key available in the block(s) below (the same behavior
 477 * as the key in a regular btree) and another indexing the highest key
 478 * available in the block(s) below.  Because records are /not/ sorted by the
 479 * highest key, all leaf block updates require us to compute the highest key
 480 * that matches any record in the leaf and to recursively update the high keys
 481 * in the nodes going further up in the tree, if necessary.  Nodes look like
 482 * this:
 483 *
 484 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 485 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 486 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 487 *
 488 * To perform an interval query on an overlapped tree, perform the usual
 489 * depth-first search and use the low and high keys to decide if we can skip
 490 * that particular node.  If a leaf node is reached, return the records that
 491 * intersect the interval.  Note that an interval query may return numerous
 492 * entries.  For a non-overlapped tree, simply search for the record associated
 493 * with the lowest key and iterate forward until a non-matching record is
 494 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 495 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 496 * more detail.
 497 *
 498 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 499 * reverse mapping records on a reflink filesystem:
 500 *
 501 * 1: +- file A startblock B offset C length D -----------+
 502 * 2:      +- file E startblock F offset G length H --------------+
 503 * 3:      +- file I startblock F offset J length K --+
 504 * 4:                                                        +- file L... --+
 505 *
 506 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 507 * we'd simply increment the length of record 1.  But how do we find the record
 508 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 509 * record 3 because the keys are ordered first by startblock.  An interval
 510 * query would return records 1 and 2 because they both overlap (B+D-1), and
 511 * from that we can pick out record 1 as the appropriate left neighbor.
 512 *
 513 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 514 * because a record's interval must end before the next record.
 515 */
 516
 517/*
 518 * Return size of the btree block header for this btree instance.
 519 */
 520static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 521{
 522	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 523		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 524			return XFS_BTREE_LBLOCK_CRC_LEN;
 525		return XFS_BTREE_LBLOCK_LEN;
 526	}
 527	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 528		return XFS_BTREE_SBLOCK_CRC_LEN;
 529	return XFS_BTREE_SBLOCK_LEN;
 530}
 531
 532/*
 533 * Return size of btree block pointers for this btree instance.
 534 */
 535static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 536{
 537	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 538		sizeof(__be64) : sizeof(__be32);
 539}
 540
 541/*
 542 * Calculate offset of the n-th record in a btree block.
 543 */
 544STATIC size_t
 545xfs_btree_rec_offset(
 546	struct xfs_btree_cur	*cur,
 547	int			n)
 548{
 549	return xfs_btree_block_len(cur) +
 550		(n - 1) * cur->bc_ops->rec_len;
 551}
 552
 553/*
 554 * Calculate offset of the n-th key in a btree block.
 555 */
 556STATIC size_t
 557xfs_btree_key_offset(
 558	struct xfs_btree_cur	*cur,
 559	int			n)
 560{
 561	return xfs_btree_block_len(cur) +
 562		(n - 1) * cur->bc_ops->key_len;
 563}
 564
 565/*
 566 * Calculate offset of the n-th high key in a btree block.
 567 */
 568STATIC size_t
 569xfs_btree_high_key_offset(
 570	struct xfs_btree_cur	*cur,
 571	int			n)
 572{
 573	return xfs_btree_block_len(cur) +
 574		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 575}
 576
 577/*
 578 * Calculate offset of the n-th block pointer in a btree block.
 579 */
 580STATIC size_t
 581xfs_btree_ptr_offset(
 582	struct xfs_btree_cur	*cur,
 583	int			n,
 584	int			level)
 585{
 586	return xfs_btree_block_len(cur) +
 587		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 588		(n - 1) * xfs_btree_ptr_len(cur);
 589}
 590
 591/*
 592 * Return a pointer to the n-th record in the btree block.
 593 */
 594union xfs_btree_rec *
 595xfs_btree_rec_addr(
 596	struct xfs_btree_cur	*cur,
 597	int			n,
 598	struct xfs_btree_block	*block)
 599{
 600	return (union xfs_btree_rec *)
 601		((char *)block + xfs_btree_rec_offset(cur, n));
 602}
 603
 604/*
 605 * Return a pointer to the n-th key in the btree block.
 606 */
 607union xfs_btree_key *
 608xfs_btree_key_addr(
 609	struct xfs_btree_cur	*cur,
 610	int			n,
 611	struct xfs_btree_block	*block)
 612{
 613	return (union xfs_btree_key *)
 614		((char *)block + xfs_btree_key_offset(cur, n));
 615}
 616
 617/*
 618 * Return a pointer to the n-th high key in the btree block.
 619 */
 620union xfs_btree_key *
 621xfs_btree_high_key_addr(
 622	struct xfs_btree_cur	*cur,
 623	int			n,
 624	struct xfs_btree_block	*block)
 625{
 626	return (union xfs_btree_key *)
 627		((char *)block + xfs_btree_high_key_offset(cur, n));
 628}
 629
 630/*
 631 * Return a pointer to the n-th block pointer in the btree block.
 632 */
 633union xfs_btree_ptr *
 634xfs_btree_ptr_addr(
 635	struct xfs_btree_cur	*cur,
 636	int			n,
 637	struct xfs_btree_block	*block)
 638{
 639	int			level = xfs_btree_get_level(block);
 640
 641	ASSERT(block->bb_level != 0);
 642
 643	return (union xfs_btree_ptr *)
 644		((char *)block + xfs_btree_ptr_offset(cur, n, level));
 645}
 646
 647/*
 648 * Get the root block which is stored in the inode.
 649 *
 650 * For now this btree implementation assumes the btree root is always
 651 * stored in the if_broot field of an inode fork.
 652 */
 653STATIC struct xfs_btree_block *
 654xfs_btree_get_iroot(
 655	struct xfs_btree_cur	*cur)
 656{
 657	struct xfs_ifork	*ifp;
 658
 659	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 660	return (struct xfs_btree_block *)ifp->if_broot;
 661}
 662
 663/*
 664 * Retrieve the block pointer from the cursor at the given level.
 665 * This may be an inode btree root or from a buffer.
 666 */
 667struct xfs_btree_block *		/* generic btree block pointer */
 668xfs_btree_get_block(
 669	struct xfs_btree_cur	*cur,	/* btree cursor */
 670	int			level,	/* level in btree */
 671	struct xfs_buf		**bpp)	/* buffer containing the block */
 672{
 673	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 674	    (level == cur->bc_nlevels - 1)) {
 675		*bpp = NULL;
 676		return xfs_btree_get_iroot(cur);
 677	}
 678
 679	*bpp = cur->bc_bufs[level];
 680	return XFS_BUF_TO_BLOCK(*bpp);
 681}
 682
 683/*
 684 * Get a buffer for the block, return it with no data read.
 685 * Long-form addressing.
 686 */
 687xfs_buf_t *				/* buffer for fsbno */
 688xfs_btree_get_bufl(
 689	xfs_mount_t	*mp,		/* file system mount point */
 690	xfs_trans_t	*tp,		/* transaction pointer */
 691	xfs_fsblock_t	fsbno)		/* file system block number */
 
 692{
 693	xfs_daddr_t		d;		/* real disk block address */
 694
 695	ASSERT(fsbno != NULLFSBLOCK);
 696	d = XFS_FSB_TO_DADDR(mp, fsbno);
 697	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 698}
 699
 700/*
 701 * Get a buffer for the block, return it with no data read.
 702 * Short-form addressing.
 703 */
 704xfs_buf_t *				/* buffer for agno/agbno */
 705xfs_btree_get_bufs(
 706	xfs_mount_t	*mp,		/* file system mount point */
 707	xfs_trans_t	*tp,		/* transaction pointer */
 708	xfs_agnumber_t	agno,		/* allocation group number */
 709	xfs_agblock_t	agbno)		/* allocation group block number */
 
 710{
 711	xfs_daddr_t		d;		/* real disk block address */
 712
 713	ASSERT(agno != NULLAGNUMBER);
 714	ASSERT(agbno != NULLAGBLOCK);
 715	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 716	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 717}
 718
 719/*
 720 * Check for the cursor referring to the last block at the given level.
 721 */
 722int					/* 1=is last block, 0=not last block */
 723xfs_btree_islastblock(
 724	xfs_btree_cur_t		*cur,	/* btree cursor */
 725	int			level)	/* level to check */
 726{
 727	struct xfs_btree_block	*block;	/* generic btree block pointer */
 728	xfs_buf_t		*bp;	/* buffer containing block */
 729
 730	block = xfs_btree_get_block(cur, level, &bp);
 731	xfs_btree_check_block(cur, block, level, bp);
 732	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 733		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 734	else
 735		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 736}
 737
 738/*
 739 * Change the cursor to point to the first record at the given level.
 740 * Other levels are unaffected.
 741 */
 742STATIC int				/* success=1, failure=0 */
 743xfs_btree_firstrec(
 744	xfs_btree_cur_t		*cur,	/* btree cursor */
 745	int			level)	/* level to change */
 746{
 747	struct xfs_btree_block	*block;	/* generic btree block pointer */
 748	xfs_buf_t		*bp;	/* buffer containing block */
 749
 750	/*
 751	 * Get the block pointer for this level.
 752	 */
 753	block = xfs_btree_get_block(cur, level, &bp);
 754	if (xfs_btree_check_block(cur, block, level, bp))
 755		return 0;
 756	/*
 757	 * It's empty, there is no such record.
 758	 */
 759	if (!block->bb_numrecs)
 760		return 0;
 761	/*
 762	 * Set the ptr value to 1, that's the first record/key.
 763	 */
 764	cur->bc_ptrs[level] = 1;
 765	return 1;
 766}
 767
 768/*
 769 * Change the cursor to point to the last record in the current block
 770 * at the given level.  Other levels are unaffected.
 771 */
 772STATIC int				/* success=1, failure=0 */
 773xfs_btree_lastrec(
 774	xfs_btree_cur_t		*cur,	/* btree cursor */
 775	int			level)	/* level to change */
 776{
 777	struct xfs_btree_block	*block;	/* generic btree block pointer */
 778	xfs_buf_t		*bp;	/* buffer containing block */
 779
 780	/*
 781	 * Get the block pointer for this level.
 782	 */
 783	block = xfs_btree_get_block(cur, level, &bp);
 784	if (xfs_btree_check_block(cur, block, level, bp))
 785		return 0;
 786	/*
 787	 * It's empty, there is no such record.
 788	 */
 789	if (!block->bb_numrecs)
 790		return 0;
 791	/*
 792	 * Set the ptr value to numrecs, that's the last record/key.
 793	 */
 794	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 795	return 1;
 796}
 797
 798/*
 799 * Compute first and last byte offsets for the fields given.
 800 * Interprets the offsets table, which contains struct field offsets.
 801 */
 802void
 803xfs_btree_offsets(
 804	int64_t		fields,		/* bitmask of fields */
 805	const short	*offsets,	/* table of field offsets */
 806	int		nbits,		/* number of bits to inspect */
 807	int		*first,		/* output: first byte offset */
 808	int		*last)		/* output: last byte offset */
 809{
 810	int		i;		/* current bit number */
 811	int64_t		imask;		/* mask for current bit number */
 812
 813	ASSERT(fields != 0);
 814	/*
 815	 * Find the lowest bit, so the first byte offset.
 816	 */
 817	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 818		if (imask & fields) {
 819			*first = offsets[i];
 820			break;
 821		}
 822	}
 823	/*
 824	 * Find the highest bit, so the last byte offset.
 825	 */
 826	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 827		if (imask & fields) {
 828			*last = offsets[i + 1] - 1;
 829			break;
 830		}
 831	}
 832}
 833
 834/*
 835 * Get a buffer for the block, return it read in.
 836 * Long-form addressing.
 837 */
 838int
 839xfs_btree_read_bufl(
 840	struct xfs_mount	*mp,		/* file system mount point */
 841	struct xfs_trans	*tp,		/* transaction pointer */
 842	xfs_fsblock_t		fsbno,		/* file system block number */
 
 843	struct xfs_buf		**bpp,		/* buffer for fsbno */
 844	int			refval,		/* ref count value for buffer */
 845	const struct xfs_buf_ops *ops)
 846{
 847	struct xfs_buf		*bp;		/* return value */
 848	xfs_daddr_t		d;		/* real disk block address */
 849	int			error;
 850
 851	if (!xfs_verify_fsbno(mp, fsbno))
 852		return -EFSCORRUPTED;
 853	d = XFS_FSB_TO_DADDR(mp, fsbno);
 854	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 855				   mp->m_bsize, 0, &bp, ops);
 856	if (error)
 857		return error;
 858	if (bp)
 859		xfs_buf_set_ref(bp, refval);
 860	*bpp = bp;
 861	return 0;
 862}
 863
 864/*
 865 * Read-ahead the block, don't wait for it, don't return a buffer.
 866 * Long-form addressing.
 867 */
 868/* ARGSUSED */
 869void
 870xfs_btree_reada_bufl(
 871	struct xfs_mount	*mp,		/* file system mount point */
 872	xfs_fsblock_t		fsbno,		/* file system block number */
 873	xfs_extlen_t		count,		/* count of filesystem blocks */
 874	const struct xfs_buf_ops *ops)
 875{
 876	xfs_daddr_t		d;
 877
 878	ASSERT(fsbno != NULLFSBLOCK);
 879	d = XFS_FSB_TO_DADDR(mp, fsbno);
 880	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 881}
 882
 883/*
 884 * Read-ahead the block, don't wait for it, don't return a buffer.
 885 * Short-form addressing.
 886 */
 887/* ARGSUSED */
 888void
 889xfs_btree_reada_bufs(
 890	struct xfs_mount	*mp,		/* file system mount point */
 891	xfs_agnumber_t		agno,		/* allocation group number */
 892	xfs_agblock_t		agbno,		/* allocation group block number */
 893	xfs_extlen_t		count,		/* count of filesystem blocks */
 894	const struct xfs_buf_ops *ops)
 895{
 896	xfs_daddr_t		d;
 897
 898	ASSERT(agno != NULLAGNUMBER);
 899	ASSERT(agbno != NULLAGBLOCK);
 900	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 901	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 902}
 903
 904STATIC int
 905xfs_btree_readahead_lblock(
 906	struct xfs_btree_cur	*cur,
 907	int			lr,
 908	struct xfs_btree_block	*block)
 909{
 910	int			rval = 0;
 911	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 912	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 913
 914	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 915		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 916				     cur->bc_ops->buf_ops);
 917		rval++;
 918	}
 919
 920	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 921		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 922				     cur->bc_ops->buf_ops);
 923		rval++;
 924	}
 925
 926	return rval;
 927}
 928
 929STATIC int
 930xfs_btree_readahead_sblock(
 931	struct xfs_btree_cur	*cur,
 932	int			lr,
 933	struct xfs_btree_block *block)
 934{
 935	int			rval = 0;
 936	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 937	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 938
 939
 940	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 941		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 942				     left, 1, cur->bc_ops->buf_ops);
 943		rval++;
 944	}
 945
 946	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 947		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 948				     right, 1, cur->bc_ops->buf_ops);
 949		rval++;
 950	}
 951
 952	return rval;
 953}
 954
 955/*
 956 * Read-ahead btree blocks, at the given level.
 957 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 958 */
 959STATIC int
 960xfs_btree_readahead(
 961	struct xfs_btree_cur	*cur,		/* btree cursor */
 962	int			lev,		/* level in btree */
 963	int			lr)		/* left/right bits */
 964{
 965	struct xfs_btree_block	*block;
 966
 967	/*
 968	 * No readahead needed if we are at the root level and the
 969	 * btree root is stored in the inode.
 970	 */
 971	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 972	    (lev == cur->bc_nlevels - 1))
 973		return 0;
 974
 975	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 976		return 0;
 977
 978	cur->bc_ra[lev] |= lr;
 979	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 980
 981	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 982		return xfs_btree_readahead_lblock(cur, lr, block);
 983	return xfs_btree_readahead_sblock(cur, lr, block);
 984}
 985
 986STATIC int
 987xfs_btree_ptr_to_daddr(
 988	struct xfs_btree_cur	*cur,
 989	union xfs_btree_ptr	*ptr,
 990	xfs_daddr_t		*daddr)
 991{
 992	xfs_fsblock_t		fsbno;
 993	xfs_agblock_t		agbno;
 994	int			error;
 995
 996	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 997	if (error)
 998		return error;
 
 999
1000	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001		fsbno = be64_to_cpu(ptr->l);
1002		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
1003	} else {
1004		agbno = be32_to_cpu(ptr->s);
1005		*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1006				agbno);
1007	}
1008
1009	return 0;
1010}
1011
1012/*
1013 * Readahead @count btree blocks at the given @ptr location.
1014 *
1015 * We don't need to care about long or short form btrees here as we have a
1016 * method of converting the ptr directly to a daddr available to us.
1017 */
1018STATIC void
1019xfs_btree_readahead_ptr(
1020	struct xfs_btree_cur	*cur,
1021	union xfs_btree_ptr	*ptr,
1022	xfs_extlen_t		count)
1023{
1024	xfs_daddr_t		daddr;
1025
1026	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1027		return;
1028	xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
1029			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1030}
1031
1032/*
1033 * Set the buffer for level "lev" in the cursor to bp, releasing
1034 * any previous buffer.
1035 */
1036STATIC void
1037xfs_btree_setbuf(
1038	xfs_btree_cur_t		*cur,	/* btree cursor */
1039	int			lev,	/* level in btree */
1040	xfs_buf_t		*bp)	/* new buffer to set */
1041{
1042	struct xfs_btree_block	*b;	/* btree block */
1043
1044	if (cur->bc_bufs[lev])
1045		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1046	cur->bc_bufs[lev] = bp;
1047	cur->bc_ra[lev] = 0;
1048
1049	b = XFS_BUF_TO_BLOCK(bp);
1050	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1051		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1052			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1053		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1054			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1055	} else {
1056		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1057			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1058		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1059			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1060	}
1061}
1062
1063bool
1064xfs_btree_ptr_is_null(
1065	struct xfs_btree_cur	*cur,
1066	union xfs_btree_ptr	*ptr)
1067{
1068	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1069		return ptr->l == cpu_to_be64(NULLFSBLOCK);
1070	else
1071		return ptr->s == cpu_to_be32(NULLAGBLOCK);
1072}
1073
1074STATIC void
1075xfs_btree_set_ptr_null(
1076	struct xfs_btree_cur	*cur,
1077	union xfs_btree_ptr	*ptr)
1078{
1079	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1080		ptr->l = cpu_to_be64(NULLFSBLOCK);
1081	else
1082		ptr->s = cpu_to_be32(NULLAGBLOCK);
1083}
1084
1085/*
1086 * Get/set/init sibling pointers
1087 */
1088void
1089xfs_btree_get_sibling(
1090	struct xfs_btree_cur	*cur,
1091	struct xfs_btree_block	*block,
1092	union xfs_btree_ptr	*ptr,
1093	int			lr)
1094{
1095	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1096
1097	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1098		if (lr == XFS_BB_RIGHTSIB)
1099			ptr->l = block->bb_u.l.bb_rightsib;
1100		else
1101			ptr->l = block->bb_u.l.bb_leftsib;
1102	} else {
1103		if (lr == XFS_BB_RIGHTSIB)
1104			ptr->s = block->bb_u.s.bb_rightsib;
1105		else
1106			ptr->s = block->bb_u.s.bb_leftsib;
1107	}
1108}
1109
1110STATIC void
1111xfs_btree_set_sibling(
1112	struct xfs_btree_cur	*cur,
1113	struct xfs_btree_block	*block,
1114	union xfs_btree_ptr	*ptr,
1115	int			lr)
1116{
1117	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1118
1119	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1120		if (lr == XFS_BB_RIGHTSIB)
1121			block->bb_u.l.bb_rightsib = ptr->l;
1122		else
1123			block->bb_u.l.bb_leftsib = ptr->l;
1124	} else {
1125		if (lr == XFS_BB_RIGHTSIB)
1126			block->bb_u.s.bb_rightsib = ptr->s;
1127		else
1128			block->bb_u.s.bb_leftsib = ptr->s;
1129	}
1130}
1131
1132void
1133xfs_btree_init_block_int(
1134	struct xfs_mount	*mp,
1135	struct xfs_btree_block	*buf,
1136	xfs_daddr_t		blkno,
1137	xfs_btnum_t		btnum,
1138	__u16			level,
1139	__u16			numrecs,
1140	__u64			owner,
1141	unsigned int		flags)
1142{
1143	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
1144	__u32			magic = xfs_btree_magic(crc, btnum);
1145
1146	buf->bb_magic = cpu_to_be32(magic);
1147	buf->bb_level = cpu_to_be16(level);
1148	buf->bb_numrecs = cpu_to_be16(numrecs);
1149
1150	if (flags & XFS_BTREE_LONG_PTRS) {
1151		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1152		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1153		if (crc) {
1154			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1155			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1156			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1157			buf->bb_u.l.bb_pad = 0;
1158			buf->bb_u.l.bb_lsn = 0;
1159		}
1160	} else {
1161		/* owner is a 32 bit value on short blocks */
1162		__u32 __owner = (__u32)owner;
1163
1164		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1165		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1166		if (crc) {
1167			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1168			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1169			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1170			buf->bb_u.s.bb_lsn = 0;
1171		}
1172	}
1173}
1174
1175void
1176xfs_btree_init_block(
1177	struct xfs_mount *mp,
1178	struct xfs_buf	*bp,
1179	xfs_btnum_t	btnum,
1180	__u16		level,
1181	__u16		numrecs,
1182	__u64		owner)
 
1183{
1184	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1185				 btnum, level, numrecs, owner, 0);
1186}
1187
1188STATIC void
1189xfs_btree_init_block_cur(
1190	struct xfs_btree_cur	*cur,
1191	struct xfs_buf		*bp,
1192	int			level,
1193	int			numrecs)
1194{
1195	__u64			owner;
1196
1197	/*
1198	 * we can pull the owner from the cursor right now as the different
1199	 * owners align directly with the pointer size of the btree. This may
1200	 * change in future, but is safe for current users of the generic btree
1201	 * code.
1202	 */
1203	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1204		owner = cur->bc_private.b.ip->i_ino;
1205	else
1206		owner = cur->bc_private.a.agno;
1207
1208	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1209				 cur->bc_btnum, level, numrecs,
1210				 owner, cur->bc_flags);
1211}
1212
1213/*
1214 * Return true if ptr is the last record in the btree and
1215 * we need to track updates to this record.  The decision
1216 * will be further refined in the update_lastrec method.
1217 */
1218STATIC int
1219xfs_btree_is_lastrec(
1220	struct xfs_btree_cur	*cur,
1221	struct xfs_btree_block	*block,
1222	int			level)
1223{
1224	union xfs_btree_ptr	ptr;
1225
1226	if (level > 0)
1227		return 0;
1228	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1229		return 0;
1230
1231	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1232	if (!xfs_btree_ptr_is_null(cur, &ptr))
1233		return 0;
1234	return 1;
1235}
1236
1237STATIC void
1238xfs_btree_buf_to_ptr(
1239	struct xfs_btree_cur	*cur,
1240	struct xfs_buf		*bp,
1241	union xfs_btree_ptr	*ptr)
1242{
1243	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1244		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1245					XFS_BUF_ADDR(bp)));
1246	else {
1247		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1248					XFS_BUF_ADDR(bp)));
1249	}
1250}
1251
1252STATIC void
1253xfs_btree_set_refs(
1254	struct xfs_btree_cur	*cur,
1255	struct xfs_buf		*bp)
1256{
1257	switch (cur->bc_btnum) {
1258	case XFS_BTNUM_BNO:
1259	case XFS_BTNUM_CNT:
1260		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1261		break;
1262	case XFS_BTNUM_INO:
1263	case XFS_BTNUM_FINO:
1264		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1265		break;
1266	case XFS_BTNUM_BMAP:
1267		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1268		break;
1269	case XFS_BTNUM_RMAP:
1270		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1271		break;
1272	case XFS_BTNUM_REFC:
1273		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1274		break;
1275	default:
1276		ASSERT(0);
1277	}
1278}
1279
1280STATIC int
1281xfs_btree_get_buf_block(
1282	struct xfs_btree_cur	*cur,
1283	union xfs_btree_ptr	*ptr,
 
1284	struct xfs_btree_block	**block,
1285	struct xfs_buf		**bpp)
1286{
1287	struct xfs_mount	*mp = cur->bc_mp;
1288	xfs_daddr_t		d;
1289	int			error;
1290
1291	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1292	if (error)
1293		return error;
 
1294	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1295				 mp->m_bsize, 0);
1296
1297	if (!*bpp)
1298		return -ENOMEM;
1299
1300	(*bpp)->b_ops = cur->bc_ops->buf_ops;
1301	*block = XFS_BUF_TO_BLOCK(*bpp);
1302	return 0;
1303}
1304
1305/*
1306 * Read in the buffer at the given ptr and return the buffer and
1307 * the block pointer within the buffer.
1308 */
1309STATIC int
1310xfs_btree_read_buf_block(
1311	struct xfs_btree_cur	*cur,
1312	union xfs_btree_ptr	*ptr,
1313	int			flags,
1314	struct xfs_btree_block	**block,
1315	struct xfs_buf		**bpp)
1316{
1317	struct xfs_mount	*mp = cur->bc_mp;
1318	xfs_daddr_t		d;
1319	int			error;
1320
1321	/* need to sort out how callers deal with failures first */
1322	ASSERT(!(flags & XBF_TRYLOCK));
1323
1324	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1325	if (error)
1326		return error;
1327	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1328				   mp->m_bsize, flags, bpp,
1329				   cur->bc_ops->buf_ops);
1330	if (error)
1331		return error;
1332
1333	xfs_btree_set_refs(cur, *bpp);
1334	*block = XFS_BUF_TO_BLOCK(*bpp);
1335	return 0;
1336}
1337
1338/*
1339 * Copy keys from one btree block to another.
1340 */
1341STATIC void
1342xfs_btree_copy_keys(
1343	struct xfs_btree_cur	*cur,
1344	union xfs_btree_key	*dst_key,
1345	union xfs_btree_key	*src_key,
1346	int			numkeys)
1347{
1348	ASSERT(numkeys >= 0);
1349	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1350}
1351
1352/*
1353 * Copy records from one btree block to another.
1354 */
1355STATIC void
1356xfs_btree_copy_recs(
1357	struct xfs_btree_cur	*cur,
1358	union xfs_btree_rec	*dst_rec,
1359	union xfs_btree_rec	*src_rec,
1360	int			numrecs)
1361{
1362	ASSERT(numrecs >= 0);
1363	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1364}
1365
1366/*
1367 * Copy block pointers from one btree block to another.
1368 */
1369STATIC void
1370xfs_btree_copy_ptrs(
1371	struct xfs_btree_cur	*cur,
1372	union xfs_btree_ptr	*dst_ptr,
1373	union xfs_btree_ptr	*src_ptr,
1374	int			numptrs)
1375{
1376	ASSERT(numptrs >= 0);
1377	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1378}
1379
1380/*
1381 * Shift keys one index left/right inside a single btree block.
1382 */
1383STATIC void
1384xfs_btree_shift_keys(
1385	struct xfs_btree_cur	*cur,
1386	union xfs_btree_key	*key,
1387	int			dir,
1388	int			numkeys)
1389{
1390	char			*dst_key;
1391
1392	ASSERT(numkeys >= 0);
1393	ASSERT(dir == 1 || dir == -1);
1394
1395	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1396	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1397}
1398
1399/*
1400 * Shift records one index left/right inside a single btree block.
1401 */
1402STATIC void
1403xfs_btree_shift_recs(
1404	struct xfs_btree_cur	*cur,
1405	union xfs_btree_rec	*rec,
1406	int			dir,
1407	int			numrecs)
1408{
1409	char			*dst_rec;
1410
1411	ASSERT(numrecs >= 0);
1412	ASSERT(dir == 1 || dir == -1);
1413
1414	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1415	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1416}
1417
1418/*
1419 * Shift block pointers one index left/right inside a single btree block.
1420 */
1421STATIC void
1422xfs_btree_shift_ptrs(
1423	struct xfs_btree_cur	*cur,
1424	union xfs_btree_ptr	*ptr,
1425	int			dir,
1426	int			numptrs)
1427{
1428	char			*dst_ptr;
1429
1430	ASSERT(numptrs >= 0);
1431	ASSERT(dir == 1 || dir == -1);
1432
1433	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1434	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1435}
1436
1437/*
1438 * Log key values from the btree block.
1439 */
1440STATIC void
1441xfs_btree_log_keys(
1442	struct xfs_btree_cur	*cur,
1443	struct xfs_buf		*bp,
1444	int			first,
1445	int			last)
1446{
1447
1448	if (bp) {
1449		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1450		xfs_trans_log_buf(cur->bc_tp, bp,
1451				  xfs_btree_key_offset(cur, first),
1452				  xfs_btree_key_offset(cur, last + 1) - 1);
1453	} else {
1454		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1455				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1456	}
1457}
1458
1459/*
1460 * Log record values from the btree block.
1461 */
1462void
1463xfs_btree_log_recs(
1464	struct xfs_btree_cur	*cur,
1465	struct xfs_buf		*bp,
1466	int			first,
1467	int			last)
1468{
1469
1470	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1471	xfs_trans_log_buf(cur->bc_tp, bp,
1472			  xfs_btree_rec_offset(cur, first),
1473			  xfs_btree_rec_offset(cur, last + 1) - 1);
1474
1475}
1476
1477/*
1478 * Log block pointer fields from a btree block (nonleaf).
1479 */
1480STATIC void
1481xfs_btree_log_ptrs(
1482	struct xfs_btree_cur	*cur,	/* btree cursor */
1483	struct xfs_buf		*bp,	/* buffer containing btree block */
1484	int			first,	/* index of first pointer to log */
1485	int			last)	/* index of last pointer to log */
1486{
1487
1488	if (bp) {
1489		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
1490		int			level = xfs_btree_get_level(block);
1491
1492		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1493		xfs_trans_log_buf(cur->bc_tp, bp,
1494				xfs_btree_ptr_offset(cur, first, level),
1495				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1496	} else {
1497		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1498			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1499	}
1500
1501}
1502
1503/*
1504 * Log fields from a btree block header.
1505 */
1506void
1507xfs_btree_log_block(
1508	struct xfs_btree_cur	*cur,	/* btree cursor */
1509	struct xfs_buf		*bp,	/* buffer containing btree block */
1510	int			fields)	/* mask of fields: XFS_BB_... */
1511{
1512	int			first;	/* first byte offset logged */
1513	int			last;	/* last byte offset logged */
1514	static const short	soffsets[] = {	/* table of offsets (short) */
1515		offsetof(struct xfs_btree_block, bb_magic),
1516		offsetof(struct xfs_btree_block, bb_level),
1517		offsetof(struct xfs_btree_block, bb_numrecs),
1518		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1519		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1520		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1521		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1522		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1523		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1524		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1525		XFS_BTREE_SBLOCK_CRC_LEN
1526	};
1527	static const short	loffsets[] = {	/* table of offsets (long) */
1528		offsetof(struct xfs_btree_block, bb_magic),
1529		offsetof(struct xfs_btree_block, bb_level),
1530		offsetof(struct xfs_btree_block, bb_numrecs),
1531		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1532		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1533		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1534		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1535		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1536		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1537		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1538		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1539		XFS_BTREE_LBLOCK_CRC_LEN
1540	};
1541
1542	if (bp) {
1543		int nbits;
1544
1545		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1546			/*
1547			 * We don't log the CRC when updating a btree
1548			 * block but instead recreate it during log
1549			 * recovery.  As the log buffers have checksums
1550			 * of their own this is safe and avoids logging a crc
1551			 * update in a lot of places.
1552			 */
1553			if (fields == XFS_BB_ALL_BITS)
1554				fields = XFS_BB_ALL_BITS_CRC;
1555			nbits = XFS_BB_NUM_BITS_CRC;
1556		} else {
1557			nbits = XFS_BB_NUM_BITS;
1558		}
1559		xfs_btree_offsets(fields,
1560				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1561					loffsets : soffsets,
1562				  nbits, &first, &last);
1563		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1564		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1565	} else {
1566		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1567			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1568	}
1569}
1570
1571/*
1572 * Increment cursor by one record at the level.
1573 * For nonzero levels the leaf-ward information is untouched.
1574 */
1575int						/* error */
1576xfs_btree_increment(
1577	struct xfs_btree_cur	*cur,
1578	int			level,
1579	int			*stat)		/* success/failure */
1580{
1581	struct xfs_btree_block	*block;
1582	union xfs_btree_ptr	ptr;
1583	struct xfs_buf		*bp;
1584	int			error;		/* error return value */
1585	int			lev;
1586
1587	ASSERT(level < cur->bc_nlevels);
1588
1589	/* Read-ahead to the right at this level. */
1590	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1591
1592	/* Get a pointer to the btree block. */
1593	block = xfs_btree_get_block(cur, level, &bp);
1594
1595#ifdef DEBUG
1596	error = xfs_btree_check_block(cur, block, level, bp);
1597	if (error)
1598		goto error0;
1599#endif
1600
1601	/* We're done if we remain in the block after the increment. */
1602	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1603		goto out1;
1604
1605	/* Fail if we just went off the right edge of the tree. */
1606	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1607	if (xfs_btree_ptr_is_null(cur, &ptr))
1608		goto out0;
1609
1610	XFS_BTREE_STATS_INC(cur, increment);
1611
1612	/*
1613	 * March up the tree incrementing pointers.
1614	 * Stop when we don't go off the right edge of a block.
1615	 */
1616	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1617		block = xfs_btree_get_block(cur, lev, &bp);
1618
1619#ifdef DEBUG
1620		error = xfs_btree_check_block(cur, block, lev, bp);
1621		if (error)
1622			goto error0;
1623#endif
1624
1625		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1626			break;
1627
1628		/* Read-ahead the right block for the next loop. */
1629		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1630	}
1631
1632	/*
1633	 * If we went off the root then we are either seriously
1634	 * confused or have the tree root in an inode.
1635	 */
1636	if (lev == cur->bc_nlevels) {
1637		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1638			goto out0;
1639		ASSERT(0);
1640		error = -EFSCORRUPTED;
1641		goto error0;
1642	}
1643	ASSERT(lev < cur->bc_nlevels);
1644
1645	/*
1646	 * Now walk back down the tree, fixing up the cursor's buffer
1647	 * pointers and key numbers.
1648	 */
1649	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1650		union xfs_btree_ptr	*ptrp;
1651
1652		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1653		--lev;
1654		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1655		if (error)
1656			goto error0;
1657
1658		xfs_btree_setbuf(cur, lev, bp);
1659		cur->bc_ptrs[lev] = 1;
1660	}
1661out1:
1662	*stat = 1;
1663	return 0;
1664
1665out0:
1666	*stat = 0;
1667	return 0;
1668
1669error0:
1670	return error;
1671}
1672
1673/*
1674 * Decrement cursor by one record at the level.
1675 * For nonzero levels the leaf-ward information is untouched.
1676 */
1677int						/* error */
1678xfs_btree_decrement(
1679	struct xfs_btree_cur	*cur,
1680	int			level,
1681	int			*stat)		/* success/failure */
1682{
1683	struct xfs_btree_block	*block;
1684	xfs_buf_t		*bp;
1685	int			error;		/* error return value */
1686	int			lev;
1687	union xfs_btree_ptr	ptr;
1688
1689	ASSERT(level < cur->bc_nlevels);
1690
1691	/* Read-ahead to the left at this level. */
1692	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1693
1694	/* We're done if we remain in the block after the decrement. */
1695	if (--cur->bc_ptrs[level] > 0)
1696		goto out1;
1697
1698	/* Get a pointer to the btree block. */
1699	block = xfs_btree_get_block(cur, level, &bp);
1700
1701#ifdef DEBUG
1702	error = xfs_btree_check_block(cur, block, level, bp);
1703	if (error)
1704		goto error0;
1705#endif
1706
1707	/* Fail if we just went off the left edge of the tree. */
1708	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1709	if (xfs_btree_ptr_is_null(cur, &ptr))
1710		goto out0;
1711
1712	XFS_BTREE_STATS_INC(cur, decrement);
1713
1714	/*
1715	 * March up the tree decrementing pointers.
1716	 * Stop when we don't go off the left edge of a block.
1717	 */
1718	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1719		if (--cur->bc_ptrs[lev] > 0)
1720			break;
1721		/* Read-ahead the left block for the next loop. */
1722		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1723	}
1724
1725	/*
1726	 * If we went off the root then we are seriously confused.
1727	 * or the root of the tree is in an inode.
1728	 */
1729	if (lev == cur->bc_nlevels) {
1730		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1731			goto out0;
1732		ASSERT(0);
1733		error = -EFSCORRUPTED;
1734		goto error0;
1735	}
1736	ASSERT(lev < cur->bc_nlevels);
1737
1738	/*
1739	 * Now walk back down the tree, fixing up the cursor's buffer
1740	 * pointers and key numbers.
1741	 */
1742	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1743		union xfs_btree_ptr	*ptrp;
1744
1745		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1746		--lev;
1747		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1748		if (error)
1749			goto error0;
1750		xfs_btree_setbuf(cur, lev, bp);
1751		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1752	}
1753out1:
1754	*stat = 1;
1755	return 0;
1756
1757out0:
1758	*stat = 0;
1759	return 0;
1760
1761error0:
1762	return error;
1763}
1764
1765int
1766xfs_btree_lookup_get_block(
1767	struct xfs_btree_cur	*cur,	/* btree cursor */
1768	int			level,	/* level in the btree */
1769	union xfs_btree_ptr	*pp,	/* ptr to btree block */
1770	struct xfs_btree_block	**blkp) /* return btree block */
1771{
1772	struct xfs_buf		*bp;	/* buffer pointer for btree block */
1773	xfs_daddr_t		daddr;
1774	int			error = 0;
1775
1776	/* special case the root block if in an inode */
1777	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1778	    (level == cur->bc_nlevels - 1)) {
1779		*blkp = xfs_btree_get_iroot(cur);
1780		return 0;
1781	}
1782
1783	/*
1784	 * If the old buffer at this level for the disk address we are
1785	 * looking for re-use it.
1786	 *
1787	 * Otherwise throw it away and get a new one.
1788	 */
1789	bp = cur->bc_bufs[level];
1790	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1791	if (error)
1792		return error;
1793	if (bp && XFS_BUF_ADDR(bp) == daddr) {
1794		*blkp = XFS_BUF_TO_BLOCK(bp);
1795		return 0;
1796	}
1797
1798	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1799	if (error)
1800		return error;
1801
1802	/* Check the inode owner since the verifiers don't. */
1803	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1804	    !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1805	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1806	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1807			cur->bc_private.b.ip->i_ino)
1808		goto out_bad;
1809
1810	/* Did we get the level we were looking for? */
1811	if (be16_to_cpu((*blkp)->bb_level) != level)
1812		goto out_bad;
1813
1814	/* Check that internal nodes have at least one record. */
1815	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1816		goto out_bad;
1817
1818	xfs_btree_setbuf(cur, level, bp);
1819	return 0;
1820
1821out_bad:
1822	*blkp = NULL;
1823	xfs_trans_brelse(cur->bc_tp, bp);
1824	return -EFSCORRUPTED;
1825}
1826
1827/*
1828 * Get current search key.  For level 0 we don't actually have a key
1829 * structure so we make one up from the record.  For all other levels
1830 * we just return the right key.
1831 */
1832STATIC union xfs_btree_key *
1833xfs_lookup_get_search_key(
1834	struct xfs_btree_cur	*cur,
1835	int			level,
1836	int			keyno,
1837	struct xfs_btree_block	*block,
1838	union xfs_btree_key	*kp)
1839{
1840	if (level == 0) {
1841		cur->bc_ops->init_key_from_rec(kp,
1842				xfs_btree_rec_addr(cur, keyno, block));
1843		return kp;
1844	}
1845
1846	return xfs_btree_key_addr(cur, keyno, block);
1847}
1848
1849/*
1850 * Lookup the record.  The cursor is made to point to it, based on dir.
1851 * stat is set to 0 if can't find any such record, 1 for success.
1852 */
1853int					/* error */
1854xfs_btree_lookup(
1855	struct xfs_btree_cur	*cur,	/* btree cursor */
1856	xfs_lookup_t		dir,	/* <=, ==, or >= */
1857	int			*stat)	/* success/failure */
1858{
1859	struct xfs_btree_block	*block;	/* current btree block */
1860	int64_t			diff;	/* difference for the current key */
1861	int			error;	/* error return value */
1862	int			keyno;	/* current key number */
1863	int			level;	/* level in the btree */
1864	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1865	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1866
1867	XFS_BTREE_STATS_INC(cur, lookup);
1868
1869	/* No such thing as a zero-level tree. */
1870	if (cur->bc_nlevels == 0)
1871		return -EFSCORRUPTED;
1872
1873	block = NULL;
1874	keyno = 0;
1875
1876	/* initialise start pointer from cursor */
1877	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1878	pp = &ptr;
1879
1880	/*
1881	 * Iterate over each level in the btree, starting at the root.
1882	 * For each level above the leaves, find the key we need, based
1883	 * on the lookup record, then follow the corresponding block
1884	 * pointer down to the next level.
1885	 */
1886	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1887		/* Get the block we need to do the lookup on. */
1888		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1889		if (error)
1890			goto error0;
1891
1892		if (diff == 0) {
1893			/*
1894			 * If we already had a key match at a higher level, we
1895			 * know we need to use the first entry in this block.
1896			 */
1897			keyno = 1;
1898		} else {
1899			/* Otherwise search this block. Do a binary search. */
1900
1901			int	high;	/* high entry number */
1902			int	low;	/* low entry number */
1903
1904			/* Set low and high entry numbers, 1-based. */
1905			low = 1;
1906			high = xfs_btree_get_numrecs(block);
1907			if (!high) {
1908				/* Block is empty, must be an empty leaf. */
1909				if (level != 0 || cur->bc_nlevels != 1) {
1910					XFS_CORRUPTION_ERROR(__func__,
1911							XFS_ERRLEVEL_LOW,
1912							cur->bc_mp, block,
1913							sizeof(*block));
1914					return -EFSCORRUPTED;
1915				}
1916
1917				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1918				*stat = 0;
1919				return 0;
1920			}
1921
1922			/* Binary search the block. */
1923			while (low <= high) {
1924				union xfs_btree_key	key;
1925				union xfs_btree_key	*kp;
1926
1927				XFS_BTREE_STATS_INC(cur, compare);
1928
1929				/* keyno is average of low and high. */
1930				keyno = (low + high) >> 1;
1931
1932				/* Get current search key */
1933				kp = xfs_lookup_get_search_key(cur, level,
1934						keyno, block, &key);
1935
1936				/*
1937				 * Compute difference to get next direction:
1938				 *  - less than, move right
1939				 *  - greater than, move left
1940				 *  - equal, we're done
1941				 */
1942				diff = cur->bc_ops->key_diff(cur, kp);
1943				if (diff < 0)
1944					low = keyno + 1;
1945				else if (diff > 0)
1946					high = keyno - 1;
1947				else
1948					break;
1949			}
1950		}
1951
1952		/*
1953		 * If there are more levels, set up for the next level
1954		 * by getting the block number and filling in the cursor.
1955		 */
1956		if (level > 0) {
1957			/*
1958			 * If we moved left, need the previous key number,
1959			 * unless there isn't one.
1960			 */
1961			if (diff > 0 && --keyno < 1)
1962				keyno = 1;
1963			pp = xfs_btree_ptr_addr(cur, keyno, block);
1964
1965			error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
 
1966			if (error)
1967				goto error0;
1968
1969			cur->bc_ptrs[level] = keyno;
1970		}
1971	}
1972
1973	/* Done with the search. See if we need to adjust the results. */
1974	if (dir != XFS_LOOKUP_LE && diff < 0) {
1975		keyno++;
1976		/*
1977		 * If ge search and we went off the end of the block, but it's
1978		 * not the last block, we're in the wrong block.
1979		 */
1980		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1981		if (dir == XFS_LOOKUP_GE &&
1982		    keyno > xfs_btree_get_numrecs(block) &&
1983		    !xfs_btree_ptr_is_null(cur, &ptr)) {
1984			int	i;
1985
1986			cur->bc_ptrs[0] = keyno;
1987			error = xfs_btree_increment(cur, 0, &i);
1988			if (error)
1989				goto error0;
1990			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1991			*stat = 1;
1992			return 0;
1993		}
1994	} else if (dir == XFS_LOOKUP_LE && diff > 0)
1995		keyno--;
1996	cur->bc_ptrs[0] = keyno;
1997
1998	/* Return if we succeeded or not. */
1999	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
2000		*stat = 0;
2001	else if (dir != XFS_LOOKUP_EQ || diff == 0)
2002		*stat = 1;
2003	else
2004		*stat = 0;
2005	return 0;
2006
2007error0:
2008	return error;
2009}
2010
2011/* Find the high key storage area from a regular key. */
2012union xfs_btree_key *
2013xfs_btree_high_key_from_key(
2014	struct xfs_btree_cur	*cur,
2015	union xfs_btree_key	*key)
2016{
2017	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2018	return (union xfs_btree_key *)((char *)key +
2019			(cur->bc_ops->key_len / 2));
2020}
2021
2022/* Determine the low (and high if overlapped) keys of a leaf block */
2023STATIC void
2024xfs_btree_get_leaf_keys(
2025	struct xfs_btree_cur	*cur,
2026	struct xfs_btree_block	*block,
2027	union xfs_btree_key	*key)
2028{
2029	union xfs_btree_key	max_hkey;
2030	union xfs_btree_key	hkey;
2031	union xfs_btree_rec	*rec;
2032	union xfs_btree_key	*high;
2033	int			n;
2034
2035	rec = xfs_btree_rec_addr(cur, 1, block);
2036	cur->bc_ops->init_key_from_rec(key, rec);
2037
2038	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2039
2040		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2041		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2042			rec = xfs_btree_rec_addr(cur, n, block);
2043			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2044			if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2045					> 0)
2046				max_hkey = hkey;
2047		}
2048
2049		high = xfs_btree_high_key_from_key(cur, key);
2050		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2051	}
2052}
2053
2054/* Determine the low (and high if overlapped) keys of a node block */
2055STATIC void
2056xfs_btree_get_node_keys(
2057	struct xfs_btree_cur	*cur,
2058	struct xfs_btree_block	*block,
2059	union xfs_btree_key	*key)
2060{
2061	union xfs_btree_key	*hkey;
2062	union xfs_btree_key	*max_hkey;
2063	union xfs_btree_key	*high;
2064	int			n;
2065
2066	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2067		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2068				cur->bc_ops->key_len / 2);
2069
2070		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2071		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2072			hkey = xfs_btree_high_key_addr(cur, n, block);
2073			if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2074				max_hkey = hkey;
2075		}
2076
2077		high = xfs_btree_high_key_from_key(cur, key);
2078		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2079	} else {
2080		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2081				cur->bc_ops->key_len);
2082	}
2083}
2084
2085/* Derive the keys for any btree block. */
2086void
2087xfs_btree_get_keys(
2088	struct xfs_btree_cur	*cur,
2089	struct xfs_btree_block	*block,
2090	union xfs_btree_key	*key)
2091{
2092	if (be16_to_cpu(block->bb_level) == 0)
2093		xfs_btree_get_leaf_keys(cur, block, key);
2094	else
2095		xfs_btree_get_node_keys(cur, block, key);
2096}
2097
2098/*
2099 * Decide if we need to update the parent keys of a btree block.  For
2100 * a standard btree this is only necessary if we're updating the first
2101 * record/key.  For an overlapping btree, we must always update the
2102 * keys because the highest key can be in any of the records or keys
2103 * in the block.
2104 */
2105static inline bool
2106xfs_btree_needs_key_update(
2107	struct xfs_btree_cur	*cur,
2108	int			ptr)
2109{
2110	return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2111}
2112
2113/*
2114 * Update the low and high parent keys of the given level, progressing
2115 * towards the root.  If force_all is false, stop if the keys for a given
2116 * level do not need updating.
2117 */
2118STATIC int
2119__xfs_btree_updkeys(
2120	struct xfs_btree_cur	*cur,
2121	int			level,
2122	struct xfs_btree_block	*block,
2123	struct xfs_buf		*bp0,
2124	bool			force_all)
2125{
2126	union xfs_btree_key	key;	/* keys from current level */
2127	union xfs_btree_key	*lkey;	/* keys from the next level up */
2128	union xfs_btree_key	*hkey;
2129	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2130	union xfs_btree_key	*nhkey;
2131	struct xfs_buf		*bp;
2132	int			ptr;
2133
2134	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2135
2136	/* Exit if there aren't any parent levels to update. */
2137	if (level + 1 >= cur->bc_nlevels)
2138		return 0;
2139
2140	trace_xfs_btree_updkeys(cur, level, bp0);
2141
2142	lkey = &key;
2143	hkey = xfs_btree_high_key_from_key(cur, lkey);
2144	xfs_btree_get_keys(cur, block, lkey);
2145	for (level++; level < cur->bc_nlevels; level++) {
2146#ifdef DEBUG
2147		int		error;
2148#endif
2149		block = xfs_btree_get_block(cur, level, &bp);
2150		trace_xfs_btree_updkeys(cur, level, bp);
2151#ifdef DEBUG
2152		error = xfs_btree_check_block(cur, block, level, bp);
2153		if (error)
2154			return error;
2155#endif
2156		ptr = cur->bc_ptrs[level];
2157		nlkey = xfs_btree_key_addr(cur, ptr, block);
2158		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2159		if (!force_all &&
2160		    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2161		      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2162			break;
2163		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2164		xfs_btree_log_keys(cur, bp, ptr, ptr);
2165		if (level + 1 >= cur->bc_nlevels)
2166			break;
2167		xfs_btree_get_node_keys(cur, block, lkey);
2168	}
2169
2170	return 0;
2171}
2172
2173/* Update all the keys from some level in cursor back to the root. */
2174STATIC int
2175xfs_btree_updkeys_force(
2176	struct xfs_btree_cur	*cur,
2177	int			level)
2178{
2179	struct xfs_buf		*bp;
2180	struct xfs_btree_block	*block;
2181
2182	block = xfs_btree_get_block(cur, level, &bp);
2183	return __xfs_btree_updkeys(cur, level, block, bp, true);
2184}
2185
2186/*
2187 * Update the parent keys of the given level, progressing towards the root.
2188 */
2189STATIC int
2190xfs_btree_update_keys(
2191	struct xfs_btree_cur	*cur,
2192	int			level)
2193{
2194	struct xfs_btree_block	*block;
2195	struct xfs_buf		*bp;
2196	union xfs_btree_key	*kp;
2197	union xfs_btree_key	key;
2198	int			ptr;
2199
2200	ASSERT(level >= 0);
2201
2202	block = xfs_btree_get_block(cur, level, &bp);
2203	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2204		return __xfs_btree_updkeys(cur, level, block, bp, false);
2205
2206	/*
2207	 * Go up the tree from this level toward the root.
2208	 * At each level, update the key value to the value input.
2209	 * Stop when we reach a level where the cursor isn't pointing
2210	 * at the first entry in the block.
2211	 */
2212	xfs_btree_get_keys(cur, block, &key);
2213	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2214#ifdef DEBUG
2215		int		error;
2216#endif
2217		block = xfs_btree_get_block(cur, level, &bp);
2218#ifdef DEBUG
2219		error = xfs_btree_check_block(cur, block, level, bp);
2220		if (error)
2221			return error;
2222#endif
2223		ptr = cur->bc_ptrs[level];
2224		kp = xfs_btree_key_addr(cur, ptr, block);
2225		xfs_btree_copy_keys(cur, kp, &key, 1);
2226		xfs_btree_log_keys(cur, bp, ptr, ptr);
2227	}
2228
2229	return 0;
2230}
2231
2232/*
2233 * Update the record referred to by cur to the value in the
2234 * given record. This either works (return 0) or gets an
2235 * EFSCORRUPTED error.
2236 */
2237int
2238xfs_btree_update(
2239	struct xfs_btree_cur	*cur,
2240	union xfs_btree_rec	*rec)
2241{
2242	struct xfs_btree_block	*block;
2243	struct xfs_buf		*bp;
2244	int			error;
2245	int			ptr;
2246	union xfs_btree_rec	*rp;
2247
2248	/* Pick up the current block. */
2249	block = xfs_btree_get_block(cur, 0, &bp);
2250
2251#ifdef DEBUG
2252	error = xfs_btree_check_block(cur, block, 0, bp);
2253	if (error)
2254		goto error0;
2255#endif
2256	/* Get the address of the rec to be updated. */
2257	ptr = cur->bc_ptrs[0];
2258	rp = xfs_btree_rec_addr(cur, ptr, block);
2259
2260	/* Fill in the new contents and log them. */
2261	xfs_btree_copy_recs(cur, rp, rec, 1);
2262	xfs_btree_log_recs(cur, bp, ptr, ptr);
2263
2264	/*
2265	 * If we are tracking the last record in the tree and
2266	 * we are at the far right edge of the tree, update it.
2267	 */
2268	if (xfs_btree_is_lastrec(cur, block, 0)) {
2269		cur->bc_ops->update_lastrec(cur, block, rec,
2270					    ptr, LASTREC_UPDATE);
2271	}
2272
2273	/* Pass new key value up to our parent. */
2274	if (xfs_btree_needs_key_update(cur, ptr)) {
2275		error = xfs_btree_update_keys(cur, 0);
2276		if (error)
2277			goto error0;
2278	}
2279
2280	return 0;
2281
2282error0:
2283	return error;
2284}
2285
2286/*
2287 * Move 1 record left from cur/level if possible.
2288 * Update cur to reflect the new path.
2289 */
2290STATIC int					/* error */
2291xfs_btree_lshift(
2292	struct xfs_btree_cur	*cur,
2293	int			level,
2294	int			*stat)		/* success/failure */
2295{
2296	struct xfs_buf		*lbp;		/* left buffer pointer */
2297	struct xfs_btree_block	*left;		/* left btree block */
2298	int			lrecs;		/* left record count */
2299	struct xfs_buf		*rbp;		/* right buffer pointer */
2300	struct xfs_btree_block	*right;		/* right btree block */
2301	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2302	int			rrecs;		/* right record count */
2303	union xfs_btree_ptr	lptr;		/* left btree pointer */
2304	union xfs_btree_key	*rkp = NULL;	/* right btree key */
2305	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
2306	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
2307	int			error;		/* error return value */
2308	int			i;
2309
2310	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2311	    level == cur->bc_nlevels - 1)
2312		goto out0;
2313
2314	/* Set up variables for this block as "right". */
2315	right = xfs_btree_get_block(cur, level, &rbp);
2316
2317#ifdef DEBUG
2318	error = xfs_btree_check_block(cur, right, level, rbp);
2319	if (error)
2320		goto error0;
2321#endif
2322
2323	/* If we've got no left sibling then we can't shift an entry left. */
2324	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2325	if (xfs_btree_ptr_is_null(cur, &lptr))
2326		goto out0;
2327
2328	/*
2329	 * If the cursor entry is the one that would be moved, don't
2330	 * do it... it's too complicated.
2331	 */
2332	if (cur->bc_ptrs[level] <= 1)
2333		goto out0;
2334
2335	/* Set up the left neighbor as "left". */
2336	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2337	if (error)
2338		goto error0;
2339
2340	/* If it's full, it can't take another entry. */
2341	lrecs = xfs_btree_get_numrecs(left);
2342	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2343		goto out0;
2344
2345	rrecs = xfs_btree_get_numrecs(right);
2346
2347	/*
2348	 * We add one entry to the left side and remove one for the right side.
2349	 * Account for it here, the changes will be updated on disk and logged
2350	 * later.
2351	 */
2352	lrecs++;
2353	rrecs--;
2354
2355	XFS_BTREE_STATS_INC(cur, lshift);
2356	XFS_BTREE_STATS_ADD(cur, moves, 1);
2357
2358	/*
2359	 * If non-leaf, copy a key and a ptr to the left block.
2360	 * Log the changes to the left block.
2361	 */
2362	if (level > 0) {
2363		/* It's a non-leaf.  Move keys and pointers. */
2364		union xfs_btree_key	*lkp;	/* left btree key */
2365		union xfs_btree_ptr	*lpp;	/* left address pointer */
2366
2367		lkp = xfs_btree_key_addr(cur, lrecs, left);
2368		rkp = xfs_btree_key_addr(cur, 1, right);
2369
2370		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2371		rpp = xfs_btree_ptr_addr(cur, 1, right);
2372
2373		error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2374		if (error)
2375			goto error0;
2376
2377		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2378		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2379
2380		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2381		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2382
2383		ASSERT(cur->bc_ops->keys_inorder(cur,
2384			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2385	} else {
2386		/* It's a leaf.  Move records.  */
2387		union xfs_btree_rec	*lrp;	/* left record pointer */
2388
2389		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2390		rrp = xfs_btree_rec_addr(cur, 1, right);
2391
2392		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2393		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2394
2395		ASSERT(cur->bc_ops->recs_inorder(cur,
2396			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2397	}
2398
2399	xfs_btree_set_numrecs(left, lrecs);
2400	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2401
2402	xfs_btree_set_numrecs(right, rrecs);
2403	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2404
2405	/*
2406	 * Slide the contents of right down one entry.
2407	 */
2408	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2409	if (level > 0) {
2410		/* It's a nonleaf. operate on keys and ptrs */
 
2411		int			i;		/* loop index */
2412
2413		for (i = 0; i < rrecs; i++) {
2414			error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2415			if (error)
2416				goto error0;
2417		}
2418
2419		xfs_btree_shift_keys(cur,
2420				xfs_btree_key_addr(cur, 2, right),
2421				-1, rrecs);
2422		xfs_btree_shift_ptrs(cur,
2423				xfs_btree_ptr_addr(cur, 2, right),
2424				-1, rrecs);
2425
2426		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2427		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2428	} else {
2429		/* It's a leaf. operate on records */
2430		xfs_btree_shift_recs(cur,
2431			xfs_btree_rec_addr(cur, 2, right),
2432			-1, rrecs);
2433		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2434	}
2435
2436	/*
2437	 * Using a temporary cursor, update the parent key values of the
2438	 * block on the left.
2439	 */
2440	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2441		error = xfs_btree_dup_cursor(cur, &tcur);
2442		if (error)
2443			goto error0;
2444		i = xfs_btree_firstrec(tcur, level);
2445		XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2446
2447		error = xfs_btree_decrement(tcur, level, &i);
2448		if (error)
2449			goto error1;
2450
2451		/* Update the parent high keys of the left block, if needed. */
2452		error = xfs_btree_update_keys(tcur, level);
2453		if (error)
2454			goto error1;
2455
2456		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2457	}
2458
2459	/* Update the parent keys of the right block. */
2460	error = xfs_btree_update_keys(cur, level);
2461	if (error)
2462		goto error0;
2463
2464	/* Slide the cursor value left one. */
2465	cur->bc_ptrs[level]--;
2466
2467	*stat = 1;
2468	return 0;
2469
2470out0:
2471	*stat = 0;
2472	return 0;
2473
2474error0:
2475	return error;
2476
2477error1:
2478	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2479	return error;
2480}
2481
2482/*
2483 * Move 1 record right from cur/level if possible.
2484 * Update cur to reflect the new path.
2485 */
2486STATIC int					/* error */
2487xfs_btree_rshift(
2488	struct xfs_btree_cur	*cur,
2489	int			level,
2490	int			*stat)		/* success/failure */
2491{
2492	struct xfs_buf		*lbp;		/* left buffer pointer */
2493	struct xfs_btree_block	*left;		/* left btree block */
2494	struct xfs_buf		*rbp;		/* right buffer pointer */
2495	struct xfs_btree_block	*right;		/* right btree block */
2496	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2497	union xfs_btree_ptr	rptr;		/* right block pointer */
2498	union xfs_btree_key	*rkp;		/* right btree key */
2499	int			rrecs;		/* right record count */
2500	int			lrecs;		/* left record count */
2501	int			error;		/* error return value */
2502	int			i;		/* loop counter */
2503
2504	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2505	    (level == cur->bc_nlevels - 1))
2506		goto out0;
2507
2508	/* Set up variables for this block as "left". */
2509	left = xfs_btree_get_block(cur, level, &lbp);
2510
2511#ifdef DEBUG
2512	error = xfs_btree_check_block(cur, left, level, lbp);
2513	if (error)
2514		goto error0;
2515#endif
2516
2517	/* If we've got no right sibling then we can't shift an entry right. */
2518	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2519	if (xfs_btree_ptr_is_null(cur, &rptr))
2520		goto out0;
2521
2522	/*
2523	 * If the cursor entry is the one that would be moved, don't
2524	 * do it... it's too complicated.
2525	 */
2526	lrecs = xfs_btree_get_numrecs(left);
2527	if (cur->bc_ptrs[level] >= lrecs)
2528		goto out0;
2529
2530	/* Set up the right neighbor as "right". */
2531	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2532	if (error)
2533		goto error0;
2534
2535	/* If it's full, it can't take another entry. */
2536	rrecs = xfs_btree_get_numrecs(right);
2537	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2538		goto out0;
2539
2540	XFS_BTREE_STATS_INC(cur, rshift);
2541	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2542
2543	/*
2544	 * Make a hole at the start of the right neighbor block, then
2545	 * copy the last left block entry to the hole.
2546	 */
2547	if (level > 0) {
2548		/* It's a nonleaf. make a hole in the keys and ptrs */
2549		union xfs_btree_key	*lkp;
2550		union xfs_btree_ptr	*lpp;
2551		union xfs_btree_ptr	*rpp;
2552
2553		lkp = xfs_btree_key_addr(cur, lrecs, left);
2554		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2555		rkp = xfs_btree_key_addr(cur, 1, right);
2556		rpp = xfs_btree_ptr_addr(cur, 1, right);
2557
 
2558		for (i = rrecs - 1; i >= 0; i--) {
2559			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2560			if (error)
2561				goto error0;
2562		}
 
2563
2564		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2565		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2566
2567		error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
 
2568		if (error)
2569			goto error0;
 
2570
2571		/* Now put the new data in, and log it. */
2572		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2573		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2574
2575		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2576		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2577
2578		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2579			xfs_btree_key_addr(cur, 2, right)));
2580	} else {
2581		/* It's a leaf. make a hole in the records */
2582		union xfs_btree_rec	*lrp;
2583		union xfs_btree_rec	*rrp;
2584
2585		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2586		rrp = xfs_btree_rec_addr(cur, 1, right);
2587
2588		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2589
2590		/* Now put the new data in, and log it. */
2591		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2592		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2593	}
2594
2595	/*
2596	 * Decrement and log left's numrecs, bump and log right's numrecs.
2597	 */
2598	xfs_btree_set_numrecs(left, --lrecs);
2599	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2600
2601	xfs_btree_set_numrecs(right, ++rrecs);
2602	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2603
2604	/*
2605	 * Using a temporary cursor, update the parent key values of the
2606	 * block on the right.
2607	 */
2608	error = xfs_btree_dup_cursor(cur, &tcur);
2609	if (error)
2610		goto error0;
2611	i = xfs_btree_lastrec(tcur, level);
2612	XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2613
2614	error = xfs_btree_increment(tcur, level, &i);
2615	if (error)
2616		goto error1;
2617
2618	/* Update the parent high keys of the left block, if needed. */
2619	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2620		error = xfs_btree_update_keys(cur, level);
2621		if (error)
2622			goto error1;
2623	}
2624
2625	/* Update the parent keys of the right block. */
2626	error = xfs_btree_update_keys(tcur, level);
2627	if (error)
2628		goto error1;
2629
2630	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2631
2632	*stat = 1;
2633	return 0;
2634
2635out0:
2636	*stat = 0;
2637	return 0;
2638
2639error0:
2640	return error;
2641
2642error1:
2643	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2644	return error;
2645}
2646
2647/*
2648 * Split cur/level block in half.
2649 * Return new block number and the key to its first
2650 * record (to be inserted into parent).
2651 */
2652STATIC int					/* error */
2653__xfs_btree_split(
2654	struct xfs_btree_cur	*cur,
2655	int			level,
2656	union xfs_btree_ptr	*ptrp,
2657	union xfs_btree_key	*key,
2658	struct xfs_btree_cur	**curp,
2659	int			*stat)		/* success/failure */
2660{
2661	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2662	struct xfs_buf		*lbp;		/* left buffer pointer */
2663	struct xfs_btree_block	*left;		/* left btree block */
2664	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2665	struct xfs_buf		*rbp;		/* right buffer pointer */
2666	struct xfs_btree_block	*right;		/* right btree block */
2667	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2668	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2669	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2670	int			lrecs;
2671	int			rrecs;
2672	int			src_index;
2673	int			error;		/* error return value */
 
2674	int			i;
 
2675
2676	XFS_BTREE_STATS_INC(cur, split);
2677
2678	/* Set up left block (current one). */
2679	left = xfs_btree_get_block(cur, level, &lbp);
2680
2681#ifdef DEBUG
2682	error = xfs_btree_check_block(cur, left, level, lbp);
2683	if (error)
2684		goto error0;
2685#endif
2686
2687	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2688
2689	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2690	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2691	if (error)
2692		goto error0;
2693	if (*stat == 0)
2694		goto out0;
2695	XFS_BTREE_STATS_INC(cur, alloc);
2696
2697	/* Set up the new block as "right". */
2698	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2699	if (error)
2700		goto error0;
2701
2702	/* Fill in the btree header for the new right block. */
2703	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2704
2705	/*
2706	 * Split the entries between the old and the new block evenly.
2707	 * Make sure that if there's an odd number of entries now, that
2708	 * each new block will have the same number of entries.
2709	 */
2710	lrecs = xfs_btree_get_numrecs(left);
2711	rrecs = lrecs / 2;
2712	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2713		rrecs++;
2714	src_index = (lrecs - rrecs + 1);
2715
2716	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2717
2718	/* Adjust numrecs for the later get_*_keys() calls. */
2719	lrecs -= rrecs;
2720	xfs_btree_set_numrecs(left, lrecs);
2721	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2722
2723	/*
2724	 * Copy btree block entries from the left block over to the
2725	 * new block, the right. Update the right block and log the
2726	 * changes.
2727	 */
2728	if (level > 0) {
2729		/* It's a non-leaf.  Move keys and pointers. */
2730		union xfs_btree_key	*lkp;	/* left btree key */
2731		union xfs_btree_ptr	*lpp;	/* left address pointer */
2732		union xfs_btree_key	*rkp;	/* right btree key */
2733		union xfs_btree_ptr	*rpp;	/* right address pointer */
2734
2735		lkp = xfs_btree_key_addr(cur, src_index, left);
2736		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2737		rkp = xfs_btree_key_addr(cur, 1, right);
2738		rpp = xfs_btree_ptr_addr(cur, 1, right);
2739
 
2740		for (i = src_index; i < rrecs; i++) {
2741			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2742			if (error)
2743				goto error0;
2744		}
 
2745
2746		/* Copy the keys & pointers to the new block. */
2747		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2748		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2749
2750		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2751		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2752
2753		/* Stash the keys of the new block for later insertion. */
2754		xfs_btree_get_node_keys(cur, right, key);
2755	} else {
2756		/* It's a leaf.  Move records.  */
2757		union xfs_btree_rec	*lrp;	/* left record pointer */
2758		union xfs_btree_rec	*rrp;	/* right record pointer */
2759
2760		lrp = xfs_btree_rec_addr(cur, src_index, left);
2761		rrp = xfs_btree_rec_addr(cur, 1, right);
2762
2763		/* Copy records to the new block. */
2764		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2765		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2766
2767		/* Stash the keys of the new block for later insertion. */
2768		xfs_btree_get_leaf_keys(cur, right, key);
2769	}
2770
2771	/*
2772	 * Find the left block number by looking in the buffer.
2773	 * Adjust sibling pointers.
2774	 */
2775	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2776	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2777	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2778	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2779
2780	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2781	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2782
2783	/*
2784	 * If there's a block to the new block's right, make that block
2785	 * point back to right instead of to left.
2786	 */
2787	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2788		error = xfs_btree_read_buf_block(cur, &rrptr,
2789							0, &rrblock, &rrbp);
2790		if (error)
2791			goto error0;
2792		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2793		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2794	}
2795
2796	/* Update the parent high keys of the left block, if needed. */
2797	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2798		error = xfs_btree_update_keys(cur, level);
2799		if (error)
2800			goto error0;
2801	}
2802
2803	/*
2804	 * If the cursor is really in the right block, move it there.
2805	 * If it's just pointing past the last entry in left, then we'll
2806	 * insert there, so don't change anything in that case.
2807	 */
2808	if (cur->bc_ptrs[level] > lrecs + 1) {
2809		xfs_btree_setbuf(cur, level, rbp);
2810		cur->bc_ptrs[level] -= lrecs;
2811	}
2812	/*
2813	 * If there are more levels, we'll need another cursor which refers
2814	 * the right block, no matter where this cursor was.
2815	 */
2816	if (level + 1 < cur->bc_nlevels) {
2817		error = xfs_btree_dup_cursor(cur, curp);
2818		if (error)
2819			goto error0;
2820		(*curp)->bc_ptrs[level + 1]++;
2821	}
2822	*ptrp = rptr;
2823	*stat = 1;
2824	return 0;
2825out0:
2826	*stat = 0;
2827	return 0;
2828
2829error0:
2830	return error;
2831}
2832
2833struct xfs_btree_split_args {
2834	struct xfs_btree_cur	*cur;
2835	int			level;
2836	union xfs_btree_ptr	*ptrp;
2837	union xfs_btree_key	*key;
2838	struct xfs_btree_cur	**curp;
2839	int			*stat;		/* success/failure */
2840	int			result;
2841	bool			kswapd;	/* allocation in kswapd context */
2842	struct completion	*done;
2843	struct work_struct	work;
2844};
2845
2846/*
2847 * Stack switching interfaces for allocation
2848 */
2849static void
2850xfs_btree_split_worker(
2851	struct work_struct	*work)
2852{
2853	struct xfs_btree_split_args	*args = container_of(work,
2854						struct xfs_btree_split_args, work);
2855	unsigned long		pflags;
2856	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
2857
2858	/*
2859	 * we are in a transaction context here, but may also be doing work
2860	 * in kswapd context, and hence we may need to inherit that state
2861	 * temporarily to ensure that we don't block waiting for memory reclaim
2862	 * in any way.
2863	 */
2864	if (args->kswapd)
2865		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2866
2867	current_set_flags_nested(&pflags, new_pflags);
2868
2869	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2870					 args->key, args->curp, args->stat);
2871	complete(args->done);
2872
2873	current_restore_flags_nested(&pflags, new_pflags);
2874}
2875
2876/*
2877 * BMBT split requests often come in with little stack to work on. Push
2878 * them off to a worker thread so there is lots of stack to use. For the other
2879 * btree types, just call directly to avoid the context switch overhead here.
2880 */
2881STATIC int					/* error */
2882xfs_btree_split(
2883	struct xfs_btree_cur	*cur,
2884	int			level,
2885	union xfs_btree_ptr	*ptrp,
2886	union xfs_btree_key	*key,
2887	struct xfs_btree_cur	**curp,
2888	int			*stat)		/* success/failure */
2889{
2890	struct xfs_btree_split_args	args;
2891	DECLARE_COMPLETION_ONSTACK(done);
2892
2893	if (cur->bc_btnum != XFS_BTNUM_BMAP)
2894		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2895
2896	args.cur = cur;
2897	args.level = level;
2898	args.ptrp = ptrp;
2899	args.key = key;
2900	args.curp = curp;
2901	args.stat = stat;
2902	args.done = &done;
2903	args.kswapd = current_is_kswapd();
2904	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2905	queue_work(xfs_alloc_wq, &args.work);
2906	wait_for_completion(&done);
2907	destroy_work_on_stack(&args.work);
2908	return args.result;
2909}
2910
2911
2912/*
2913 * Copy the old inode root contents into a real block and make the
2914 * broot point to it.
2915 */
2916int						/* error */
2917xfs_btree_new_iroot(
2918	struct xfs_btree_cur	*cur,		/* btree cursor */
2919	int			*logflags,	/* logging flags for inode */
2920	int			*stat)		/* return status - 0 fail */
2921{
2922	struct xfs_buf		*cbp;		/* buffer for cblock */
2923	struct xfs_btree_block	*block;		/* btree block */
2924	struct xfs_btree_block	*cblock;	/* child btree block */
2925	union xfs_btree_key	*ckp;		/* child key pointer */
2926	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
2927	union xfs_btree_key	*kp;		/* pointer to btree key */
2928	union xfs_btree_ptr	*pp;		/* pointer to block addr */
2929	union xfs_btree_ptr	nptr;		/* new block addr */
2930	int			level;		/* btree level */
2931	int			error;		/* error return code */
 
2932	int			i;		/* loop counter */
 
2933
2934	XFS_BTREE_STATS_INC(cur, newroot);
2935
2936	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2937
2938	level = cur->bc_nlevels - 1;
2939
2940	block = xfs_btree_get_iroot(cur);
2941	pp = xfs_btree_ptr_addr(cur, 1, block);
2942
2943	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2944	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2945	if (error)
2946		goto error0;
2947	if (*stat == 0)
2948		return 0;
2949
2950	XFS_BTREE_STATS_INC(cur, alloc);
2951
2952	/* Copy the root into a real block. */
2953	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2954	if (error)
2955		goto error0;
2956
2957	/*
2958	 * we can't just memcpy() the root in for CRC enabled btree blocks.
2959	 * In that case have to also ensure the blkno remains correct
2960	 */
2961	memcpy(cblock, block, xfs_btree_block_len(cur));
2962	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2963		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2964			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2965		else
2966			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2967	}
2968
2969	be16_add_cpu(&block->bb_level, 1);
2970	xfs_btree_set_numrecs(block, 1);
2971	cur->bc_nlevels++;
2972	cur->bc_ptrs[level + 1] = 1;
2973
2974	kp = xfs_btree_key_addr(cur, 1, block);
2975	ckp = xfs_btree_key_addr(cur, 1, cblock);
2976	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2977
2978	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
 
2979	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2980		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2981		if (error)
2982			goto error0;
2983	}
2984
2985	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2986
2987	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
 
2988	if (error)
2989		goto error0;
2990
2991	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2992
2993	xfs_iroot_realloc(cur->bc_private.b.ip,
2994			  1 - xfs_btree_get_numrecs(cblock),
2995			  cur->bc_private.b.whichfork);
2996
2997	xfs_btree_setbuf(cur, level, cbp);
2998
2999	/*
3000	 * Do all this logging at the end so that
3001	 * the root is at the right level.
3002	 */
3003	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3004	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3005	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3006
3007	*logflags |=
3008		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3009	*stat = 1;
3010	return 0;
3011error0:
3012	return error;
3013}
3014
3015/*
3016 * Allocate a new root block, fill it in.
3017 */
3018STATIC int				/* error */
3019xfs_btree_new_root(
3020	struct xfs_btree_cur	*cur,	/* btree cursor */
3021	int			*stat)	/* success/failure */
3022{
3023	struct xfs_btree_block	*block;	/* one half of the old root block */
3024	struct xfs_buf		*bp;	/* buffer containing block */
3025	int			error;	/* error return value */
3026	struct xfs_buf		*lbp;	/* left buffer pointer */
3027	struct xfs_btree_block	*left;	/* left btree block */
3028	struct xfs_buf		*nbp;	/* new (root) buffer */
3029	struct xfs_btree_block	*new;	/* new (root) btree block */
3030	int			nptr;	/* new value for key index, 1 or 2 */
3031	struct xfs_buf		*rbp;	/* right buffer pointer */
3032	struct xfs_btree_block	*right;	/* right btree block */
3033	union xfs_btree_ptr	rptr;
3034	union xfs_btree_ptr	lptr;
3035
3036	XFS_BTREE_STATS_INC(cur, newroot);
3037
3038	/* initialise our start point from the cursor */
3039	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3040
3041	/* Allocate the new block. If we can't do it, we're toast. Give up. */
3042	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3043	if (error)
3044		goto error0;
3045	if (*stat == 0)
3046		goto out0;
3047	XFS_BTREE_STATS_INC(cur, alloc);
3048
3049	/* Set up the new block. */
3050	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3051	if (error)
3052		goto error0;
3053
3054	/* Set the root in the holding structure  increasing the level by 1. */
3055	cur->bc_ops->set_root(cur, &lptr, 1);
3056
3057	/*
3058	 * At the previous root level there are now two blocks: the old root,
3059	 * and the new block generated when it was split.  We don't know which
3060	 * one the cursor is pointing at, so we set up variables "left" and
3061	 * "right" for each case.
3062	 */
3063	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3064
3065#ifdef DEBUG
3066	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3067	if (error)
3068		goto error0;
3069#endif
3070
3071	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3072	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3073		/* Our block is left, pick up the right block. */
3074		lbp = bp;
3075		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3076		left = block;
3077		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3078		if (error)
3079			goto error0;
3080		bp = rbp;
3081		nptr = 1;
3082	} else {
3083		/* Our block is right, pick up the left block. */
3084		rbp = bp;
3085		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3086		right = block;
3087		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3088		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3089		if (error)
3090			goto error0;
3091		bp = lbp;
3092		nptr = 2;
3093	}
3094
3095	/* Fill in the new block's btree header and log it. */
3096	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3097	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3098	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3099			!xfs_btree_ptr_is_null(cur, &rptr));
3100
3101	/* Fill in the key data in the new root. */
3102	if (xfs_btree_get_level(left) > 0) {
3103		/*
3104		 * Get the keys for the left block's keys and put them directly
3105		 * in the parent block.  Do the same for the right block.
3106		 */
3107		xfs_btree_get_node_keys(cur, left,
3108				xfs_btree_key_addr(cur, 1, new));
3109		xfs_btree_get_node_keys(cur, right,
3110				xfs_btree_key_addr(cur, 2, new));
3111	} else {
3112		/*
3113		 * Get the keys for the left block's records and put them
3114		 * directly in the parent block.  Do the same for the right
3115		 * block.
3116		 */
3117		xfs_btree_get_leaf_keys(cur, left,
3118			xfs_btree_key_addr(cur, 1, new));
3119		xfs_btree_get_leaf_keys(cur, right,
3120			xfs_btree_key_addr(cur, 2, new));
3121	}
3122	xfs_btree_log_keys(cur, nbp, 1, 2);
3123
3124	/* Fill in the pointer data in the new root. */
3125	xfs_btree_copy_ptrs(cur,
3126		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3127	xfs_btree_copy_ptrs(cur,
3128		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3129	xfs_btree_log_ptrs(cur, nbp, 1, 2);
3130
3131	/* Fix up the cursor. */
3132	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3133	cur->bc_ptrs[cur->bc_nlevels] = nptr;
3134	cur->bc_nlevels++;
3135	*stat = 1;
3136	return 0;
3137error0:
3138	return error;
3139out0:
3140	*stat = 0;
3141	return 0;
3142}
3143
3144STATIC int
3145xfs_btree_make_block_unfull(
3146	struct xfs_btree_cur	*cur,	/* btree cursor */
3147	int			level,	/* btree level */
3148	int			numrecs,/* # of recs in block */
3149	int			*oindex,/* old tree index */
3150	int			*index,	/* new tree index */
3151	union xfs_btree_ptr	*nptr,	/* new btree ptr */
3152	struct xfs_btree_cur	**ncur,	/* new btree cursor */
3153	union xfs_btree_key	*key,	/* key of new block */
3154	int			*stat)
3155{
3156	int			error = 0;
3157
3158	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3159	    level == cur->bc_nlevels - 1) {
3160		struct xfs_inode *ip = cur->bc_private.b.ip;
3161
3162		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3163			/* A root block that can be made bigger. */
3164			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3165			*stat = 1;
3166		} else {
3167			/* A root block that needs replacing */
3168			int	logflags = 0;
3169
3170			error = xfs_btree_new_iroot(cur, &logflags, stat);
3171			if (error || *stat == 0)
3172				return error;
3173
3174			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3175		}
3176
3177		return 0;
3178	}
3179
3180	/* First, try shifting an entry to the right neighbor. */
3181	error = xfs_btree_rshift(cur, level, stat);
3182	if (error || *stat)
3183		return error;
3184
3185	/* Next, try shifting an entry to the left neighbor. */
3186	error = xfs_btree_lshift(cur, level, stat);
3187	if (error)
3188		return error;
3189
3190	if (*stat) {
3191		*oindex = *index = cur->bc_ptrs[level];
3192		return 0;
3193	}
3194
3195	/*
3196	 * Next, try splitting the current block in half.
3197	 *
3198	 * If this works we have to re-set our variables because we
3199	 * could be in a different block now.
3200	 */
3201	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3202	if (error || *stat == 0)
3203		return error;
3204
3205
3206	*index = cur->bc_ptrs[level];
3207	return 0;
3208}
3209
3210/*
3211 * Insert one record/level.  Return information to the caller
3212 * allowing the next level up to proceed if necessary.
3213 */
3214STATIC int
3215xfs_btree_insrec(
3216	struct xfs_btree_cur	*cur,	/* btree cursor */
3217	int			level,	/* level to insert record at */
3218	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
3219	union xfs_btree_rec	*rec,	/* record to insert */
3220	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
3221	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
3222	int			*stat)	/* success/failure */
3223{
3224	struct xfs_btree_block	*block;	/* btree block */
3225	struct xfs_buf		*bp;	/* buffer for block */
3226	union xfs_btree_ptr	nptr;	/* new block ptr */
3227	struct xfs_btree_cur	*ncur;	/* new btree cursor */
3228	union xfs_btree_key	nkey;	/* new block key */
3229	union xfs_btree_key	*lkey;
3230	int			optr;	/* old key/record index */
3231	int			ptr;	/* key/record index */
3232	int			numrecs;/* number of records */
3233	int			error;	/* error return value */
 
3234	int			i;
 
3235	xfs_daddr_t		old_bn;
3236
3237	ncur = NULL;
3238	lkey = &nkey;
3239
3240	/*
3241	 * If we have an external root pointer, and we've made it to the
3242	 * root level, allocate a new root block and we're done.
3243	 */
3244	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3245	    (level >= cur->bc_nlevels)) {
3246		error = xfs_btree_new_root(cur, stat);
3247		xfs_btree_set_ptr_null(cur, ptrp);
3248
3249		return error;
3250	}
3251
3252	/* If we're off the left edge, return failure. */
3253	ptr = cur->bc_ptrs[level];
3254	if (ptr == 0) {
3255		*stat = 0;
3256		return 0;
3257	}
3258
3259	optr = ptr;
3260
3261	XFS_BTREE_STATS_INC(cur, insrec);
3262
3263	/* Get pointers to the btree buffer and block. */
3264	block = xfs_btree_get_block(cur, level, &bp);
3265	old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3266	numrecs = xfs_btree_get_numrecs(block);
3267
3268#ifdef DEBUG
3269	error = xfs_btree_check_block(cur, block, level, bp);
3270	if (error)
3271		goto error0;
3272
3273	/* Check that the new entry is being inserted in the right place. */
3274	if (ptr <= numrecs) {
3275		if (level == 0) {
3276			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3277				xfs_btree_rec_addr(cur, ptr, block)));
3278		} else {
3279			ASSERT(cur->bc_ops->keys_inorder(cur, key,
3280				xfs_btree_key_addr(cur, ptr, block)));
3281		}
3282	}
3283#endif
3284
3285	/*
3286	 * If the block is full, we can't insert the new entry until we
3287	 * make the block un-full.
3288	 */
3289	xfs_btree_set_ptr_null(cur, &nptr);
3290	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3291		error = xfs_btree_make_block_unfull(cur, level, numrecs,
3292					&optr, &ptr, &nptr, &ncur, lkey, stat);
3293		if (error || *stat == 0)
3294			goto error0;
3295	}
3296
3297	/*
3298	 * The current block may have changed if the block was
3299	 * previously full and we have just made space in it.
3300	 */
3301	block = xfs_btree_get_block(cur, level, &bp);
3302	numrecs = xfs_btree_get_numrecs(block);
3303
3304#ifdef DEBUG
3305	error = xfs_btree_check_block(cur, block, level, bp);
3306	if (error)
3307		return error;
3308#endif
3309
3310	/*
3311	 * At this point we know there's room for our new entry in the block
3312	 * we're pointing at.
3313	 */
3314	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3315
3316	if (level > 0) {
3317		/* It's a nonleaf. make a hole in the keys and ptrs */
3318		union xfs_btree_key	*kp;
3319		union xfs_btree_ptr	*pp;
3320
3321		kp = xfs_btree_key_addr(cur, ptr, block);
3322		pp = xfs_btree_ptr_addr(cur, ptr, block);
3323
 
3324		for (i = numrecs - ptr; i >= 0; i--) {
3325			error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3326			if (error)
3327				return error;
3328		}
 
3329
3330		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3331		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3332
3333		error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
 
3334		if (error)
3335			goto error0;
 
3336
3337		/* Now put the new data in, bump numrecs and log it. */
3338		xfs_btree_copy_keys(cur, kp, key, 1);
3339		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3340		numrecs++;
3341		xfs_btree_set_numrecs(block, numrecs);
3342		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3343		xfs_btree_log_keys(cur, bp, ptr, numrecs);
3344#ifdef DEBUG
3345		if (ptr < numrecs) {
3346			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3347				xfs_btree_key_addr(cur, ptr + 1, block)));
3348		}
3349#endif
3350	} else {
3351		/* It's a leaf. make a hole in the records */
3352		union xfs_btree_rec             *rp;
3353
3354		rp = xfs_btree_rec_addr(cur, ptr, block);
3355
3356		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3357
3358		/* Now put the new data in, bump numrecs and log it. */
3359		xfs_btree_copy_recs(cur, rp, rec, 1);
3360		xfs_btree_set_numrecs(block, ++numrecs);
3361		xfs_btree_log_recs(cur, bp, ptr, numrecs);
3362#ifdef DEBUG
3363		if (ptr < numrecs) {
3364			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3365				xfs_btree_rec_addr(cur, ptr + 1, block)));
3366		}
3367#endif
3368	}
3369
3370	/* Log the new number of records in the btree header. */
3371	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3372
3373	/*
3374	 * If we just inserted into a new tree block, we have to
3375	 * recalculate nkey here because nkey is out of date.
3376	 *
3377	 * Otherwise we're just updating an existing block (having shoved
3378	 * some records into the new tree block), so use the regular key
3379	 * update mechanism.
3380	 */
3381	if (bp && bp->b_bn != old_bn) {
3382		xfs_btree_get_keys(cur, block, lkey);
3383	} else if (xfs_btree_needs_key_update(cur, optr)) {
3384		error = xfs_btree_update_keys(cur, level);
3385		if (error)
3386			goto error0;
3387	}
3388
3389	/*
3390	 * If we are tracking the last record in the tree and
3391	 * we are at the far right edge of the tree, update it.
3392	 */
3393	if (xfs_btree_is_lastrec(cur, block, level)) {
3394		cur->bc_ops->update_lastrec(cur, block, rec,
3395					    ptr, LASTREC_INSREC);
3396	}
3397
3398	/*
3399	 * Return the new block number, if any.
3400	 * If there is one, give back a record value and a cursor too.
3401	 */
3402	*ptrp = nptr;
3403	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3404		xfs_btree_copy_keys(cur, key, lkey, 1);
3405		*curp = ncur;
3406	}
3407
3408	*stat = 1;
3409	return 0;
3410
3411error0:
3412	return error;
3413}
3414
3415/*
3416 * Insert the record at the point referenced by cur.
3417 *
3418 * A multi-level split of the tree on insert will invalidate the original
3419 * cursor.  All callers of this function should assume that the cursor is
3420 * no longer valid and revalidate it.
3421 */
3422int
3423xfs_btree_insert(
3424	struct xfs_btree_cur	*cur,
3425	int			*stat)
3426{
3427	int			error;	/* error return value */
3428	int			i;	/* result value, 0 for failure */
3429	int			level;	/* current level number in btree */
3430	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3431	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3432	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3433	union xfs_btree_key	bkey;	/* key of block to insert */
3434	union xfs_btree_key	*key;
3435	union xfs_btree_rec	rec;	/* record to insert */
3436
3437	level = 0;
3438	ncur = NULL;
3439	pcur = cur;
3440	key = &bkey;
3441
3442	xfs_btree_set_ptr_null(cur, &nptr);
3443
3444	/* Make a key out of the record data to be inserted, and save it. */
3445	cur->bc_ops->init_rec_from_cur(cur, &rec);
3446	cur->bc_ops->init_key_from_rec(key, &rec);
3447
3448	/*
3449	 * Loop going up the tree, starting at the leaf level.
3450	 * Stop when we don't get a split block, that must mean that
3451	 * the insert is finished with this level.
3452	 */
3453	do {
3454		/*
3455		 * Insert nrec/nptr into this level of the tree.
3456		 * Note if we fail, nptr will be null.
3457		 */
3458		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3459				&ncur, &i);
3460		if (error) {
3461			if (pcur != cur)
3462				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3463			goto error0;
3464		}
3465
3466		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3467		level++;
3468
3469		/*
3470		 * See if the cursor we just used is trash.
3471		 * Can't trash the caller's cursor, but otherwise we should
3472		 * if ncur is a new cursor or we're about to be done.
3473		 */
3474		if (pcur != cur &&
3475		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3476			/* Save the state from the cursor before we trash it */
3477			if (cur->bc_ops->update_cursor)
3478				cur->bc_ops->update_cursor(pcur, cur);
3479			cur->bc_nlevels = pcur->bc_nlevels;
3480			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3481		}
3482		/* If we got a new cursor, switch to it. */
3483		if (ncur) {
3484			pcur = ncur;
3485			ncur = NULL;
3486		}
3487	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3488
3489	*stat = i;
3490	return 0;
3491error0:
3492	return error;
3493}
3494
3495/*
3496 * Try to merge a non-leaf block back into the inode root.
3497 *
3498 * Note: the killroot names comes from the fact that we're effectively
3499 * killing the old root block.  But because we can't just delete the
3500 * inode we have to copy the single block it was pointing to into the
3501 * inode.
3502 */
3503STATIC int
3504xfs_btree_kill_iroot(
3505	struct xfs_btree_cur	*cur)
3506{
3507	int			whichfork = cur->bc_private.b.whichfork;
3508	struct xfs_inode	*ip = cur->bc_private.b.ip;
3509	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
3510	struct xfs_btree_block	*block;
3511	struct xfs_btree_block	*cblock;
3512	union xfs_btree_key	*kp;
3513	union xfs_btree_key	*ckp;
3514	union xfs_btree_ptr	*pp;
3515	union xfs_btree_ptr	*cpp;
3516	struct xfs_buf		*cbp;
3517	int			level;
3518	int			index;
3519	int			numrecs;
3520	int			error;
3521#ifdef DEBUG
3522	union xfs_btree_ptr	ptr;
 
3523#endif
3524	int			i;
3525
3526	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3527	ASSERT(cur->bc_nlevels > 1);
3528
3529	/*
3530	 * Don't deal with the root block needs to be a leaf case.
3531	 * We're just going to turn the thing back into extents anyway.
3532	 */
3533	level = cur->bc_nlevels - 1;
3534	if (level == 1)
3535		goto out0;
3536
3537	/*
3538	 * Give up if the root has multiple children.
3539	 */
3540	block = xfs_btree_get_iroot(cur);
3541	if (xfs_btree_get_numrecs(block) != 1)
3542		goto out0;
3543
3544	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3545	numrecs = xfs_btree_get_numrecs(cblock);
3546
3547	/*
3548	 * Only do this if the next level will fit.
3549	 * Then the data must be copied up to the inode,
3550	 * instead of freeing the root you free the next level.
3551	 */
3552	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3553		goto out0;
3554
3555	XFS_BTREE_STATS_INC(cur, killroot);
3556
3557#ifdef DEBUG
3558	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3559	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3560	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3561	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3562#endif
3563
3564	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3565	if (index) {
3566		xfs_iroot_realloc(cur->bc_private.b.ip, index,
3567				  cur->bc_private.b.whichfork);
3568		block = ifp->if_broot;
3569	}
3570
3571	be16_add_cpu(&block->bb_numrecs, index);
3572	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3573
3574	kp = xfs_btree_key_addr(cur, 1, block);
3575	ckp = xfs_btree_key_addr(cur, 1, cblock);
3576	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3577
3578	pp = xfs_btree_ptr_addr(cur, 1, block);
3579	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3580
3581	for (i = 0; i < numrecs; i++) {
3582		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3583		if (error)
3584			return error;
3585	}
3586
3587	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3588
3589	error = xfs_btree_free_block(cur, cbp);
3590	if (error)
3591		return error;
3592
3593	cur->bc_bufs[level - 1] = NULL;
3594	be16_add_cpu(&block->bb_level, -1);
3595	xfs_trans_log_inode(cur->bc_tp, ip,
3596		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3597	cur->bc_nlevels--;
3598out0:
3599	return 0;
3600}
3601
3602/*
3603 * Kill the current root node, and replace it with it's only child node.
3604 */
3605STATIC int
3606xfs_btree_kill_root(
3607	struct xfs_btree_cur	*cur,
3608	struct xfs_buf		*bp,
3609	int			level,
3610	union xfs_btree_ptr	*newroot)
3611{
3612	int			error;
3613
3614	XFS_BTREE_STATS_INC(cur, killroot);
3615
3616	/*
3617	 * Update the root pointer, decreasing the level by 1 and then
3618	 * free the old root.
3619	 */
3620	cur->bc_ops->set_root(cur, newroot, -1);
3621
3622	error = xfs_btree_free_block(cur, bp);
3623	if (error)
3624		return error;
3625
3626	cur->bc_bufs[level] = NULL;
3627	cur->bc_ra[level] = 0;
3628	cur->bc_nlevels--;
3629
3630	return 0;
3631}
3632
3633STATIC int
3634xfs_btree_dec_cursor(
3635	struct xfs_btree_cur	*cur,
3636	int			level,
3637	int			*stat)
3638{
3639	int			error;
3640	int			i;
3641
3642	if (level > 0) {
3643		error = xfs_btree_decrement(cur, level, &i);
3644		if (error)
3645			return error;
3646	}
3647
3648	*stat = 1;
3649	return 0;
3650}
3651
3652/*
3653 * Single level of the btree record deletion routine.
3654 * Delete record pointed to by cur/level.
3655 * Remove the record from its block then rebalance the tree.
3656 * Return 0 for error, 1 for done, 2 to go on to the next level.
3657 */
3658STATIC int					/* error */
3659xfs_btree_delrec(
3660	struct xfs_btree_cur	*cur,		/* btree cursor */
3661	int			level,		/* level removing record from */
3662	int			*stat)		/* fail/done/go-on */
3663{
3664	struct xfs_btree_block	*block;		/* btree block */
3665	union xfs_btree_ptr	cptr;		/* current block ptr */
3666	struct xfs_buf		*bp;		/* buffer for block */
3667	int			error;		/* error return value */
3668	int			i;		/* loop counter */
3669	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
3670	struct xfs_buf		*lbp;		/* left buffer pointer */
3671	struct xfs_btree_block	*left;		/* left btree block */
3672	int			lrecs = 0;	/* left record count */
3673	int			ptr;		/* key/record index */
3674	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
3675	struct xfs_buf		*rbp;		/* right buffer pointer */
3676	struct xfs_btree_block	*right;		/* right btree block */
3677	struct xfs_btree_block	*rrblock;	/* right-right btree block */
3678	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
3679	int			rrecs = 0;	/* right record count */
3680	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
3681	int			numrecs;	/* temporary numrec count */
3682
3683	tcur = NULL;
3684
3685	/* Get the index of the entry being deleted, check for nothing there. */
3686	ptr = cur->bc_ptrs[level];
3687	if (ptr == 0) {
3688		*stat = 0;
3689		return 0;
3690	}
3691
3692	/* Get the buffer & block containing the record or key/ptr. */
3693	block = xfs_btree_get_block(cur, level, &bp);
3694	numrecs = xfs_btree_get_numrecs(block);
3695
3696#ifdef DEBUG
3697	error = xfs_btree_check_block(cur, block, level, bp);
3698	if (error)
3699		goto error0;
3700#endif
3701
3702	/* Fail if we're off the end of the block. */
3703	if (ptr > numrecs) {
3704		*stat = 0;
3705		return 0;
3706	}
3707
3708	XFS_BTREE_STATS_INC(cur, delrec);
3709	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3710
3711	/* Excise the entries being deleted. */
3712	if (level > 0) {
3713		/* It's a nonleaf. operate on keys and ptrs */
3714		union xfs_btree_key	*lkp;
3715		union xfs_btree_ptr	*lpp;
3716
3717		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3718		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3719
 
3720		for (i = 0; i < numrecs - ptr; i++) {
3721			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3722			if (error)
3723				goto error0;
3724		}
 
3725
3726		if (ptr < numrecs) {
3727			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3728			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3729			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3730			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3731		}
3732	} else {
3733		/* It's a leaf. operate on records */
3734		if (ptr < numrecs) {
3735			xfs_btree_shift_recs(cur,
3736				xfs_btree_rec_addr(cur, ptr + 1, block),
3737				-1, numrecs - ptr);
3738			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3739		}
3740	}
3741
3742	/*
3743	 * Decrement and log the number of entries in the block.
3744	 */
3745	xfs_btree_set_numrecs(block, --numrecs);
3746	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3747
3748	/*
3749	 * If we are tracking the last record in the tree and
3750	 * we are at the far right edge of the tree, update it.
3751	 */
3752	if (xfs_btree_is_lastrec(cur, block, level)) {
3753		cur->bc_ops->update_lastrec(cur, block, NULL,
3754					    ptr, LASTREC_DELREC);
3755	}
3756
3757	/*
3758	 * We're at the root level.  First, shrink the root block in-memory.
3759	 * Try to get rid of the next level down.  If we can't then there's
3760	 * nothing left to do.
3761	 */
3762	if (level == cur->bc_nlevels - 1) {
3763		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3764			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3765					  cur->bc_private.b.whichfork);
3766
3767			error = xfs_btree_kill_iroot(cur);
3768			if (error)
3769				goto error0;
3770
3771			error = xfs_btree_dec_cursor(cur, level, stat);
3772			if (error)
3773				goto error0;
3774			*stat = 1;
3775			return 0;
3776		}
3777
3778		/*
3779		 * If this is the root level, and there's only one entry left,
3780		 * and it's NOT the leaf level, then we can get rid of this
3781		 * level.
3782		 */
3783		if (numrecs == 1 && level > 0) {
3784			union xfs_btree_ptr	*pp;
3785			/*
3786			 * pp is still set to the first pointer in the block.
3787			 * Make it the new root of the btree.
3788			 */
3789			pp = xfs_btree_ptr_addr(cur, 1, block);
3790			error = xfs_btree_kill_root(cur, bp, level, pp);
3791			if (error)
3792				goto error0;
3793		} else if (level > 0) {
3794			error = xfs_btree_dec_cursor(cur, level, stat);
3795			if (error)
3796				goto error0;
3797		}
3798		*stat = 1;
3799		return 0;
3800	}
3801
3802	/*
3803	 * If we deleted the leftmost entry in the block, update the
3804	 * key values above us in the tree.
3805	 */
3806	if (xfs_btree_needs_key_update(cur, ptr)) {
3807		error = xfs_btree_update_keys(cur, level);
3808		if (error)
3809			goto error0;
3810	}
3811
3812	/*
3813	 * If the number of records remaining in the block is at least
3814	 * the minimum, we're done.
3815	 */
3816	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3817		error = xfs_btree_dec_cursor(cur, level, stat);
3818		if (error)
3819			goto error0;
3820		return 0;
3821	}
3822
3823	/*
3824	 * Otherwise, we have to move some records around to keep the
3825	 * tree balanced.  Look at the left and right sibling blocks to
3826	 * see if we can re-balance by moving only one record.
3827	 */
3828	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3829	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3830
3831	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3832		/*
3833		 * One child of root, need to get a chance to copy its contents
3834		 * into the root and delete it. Can't go up to next level,
3835		 * there's nothing to delete there.
3836		 */
3837		if (xfs_btree_ptr_is_null(cur, &rptr) &&
3838		    xfs_btree_ptr_is_null(cur, &lptr) &&
3839		    level == cur->bc_nlevels - 2) {
3840			error = xfs_btree_kill_iroot(cur);
3841			if (!error)
3842				error = xfs_btree_dec_cursor(cur, level, stat);
3843			if (error)
3844				goto error0;
3845			return 0;
3846		}
3847	}
3848
3849	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3850	       !xfs_btree_ptr_is_null(cur, &lptr));
3851
3852	/*
3853	 * Duplicate the cursor so our btree manipulations here won't
3854	 * disrupt the next level up.
3855	 */
3856	error = xfs_btree_dup_cursor(cur, &tcur);
3857	if (error)
3858		goto error0;
3859
3860	/*
3861	 * If there's a right sibling, see if it's ok to shift an entry
3862	 * out of it.
3863	 */
3864	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3865		/*
3866		 * Move the temp cursor to the last entry in the next block.
3867		 * Actually any entry but the first would suffice.
3868		 */
3869		i = xfs_btree_lastrec(tcur, level);
3870		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3871
3872		error = xfs_btree_increment(tcur, level, &i);
3873		if (error)
3874			goto error0;
3875		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3876
3877		i = xfs_btree_lastrec(tcur, level);
3878		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3879
3880		/* Grab a pointer to the block. */
3881		right = xfs_btree_get_block(tcur, level, &rbp);
3882#ifdef DEBUG
3883		error = xfs_btree_check_block(tcur, right, level, rbp);
3884		if (error)
3885			goto error0;
3886#endif
3887		/* Grab the current block number, for future use. */
3888		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3889
3890		/*
3891		 * If right block is full enough so that removing one entry
3892		 * won't make it too empty, and left-shifting an entry out
3893		 * of right to us works, we're done.
3894		 */
3895		if (xfs_btree_get_numrecs(right) - 1 >=
3896		    cur->bc_ops->get_minrecs(tcur, level)) {
3897			error = xfs_btree_lshift(tcur, level, &i);
3898			if (error)
3899				goto error0;
3900			if (i) {
3901				ASSERT(xfs_btree_get_numrecs(block) >=
3902				       cur->bc_ops->get_minrecs(tcur, level));
3903
3904				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3905				tcur = NULL;
3906
3907				error = xfs_btree_dec_cursor(cur, level, stat);
3908				if (error)
3909					goto error0;
3910				return 0;
3911			}
3912		}
3913
3914		/*
3915		 * Otherwise, grab the number of records in right for
3916		 * future reference, and fix up the temp cursor to point
3917		 * to our block again (last record).
3918		 */
3919		rrecs = xfs_btree_get_numrecs(right);
3920		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3921			i = xfs_btree_firstrec(tcur, level);
3922			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3923
3924			error = xfs_btree_decrement(tcur, level, &i);
3925			if (error)
3926				goto error0;
3927			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3928		}
3929	}
3930
3931	/*
3932	 * If there's a left sibling, see if it's ok to shift an entry
3933	 * out of it.
3934	 */
3935	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3936		/*
3937		 * Move the temp cursor to the first entry in the
3938		 * previous block.
3939		 */
3940		i = xfs_btree_firstrec(tcur, level);
3941		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3942
3943		error = xfs_btree_decrement(tcur, level, &i);
3944		if (error)
3945			goto error0;
3946		i = xfs_btree_firstrec(tcur, level);
3947		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3948
3949		/* Grab a pointer to the block. */
3950		left = xfs_btree_get_block(tcur, level, &lbp);
3951#ifdef DEBUG
3952		error = xfs_btree_check_block(cur, left, level, lbp);
3953		if (error)
3954			goto error0;
3955#endif
3956		/* Grab the current block number, for future use. */
3957		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3958
3959		/*
3960		 * If left block is full enough so that removing one entry
3961		 * won't make it too empty, and right-shifting an entry out
3962		 * of left to us works, we're done.
3963		 */
3964		if (xfs_btree_get_numrecs(left) - 1 >=
3965		    cur->bc_ops->get_minrecs(tcur, level)) {
3966			error = xfs_btree_rshift(tcur, level, &i);
3967			if (error)
3968				goto error0;
3969			if (i) {
3970				ASSERT(xfs_btree_get_numrecs(block) >=
3971				       cur->bc_ops->get_minrecs(tcur, level));
3972				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3973				tcur = NULL;
3974				if (level == 0)
3975					cur->bc_ptrs[0]++;
3976
3977				*stat = 1;
3978				return 0;
3979			}
3980		}
3981
3982		/*
3983		 * Otherwise, grab the number of records in right for
3984		 * future reference.
3985		 */
3986		lrecs = xfs_btree_get_numrecs(left);
3987	}
3988
3989	/* Delete the temp cursor, we're done with it. */
3990	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3991	tcur = NULL;
3992
3993	/* If here, we need to do a join to keep the tree balanced. */
3994	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3995
3996	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3997	    lrecs + xfs_btree_get_numrecs(block) <=
3998			cur->bc_ops->get_maxrecs(cur, level)) {
3999		/*
4000		 * Set "right" to be the starting block,
4001		 * "left" to be the left neighbor.
4002		 */
4003		rptr = cptr;
4004		right = block;
4005		rbp = bp;
4006		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4007		if (error)
4008			goto error0;
4009
4010	/*
4011	 * If that won't work, see if we can join with the right neighbor block.
4012	 */
4013	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4014		   rrecs + xfs_btree_get_numrecs(block) <=
4015			cur->bc_ops->get_maxrecs(cur, level)) {
4016		/*
4017		 * Set "left" to be the starting block,
4018		 * "right" to be the right neighbor.
4019		 */
4020		lptr = cptr;
4021		left = block;
4022		lbp = bp;
4023		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4024		if (error)
4025			goto error0;
4026
4027	/*
4028	 * Otherwise, we can't fix the imbalance.
4029	 * Just return.  This is probably a logic error, but it's not fatal.
4030	 */
4031	} else {
4032		error = xfs_btree_dec_cursor(cur, level, stat);
4033		if (error)
4034			goto error0;
4035		return 0;
4036	}
4037
4038	rrecs = xfs_btree_get_numrecs(right);
4039	lrecs = xfs_btree_get_numrecs(left);
4040
4041	/*
4042	 * We're now going to join "left" and "right" by moving all the stuff
4043	 * in "right" to "left" and deleting "right".
4044	 */
4045	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4046	if (level > 0) {
4047		/* It's a non-leaf.  Move keys and pointers. */
4048		union xfs_btree_key	*lkp;	/* left btree key */
4049		union xfs_btree_ptr	*lpp;	/* left address pointer */
4050		union xfs_btree_key	*rkp;	/* right btree key */
4051		union xfs_btree_ptr	*rpp;	/* right address pointer */
4052
4053		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4054		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4055		rkp = xfs_btree_key_addr(cur, 1, right);
4056		rpp = xfs_btree_ptr_addr(cur, 1, right);
4057
4058		for (i = 1; i < rrecs; i++) {
4059			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4060			if (error)
4061				goto error0;
4062		}
4063
4064		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4065		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4066
4067		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4068		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4069	} else {
4070		/* It's a leaf.  Move records.  */
4071		union xfs_btree_rec	*lrp;	/* left record pointer */
4072		union xfs_btree_rec	*rrp;	/* right record pointer */
4073
4074		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4075		rrp = xfs_btree_rec_addr(cur, 1, right);
4076
4077		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4078		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4079	}
4080
4081	XFS_BTREE_STATS_INC(cur, join);
4082
4083	/*
4084	 * Fix up the number of records and right block pointer in the
4085	 * surviving block, and log it.
4086	 */
4087	xfs_btree_set_numrecs(left, lrecs + rrecs);
4088	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4089	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4090	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4091
4092	/* If there is a right sibling, point it to the remaining block. */
4093	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4094	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4095		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4096		if (error)
4097			goto error0;
4098		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4099		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4100	}
4101
4102	/* Free the deleted block. */
4103	error = xfs_btree_free_block(cur, rbp);
4104	if (error)
4105		goto error0;
4106
4107	/*
4108	 * If we joined with the left neighbor, set the buffer in the
4109	 * cursor to the left block, and fix up the index.
4110	 */
4111	if (bp != lbp) {
4112		cur->bc_bufs[level] = lbp;
4113		cur->bc_ptrs[level] += lrecs;
4114		cur->bc_ra[level] = 0;
4115	}
4116	/*
4117	 * If we joined with the right neighbor and there's a level above
4118	 * us, increment the cursor at that level.
4119	 */
4120	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4121		   (level + 1 < cur->bc_nlevels)) {
4122		error = xfs_btree_increment(cur, level + 1, &i);
4123		if (error)
4124			goto error0;
4125	}
4126
4127	/*
4128	 * Readjust the ptr at this level if it's not a leaf, since it's
4129	 * still pointing at the deletion point, which makes the cursor
4130	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4131	 * We can't use decrement because it would change the next level up.
4132	 */
4133	if (level > 0)
4134		cur->bc_ptrs[level]--;
4135
4136	/*
4137	 * We combined blocks, so we have to update the parent keys if the
4138	 * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4139	 * points to the old block so that the caller knows which record to
4140	 * delete.  Therefore, the caller must be savvy enough to call updkeys
4141	 * for us if we return stat == 2.  The other exit points from this
4142	 * function don't require deletions further up the tree, so they can
4143	 * call updkeys directly.
4144	 */
4145
4146	/* Return value means the next level up has something to do. */
4147	*stat = 2;
4148	return 0;
4149
4150error0:
4151	if (tcur)
4152		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4153	return error;
4154}
4155
4156/*
4157 * Delete the record pointed to by cur.
4158 * The cursor refers to the place where the record was (could be inserted)
4159 * when the operation returns.
4160 */
4161int					/* error */
4162xfs_btree_delete(
4163	struct xfs_btree_cur	*cur,
4164	int			*stat)	/* success/failure */
4165{
4166	int			error;	/* error return value */
4167	int			level;
4168	int			i;
4169	bool			joined = false;
4170
4171	/*
4172	 * Go up the tree, starting at leaf level.
4173	 *
4174	 * If 2 is returned then a join was done; go to the next level.
4175	 * Otherwise we are done.
4176	 */
4177	for (level = 0, i = 2; i == 2; level++) {
4178		error = xfs_btree_delrec(cur, level, &i);
4179		if (error)
4180			goto error0;
4181		if (i == 2)
4182			joined = true;
4183	}
4184
4185	/*
4186	 * If we combined blocks as part of deleting the record, delrec won't
4187	 * have updated the parent high keys so we have to do that here.
4188	 */
4189	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4190		error = xfs_btree_updkeys_force(cur, 0);
4191		if (error)
4192			goto error0;
4193	}
4194
4195	if (i == 0) {
4196		for (level = 1; level < cur->bc_nlevels; level++) {
4197			if (cur->bc_ptrs[level] == 0) {
4198				error = xfs_btree_decrement(cur, level, &i);
4199				if (error)
4200					goto error0;
4201				break;
4202			}
4203		}
4204	}
4205
4206	*stat = i;
4207	return 0;
4208error0:
4209	return error;
4210}
4211
4212/*
4213 * Get the data from the pointed-to record.
4214 */
4215int					/* error */
4216xfs_btree_get_rec(
4217	struct xfs_btree_cur	*cur,	/* btree cursor */
4218	union xfs_btree_rec	**recp,	/* output: btree record */
4219	int			*stat)	/* output: success/failure */
4220{
4221	struct xfs_btree_block	*block;	/* btree block */
4222	struct xfs_buf		*bp;	/* buffer pointer */
4223	int			ptr;	/* record number */
4224#ifdef DEBUG
4225	int			error;	/* error return value */
4226#endif
4227
4228	ptr = cur->bc_ptrs[0];
4229	block = xfs_btree_get_block(cur, 0, &bp);
4230
4231#ifdef DEBUG
4232	error = xfs_btree_check_block(cur, block, 0, bp);
4233	if (error)
4234		return error;
4235#endif
4236
4237	/*
4238	 * Off the right end or left end, return failure.
4239	 */
4240	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4241		*stat = 0;
4242		return 0;
4243	}
4244
4245	/*
4246	 * Point to the record and extract its data.
4247	 */
4248	*recp = xfs_btree_rec_addr(cur, ptr, block);
4249	*stat = 1;
4250	return 0;
4251}
4252
4253/* Visit a block in a btree. */
4254STATIC int
4255xfs_btree_visit_block(
4256	struct xfs_btree_cur		*cur,
4257	int				level,
4258	xfs_btree_visit_blocks_fn	fn,
4259	void				*data)
4260{
4261	struct xfs_btree_block		*block;
4262	struct xfs_buf			*bp;
4263	union xfs_btree_ptr		rptr;
4264	int				error;
4265
4266	/* do right sibling readahead */
4267	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4268	block = xfs_btree_get_block(cur, level, &bp);
4269
4270	/* process the block */
4271	error = fn(cur, level, data);
4272	if (error)
4273		return error;
4274
4275	/* now read rh sibling block for next iteration */
4276	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4277	if (xfs_btree_ptr_is_null(cur, &rptr))
4278		return -ENOENT;
4279
4280	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4281}
4282
4283
4284/* Visit every block in a btree. */
4285int
4286xfs_btree_visit_blocks(
4287	struct xfs_btree_cur		*cur,
4288	xfs_btree_visit_blocks_fn	fn,
4289	void				*data)
4290{
4291	union xfs_btree_ptr		lptr;
4292	int				level;
4293	struct xfs_btree_block		*block = NULL;
4294	int				error = 0;
4295
4296	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4297
4298	/* for each level */
4299	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4300		/* grab the left hand block */
4301		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4302		if (error)
4303			return error;
4304
4305		/* readahead the left most block for the next level down */
4306		if (level > 0) {
4307			union xfs_btree_ptr     *ptr;
4308
4309			ptr = xfs_btree_ptr_addr(cur, 1, block);
4310			xfs_btree_readahead_ptr(cur, ptr, 1);
4311
4312			/* save for the next iteration of the loop */
4313			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4314		}
4315
4316		/* for each buffer in the level */
4317		do {
4318			error = xfs_btree_visit_block(cur, level, fn, data);
4319		} while (!error);
4320
4321		if (error != -ENOENT)
4322			return error;
4323	}
4324
4325	return 0;
4326}
4327
4328/*
4329 * Change the owner of a btree.
4330 *
4331 * The mechanism we use here is ordered buffer logging. Because we don't know
4332 * how many buffers were are going to need to modify, we don't really want to
4333 * have to make transaction reservations for the worst case of every buffer in a
4334 * full size btree as that may be more space that we can fit in the log....
4335 *
4336 * We do the btree walk in the most optimal manner possible - we have sibling
4337 * pointers so we can just walk all the blocks on each level from left to right
4338 * in a single pass, and then move to the next level and do the same. We can
4339 * also do readahead on the sibling pointers to get IO moving more quickly,
4340 * though for slow disks this is unlikely to make much difference to performance
4341 * as the amount of CPU work we have to do before moving to the next block is
4342 * relatively small.
4343 *
4344 * For each btree block that we load, modify the owner appropriately, set the
4345 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4346 * we mark the region we change dirty so that if the buffer is relogged in
4347 * a subsequent transaction the changes we make here as an ordered buffer are
4348 * correctly relogged in that transaction.  If we are in recovery context, then
4349 * just queue the modified buffer as delayed write buffer so the transaction
4350 * recovery completion writes the changes to disk.
4351 */
4352struct xfs_btree_block_change_owner_info {
4353	uint64_t		new_owner;
4354	struct list_head	*buffer_list;
4355};
4356
4357static int
4358xfs_btree_block_change_owner(
4359	struct xfs_btree_cur	*cur,
4360	int			level,
4361	void			*data)
4362{
4363	struct xfs_btree_block_change_owner_info	*bbcoi = data;
4364	struct xfs_btree_block	*block;
4365	struct xfs_buf		*bp;
4366
4367	/* modify the owner */
4368	block = xfs_btree_get_block(cur, level, &bp);
4369	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4370		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4371			return 0;
4372		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4373	} else {
4374		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4375			return 0;
4376		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4377	}
4378
4379	/*
4380	 * If the block is a root block hosted in an inode, we might not have a
4381	 * buffer pointer here and we shouldn't attempt to log the change as the
4382	 * information is already held in the inode and discarded when the root
4383	 * block is formatted into the on-disk inode fork. We still change it,
4384	 * though, so everything is consistent in memory.
4385	 */
4386	if (!bp) {
4387		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4388		ASSERT(level == cur->bc_nlevels - 1);
4389		return 0;
4390	}
4391
4392	if (cur->bc_tp) {
4393		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4394			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4395			return -EAGAIN;
4396		}
4397	} else {
4398		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4399	}
4400
4401	return 0;
4402}
4403
4404int
4405xfs_btree_change_owner(
4406	struct xfs_btree_cur	*cur,
4407	uint64_t		new_owner,
4408	struct list_head	*buffer_list)
4409{
4410	struct xfs_btree_block_change_owner_info	bbcoi;
4411
4412	bbcoi.new_owner = new_owner;
4413	bbcoi.buffer_list = buffer_list;
4414
4415	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4416			&bbcoi);
4417}
4418
4419/* Verify the v5 fields of a long-format btree block. */
4420xfs_failaddr_t
4421xfs_btree_lblock_v5hdr_verify(
4422	struct xfs_buf		*bp,
4423	uint64_t		owner)
4424{
4425	struct xfs_mount	*mp = bp->b_mount;
4426	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4427
4428	if (!xfs_sb_version_hascrc(&mp->m_sb))
4429		return __this_address;
4430	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4431		return __this_address;
4432	if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4433		return __this_address;
4434	if (owner != XFS_RMAP_OWN_UNKNOWN &&
4435	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4436		return __this_address;
4437	return NULL;
4438}
4439
4440/* Verify a long-format btree block. */
4441xfs_failaddr_t
4442xfs_btree_lblock_verify(
4443	struct xfs_buf		*bp,
4444	unsigned int		max_recs)
4445{
4446	struct xfs_mount	*mp = bp->b_mount;
4447	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4448
4449	/* numrecs verification */
4450	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4451		return __this_address;
4452
4453	/* sibling pointer verification */
4454	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4455	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4456		return __this_address;
4457	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4458	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4459		return __this_address;
4460
4461	return NULL;
4462}
4463
4464/**
4465 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4466 *				      btree block
4467 *
4468 * @bp: buffer containing the btree block
 
 
4469 */
4470xfs_failaddr_t
4471xfs_btree_sblock_v5hdr_verify(
4472	struct xfs_buf		*bp)
4473{
4474	struct xfs_mount	*mp = bp->b_mount;
4475	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4476	struct xfs_perag	*pag = bp->b_pag;
4477
4478	if (!xfs_sb_version_hascrc(&mp->m_sb))
4479		return __this_address;
4480	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4481		return __this_address;
4482	if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4483		return __this_address;
4484	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4485		return __this_address;
4486	return NULL;
4487}
4488
4489/**
4490 * xfs_btree_sblock_verify() -- verify a short-format btree block
4491 *
4492 * @bp: buffer containing the btree block
4493 * @max_recs: maximum records allowed in this btree node
4494 */
4495xfs_failaddr_t
4496xfs_btree_sblock_verify(
4497	struct xfs_buf		*bp,
4498	unsigned int		max_recs)
4499{
4500	struct xfs_mount	*mp = bp->b_mount;
4501	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4502	xfs_agblock_t		agno;
4503
4504	/* numrecs verification */
4505	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4506		return __this_address;
4507
4508	/* sibling pointer verification */
4509	agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4510	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4511	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4512		return __this_address;
4513	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4514	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4515		return __this_address;
4516
4517	return NULL;
4518}
4519
4520/*
4521 * Calculate the number of btree levels needed to store a given number of
4522 * records in a short-format btree.
4523 */
4524uint
4525xfs_btree_compute_maxlevels(
4526	uint			*limits,
4527	unsigned long		len)
4528{
4529	uint			level;
4530	unsigned long		maxblocks;
4531
4532	maxblocks = (len + limits[0] - 1) / limits[0];
4533	for (level = 1; maxblocks > 1; level++)
4534		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4535	return level;
4536}
4537
4538/*
4539 * Query a regular btree for all records overlapping a given interval.
4540 * Start with a LE lookup of the key of low_rec and return all records
4541 * until we find a record with a key greater than the key of high_rec.
4542 */
4543STATIC int
4544xfs_btree_simple_query_range(
4545	struct xfs_btree_cur		*cur,
4546	union xfs_btree_key		*low_key,
4547	union xfs_btree_key		*high_key,
4548	xfs_btree_query_range_fn	fn,
4549	void				*priv)
4550{
4551	union xfs_btree_rec		*recp;
4552	union xfs_btree_key		rec_key;
4553	int64_t				diff;
4554	int				stat;
4555	bool				firstrec = true;
4556	int				error;
4557
4558	ASSERT(cur->bc_ops->init_high_key_from_rec);
4559	ASSERT(cur->bc_ops->diff_two_keys);
4560
4561	/*
4562	 * Find the leftmost record.  The btree cursor must be set
4563	 * to the low record used to generate low_key.
4564	 */
4565	stat = 0;
4566	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4567	if (error)
4568		goto out;
4569
4570	/* Nothing?  See if there's anything to the right. */
4571	if (!stat) {
4572		error = xfs_btree_increment(cur, 0, &stat);
4573		if (error)
4574			goto out;
4575	}
4576
4577	while (stat) {
4578		/* Find the record. */
4579		error = xfs_btree_get_rec(cur, &recp, &stat);
4580		if (error || !stat)
4581			break;
4582
4583		/* Skip if high_key(rec) < low_key. */
4584		if (firstrec) {
4585			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4586			firstrec = false;
4587			diff = cur->bc_ops->diff_two_keys(cur, low_key,
4588					&rec_key);
4589			if (diff > 0)
4590				goto advloop;
4591		}
4592
4593		/* Stop if high_key < low_key(rec). */
4594		cur->bc_ops->init_key_from_rec(&rec_key, recp);
4595		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4596		if (diff > 0)
4597			break;
4598
4599		/* Callback */
4600		error = fn(cur, recp, priv);
4601		if (error)
4602			break;
4603
4604advloop:
4605		/* Move on to the next record. */
4606		error = xfs_btree_increment(cur, 0, &stat);
4607		if (error)
4608			break;
4609	}
4610
4611out:
4612	return error;
4613}
4614
4615/*
4616 * Query an overlapped interval btree for all records overlapping a given
4617 * interval.  This function roughly follows the algorithm given in
4618 * "Interval Trees" of _Introduction to Algorithms_, which is section
4619 * 14.3 in the 2nd and 3rd editions.
4620 *
4621 * First, generate keys for the low and high records passed in.
4622 *
4623 * For any leaf node, generate the high and low keys for the record.
4624 * If the record keys overlap with the query low/high keys, pass the
4625 * record to the function iterator.
4626 *
4627 * For any internal node, compare the low and high keys of each
4628 * pointer against the query low/high keys.  If there's an overlap,
4629 * follow the pointer.
4630 *
4631 * As an optimization, we stop scanning a block when we find a low key
4632 * that is greater than the query's high key.
4633 */
4634STATIC int
4635xfs_btree_overlapped_query_range(
4636	struct xfs_btree_cur		*cur,
4637	union xfs_btree_key		*low_key,
4638	union xfs_btree_key		*high_key,
4639	xfs_btree_query_range_fn	fn,
4640	void				*priv)
4641{
4642	union xfs_btree_ptr		ptr;
4643	union xfs_btree_ptr		*pp;
4644	union xfs_btree_key		rec_key;
4645	union xfs_btree_key		rec_hkey;
4646	union xfs_btree_key		*lkp;
4647	union xfs_btree_key		*hkp;
4648	union xfs_btree_rec		*recp;
4649	struct xfs_btree_block		*block;
4650	int64_t				ldiff;
4651	int64_t				hdiff;
4652	int				level;
4653	struct xfs_buf			*bp;
4654	int				i;
4655	int				error;
4656
4657	/* Load the root of the btree. */
4658	level = cur->bc_nlevels - 1;
4659	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4660	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4661	if (error)
4662		return error;
4663	xfs_btree_get_block(cur, level, &bp);
4664	trace_xfs_btree_overlapped_query_range(cur, level, bp);
4665#ifdef DEBUG
4666	error = xfs_btree_check_block(cur, block, level, bp);
4667	if (error)
4668		goto out;
4669#endif
4670	cur->bc_ptrs[level] = 1;
4671
4672	while (level < cur->bc_nlevels) {
4673		block = xfs_btree_get_block(cur, level, &bp);
4674
4675		/* End of node, pop back towards the root. */
4676		if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4677pop_up:
4678			if (level < cur->bc_nlevels - 1)
4679				cur->bc_ptrs[level + 1]++;
4680			level++;
4681			continue;
4682		}
4683
4684		if (level == 0) {
4685			/* Handle a leaf node. */
4686			recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4687
4688			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4689			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4690					low_key);
4691
4692			cur->bc_ops->init_key_from_rec(&rec_key, recp);
4693			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4694					&rec_key);
4695
4696			/*
4697			 * If (record's high key >= query's low key) and
4698			 *    (query's high key >= record's low key), then
4699			 * this record overlaps the query range; callback.
4700			 */
4701			if (ldiff >= 0 && hdiff >= 0) {
4702				error = fn(cur, recp, priv);
4703				if (error)
 
4704					break;
4705			} else if (hdiff < 0) {
4706				/* Record is larger than high key; pop. */
4707				goto pop_up;
4708			}
4709			cur->bc_ptrs[level]++;
4710			continue;
4711		}
4712
4713		/* Handle an internal node. */
4714		lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4715		hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4716		pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4717
4718		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4719		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4720
4721		/*
4722		 * If (pointer's high key >= query's low key) and
4723		 *    (query's high key >= pointer's low key), then
4724		 * this record overlaps the query range; follow pointer.
4725		 */
4726		if (ldiff >= 0 && hdiff >= 0) {
4727			level--;
4728			error = xfs_btree_lookup_get_block(cur, level, pp,
4729					&block);
4730			if (error)
4731				goto out;
4732			xfs_btree_get_block(cur, level, &bp);
4733			trace_xfs_btree_overlapped_query_range(cur, level, bp);
4734#ifdef DEBUG
4735			error = xfs_btree_check_block(cur, block, level, bp);
4736			if (error)
4737				goto out;
4738#endif
4739			cur->bc_ptrs[level] = 1;
4740			continue;
4741		} else if (hdiff < 0) {
4742			/* The low key is larger than the upper range; pop. */
4743			goto pop_up;
4744		}
4745		cur->bc_ptrs[level]++;
4746	}
4747
4748out:
4749	/*
4750	 * If we don't end this function with the cursor pointing at a record
4751	 * block, a subsequent non-error cursor deletion will not release
4752	 * node-level buffers, causing a buffer leak.  This is quite possible
4753	 * with a zero-results range query, so release the buffers if we
4754	 * failed to return any results.
4755	 */
4756	if (cur->bc_bufs[0] == NULL) {
4757		for (i = 0; i < cur->bc_nlevels; i++) {
4758			if (cur->bc_bufs[i]) {
4759				xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4760				cur->bc_bufs[i] = NULL;
4761				cur->bc_ptrs[i] = 0;
4762				cur->bc_ra[i] = 0;
4763			}
4764		}
4765	}
4766
4767	return error;
4768}
4769
4770/*
4771 * Query a btree for all records overlapping a given interval of keys.  The
4772 * supplied function will be called with each record found; return one of the
4773 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4774 * code.  This function returns -ECANCELED, zero, or a negative error code.
 
4775 */
4776int
4777xfs_btree_query_range(
4778	struct xfs_btree_cur		*cur,
4779	union xfs_btree_irec		*low_rec,
4780	union xfs_btree_irec		*high_rec,
4781	xfs_btree_query_range_fn	fn,
4782	void				*priv)
4783{
4784	union xfs_btree_rec		rec;
4785	union xfs_btree_key		low_key;
4786	union xfs_btree_key		high_key;
4787
4788	/* Find the keys of both ends of the interval. */
4789	cur->bc_rec = *high_rec;
4790	cur->bc_ops->init_rec_from_cur(cur, &rec);
4791	cur->bc_ops->init_key_from_rec(&high_key, &rec);
4792
4793	cur->bc_rec = *low_rec;
4794	cur->bc_ops->init_rec_from_cur(cur, &rec);
4795	cur->bc_ops->init_key_from_rec(&low_key, &rec);
4796
4797	/* Enforce low key < high key. */
4798	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4799		return -EINVAL;
4800
4801	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4802		return xfs_btree_simple_query_range(cur, &low_key,
4803				&high_key, fn, priv);
4804	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4805			fn, priv);
4806}
4807
4808/* Query a btree for all records. */
4809int
4810xfs_btree_query_all(
4811	struct xfs_btree_cur		*cur,
4812	xfs_btree_query_range_fn	fn,
4813	void				*priv)
4814{
4815	union xfs_btree_key		low_key;
4816	union xfs_btree_key		high_key;
4817
4818	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4819	memset(&low_key, 0, sizeof(low_key));
4820	memset(&high_key, 0xFF, sizeof(high_key));
4821
4822	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4823}
4824
4825/*
4826 * Calculate the number of blocks needed to store a given number of records
4827 * in a short-format (per-AG metadata) btree.
4828 */
4829unsigned long long
4830xfs_btree_calc_size(
4831	uint			*limits,
4832	unsigned long long	len)
4833{
4834	int			level;
4835	int			maxrecs;
4836	unsigned long long	rval;
4837
4838	maxrecs = limits[0];
4839	for (level = 0, rval = 0; len > 1; level++) {
4840		len += maxrecs - 1;
4841		do_div(len, maxrecs);
4842		maxrecs = limits[1];
4843		rval += len;
4844	}
4845	return rval;
4846}
4847
4848static int
4849xfs_btree_count_blocks_helper(
4850	struct xfs_btree_cur	*cur,
4851	int			level,
4852	void			*data)
4853{
4854	xfs_extlen_t		*blocks = data;
4855	(*blocks)++;
4856
4857	return 0;
4858}
4859
4860/* Count the blocks in a btree and return the result in *blocks. */
4861int
4862xfs_btree_count_blocks(
4863	struct xfs_btree_cur	*cur,
4864	xfs_extlen_t		*blocks)
4865{
4866	*blocks = 0;
4867	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4868			blocks);
4869}
4870
4871/* Compare two btree pointers. */
4872int64_t
4873xfs_btree_diff_two_ptrs(
4874	struct xfs_btree_cur		*cur,
4875	const union xfs_btree_ptr	*a,
4876	const union xfs_btree_ptr	*b)
4877{
4878	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4879		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4880	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4881}
4882
4883/* If there's an extent, we're done. */
4884STATIC int
4885xfs_btree_has_record_helper(
4886	struct xfs_btree_cur		*cur,
4887	union xfs_btree_rec		*rec,
4888	void				*priv)
4889{
4890	return -ECANCELED;
4891}
4892
4893/* Is there a record covering a given range of keys? */
4894int
4895xfs_btree_has_record(
4896	struct xfs_btree_cur	*cur,
4897	union xfs_btree_irec	*low,
4898	union xfs_btree_irec	*high,
4899	bool			*exists)
4900{
4901	int			error;
4902
4903	error = xfs_btree_query_range(cur, low, high,
4904			&xfs_btree_has_record_helper, NULL);
4905	if (error == -ECANCELED) {
4906		*exists = true;
4907		return 0;
4908	}
4909	*exists = false;
4910	return error;
4911}
4912
4913/* Are there more records in this btree? */
4914bool
4915xfs_btree_has_more_records(
4916	struct xfs_btree_cur	*cur)
4917{
4918	struct xfs_btree_block	*block;
4919	struct xfs_buf		*bp;
4920
4921	block = xfs_btree_get_block(cur, 0, &bp);
4922
4923	/* There are still records in this block. */
4924	if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4925		return true;
4926
4927	/* There are more record blocks. */
4928	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4929		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4930	else
4931		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4932}