Linux Audio

Check our new training course

Loading...
v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *   Copyright (C) International Business Machines Corp., 2000-2004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
   4 */
   5
   6/*
   7 *	jfs_dtree.c: directory B+-tree manager
   8 *
   9 * B+-tree with variable length key directory:
  10 *
  11 * each directory page is structured as an array of 32-byte
  12 * directory entry slots initialized as a freelist
  13 * to avoid search/compaction of free space at insertion.
  14 * when an entry is inserted, a number of slots are allocated
  15 * from the freelist as required to store variable length data
  16 * of the entry; when the entry is deleted, slots of the entry
  17 * are returned to freelist.
  18 *
  19 * leaf entry stores full name as key and file serial number
  20 * (aka inode number) as data.
  21 * internal/router entry stores sufffix compressed name
  22 * as key and simple extent descriptor as data.
  23 *
  24 * each directory page maintains a sorted entry index table
  25 * which stores the start slot index of sorted entries
  26 * to allow binary search on the table.
  27 *
  28 * directory starts as a root/leaf page in on-disk inode
  29 * inline data area.
  30 * when it becomes full, it starts a leaf of a external extent
  31 * of length of 1 block. each time the first leaf becomes full,
  32 * it is extended rather than split (its size is doubled),
  33 * until its length becoms 4 KBytes, from then the extent is split
  34 * with new 4 Kbyte extent when it becomes full
  35 * to reduce external fragmentation of small directories.
  36 *
  37 * blah, blah, blah, for linear scan of directory in pieces by
  38 * readdir().
  39 *
  40 *
  41 *	case-insensitive directory file system
  42 *
  43 * names are stored in case-sensitive way in leaf entry.
  44 * but stored, searched and compared in case-insensitive (uppercase) order
  45 * (i.e., both search key and entry key are folded for search/compare):
  46 * (note that case-sensitive order is BROKEN in storage, e.g.,
  47 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
  48 *
  49 *  entries which folds to the same key makes up a equivalent class
  50 *  whose members are stored as contiguous cluster (may cross page boundary)
  51 *  but whose order is arbitrary and acts as duplicate, e.g.,
  52 *  abc, Abc, aBc, abC)
  53 *
  54 * once match is found at leaf, requires scan forward/backward
  55 * either for, in case-insensitive search, duplicate
  56 * or for, in case-sensitive search, for exact match
  57 *
  58 * router entry must be created/stored in case-insensitive way
  59 * in internal entry:
  60 * (right most key of left page and left most key of right page
  61 * are folded, and its suffix compression is propagated as router
  62 * key in parent)
  63 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
  64 * should be made the router key for the split)
  65 *
  66 * case-insensitive search:
  67 *
  68 *	fold search key;
  69 *
  70 *	case-insensitive search of B-tree:
  71 *	for internal entry, router key is already folded;
  72 *	for leaf entry, fold the entry key before comparison.
  73 *
  74 *	if (leaf entry case-insensitive match found)
  75 *		if (next entry satisfies case-insensitive match)
  76 *			return EDUPLICATE;
  77 *		if (prev entry satisfies case-insensitive match)
  78 *			return EDUPLICATE;
  79 *		return match;
  80 *	else
  81 *		return no match;
  82 *
  83 *	serialization:
  84 * target directory inode lock is being held on entry/exit
  85 * of all main directory service routines.
  86 *
  87 *	log based recovery:
  88 */
  89
  90#include <linux/fs.h>
  91#include <linux/quotaops.h>
  92#include <linux/slab.h>
  93#include "jfs_incore.h"
  94#include "jfs_superblock.h"
  95#include "jfs_filsys.h"
  96#include "jfs_metapage.h"
  97#include "jfs_dmap.h"
  98#include "jfs_unicode.h"
  99#include "jfs_debug.h"
 100
 101/* dtree split parameter */
 102struct dtsplit {
 103	struct metapage *mp;
 104	s16 index;
 105	s16 nslot;
 106	struct component_name *key;
 107	ddata_t *data;
 108	struct pxdlist *pxdlist;
 109};
 110
 111#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
 112
 113/* get page buffer for specified block address */
 114#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
 115do {									\
 116	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot);	\
 117	if (!(RC)) {							\
 118		if (((P)->header.nextindex >				\
 119		     (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
 120		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) {	\
 121			BT_PUTPAGE(MP);					\
 122			jfs_error((IP)->i_sb,				\
 123				  "DT_GETPAGE: dtree page corrupt\n");	\
 124			MP = NULL;					\
 125			RC = -EIO;					\
 126		}							\
 127	}								\
 128} while (0)
 129
 130/* for consistency */
 131#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
 132
 133#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
 134	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
 135
 136/*
 137 * forward references
 138 */
 139static int dtSplitUp(tid_t tid, struct inode *ip,
 140		     struct dtsplit * split, struct btstack * btstack);
 141
 142static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 143		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
 144
 145static int dtExtendPage(tid_t tid, struct inode *ip,
 146			struct dtsplit * split, struct btstack * btstack);
 147
 148static int dtSplitRoot(tid_t tid, struct inode *ip,
 149		       struct dtsplit * split, struct metapage ** rmpp);
 150
 151static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
 152		      dtpage_t * fp, struct btstack * btstack);
 153
 154static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
 155
 156static int dtReadFirst(struct inode *ip, struct btstack * btstack);
 157
 158static int dtReadNext(struct inode *ip,
 159		      loff_t * offset, struct btstack * btstack);
 160
 161static int dtCompare(struct component_name * key, dtpage_t * p, int si);
 162
 163static int ciCompare(struct component_name * key, dtpage_t * p, int si,
 164		     int flag);
 165
 166static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
 167		     int flag);
 168
 169static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 170			      int ri, struct component_name * key, int flag);
 171
 172static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
 173			  ddata_t * data, struct dt_lock **);
 174
 175static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
 176			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
 177			int do_index);
 178
 179static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
 180
 181static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
 182
 183static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
 184
 185#define ciToUpper(c)	UniStrupr((c)->name)
 186
 187/*
 188 *	read_index_page()
 189 *
 190 *	Reads a page of a directory's index table.
 191 *	Having metadata mapped into the directory inode's address space
 192 *	presents a multitude of problems.  We avoid this by mapping to
 193 *	the absolute address space outside of the *_metapage routines
 194 */
 195static struct metapage *read_index_page(struct inode *inode, s64 blkno)
 196{
 197	int rc;
 198	s64 xaddr;
 199	int xflag;
 200	s32 xlen;
 201
 202	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 203	if (rc || (xaddr == 0))
 204		return NULL;
 205
 206	return read_metapage(inode, xaddr, PSIZE, 1);
 207}
 208
 209/*
 210 *	get_index_page()
 211 *
 212 *	Same as get_index_page(), but get's a new page without reading
 213 */
 214static struct metapage *get_index_page(struct inode *inode, s64 blkno)
 215{
 216	int rc;
 217	s64 xaddr;
 218	int xflag;
 219	s32 xlen;
 220
 221	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 222	if (rc || (xaddr == 0))
 223		return NULL;
 224
 225	return get_metapage(inode, xaddr, PSIZE, 1);
 226}
 227
 228/*
 229 *	find_index()
 230 *
 231 *	Returns dtree page containing directory table entry for specified
 232 *	index and pointer to its entry.
 233 *
 234 *	mp must be released by caller.
 235 */
 236static struct dir_table_slot *find_index(struct inode *ip, u32 index,
 237					 struct metapage ** mp, s64 *lblock)
 238{
 239	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 240	s64 blkno;
 241	s64 offset;
 242	int page_offset;
 243	struct dir_table_slot *slot;
 244	static int maxWarnings = 10;
 245
 246	if (index < 2) {
 247		if (maxWarnings) {
 248			jfs_warn("find_entry called with index = %d", index);
 249			maxWarnings--;
 250		}
 251		return NULL;
 252	}
 253
 254	if (index >= jfs_ip->next_index) {
 255		jfs_warn("find_entry called with index >= next_index");
 256		return NULL;
 257	}
 258
 259	if (jfs_dirtable_inline(ip)) {
 260		/*
 261		 * Inline directory table
 262		 */
 263		*mp = NULL;
 264		slot = &jfs_ip->i_dirtable[index - 2];
 265	} else {
 266		offset = (index - 2) * sizeof(struct dir_table_slot);
 267		page_offset = offset & (PSIZE - 1);
 268		blkno = ((offset + 1) >> L2PSIZE) <<
 269		    JFS_SBI(ip->i_sb)->l2nbperpage;
 270
 271		if (*mp && (*lblock != blkno)) {
 272			release_metapage(*mp);
 273			*mp = NULL;
 274		}
 275		if (!(*mp)) {
 276			*lblock = blkno;
 277			*mp = read_index_page(ip, blkno);
 278		}
 279		if (!(*mp)) {
 280			jfs_err("free_index: error reading directory table");
 281			return NULL;
 282		}
 283
 284		slot =
 285		    (struct dir_table_slot *) ((char *) (*mp)->data +
 286					       page_offset);
 287	}
 288	return slot;
 289}
 290
 291static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
 292			      u32 index)
 293{
 294	struct tlock *tlck;
 295	struct linelock *llck;
 296	struct lv *lv;
 297
 298	tlck = txLock(tid, ip, mp, tlckDATA);
 299	llck = (struct linelock *) tlck->lock;
 300
 301	if (llck->index >= llck->maxcnt)
 302		llck = txLinelock(llck);
 303	lv = &llck->lv[llck->index];
 304
 305	/*
 306	 *	Linelock slot size is twice the size of directory table
 307	 *	slot size.  512 entries per page.
 308	 */
 309	lv->offset = ((index - 2) & 511) >> 1;
 310	lv->length = 1;
 311	llck->index++;
 312}
 313
 314/*
 315 *	add_index()
 316 *
 317 *	Adds an entry to the directory index table.  This is used to provide
 318 *	each directory entry with a persistent index in which to resume
 319 *	directory traversals
 320 */
 321static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 322{
 323	struct super_block *sb = ip->i_sb;
 324	struct jfs_sb_info *sbi = JFS_SBI(sb);
 325	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 326	u64 blkno;
 327	struct dir_table_slot *dirtab_slot;
 328	u32 index;
 329	struct linelock *llck;
 330	struct lv *lv;
 331	struct metapage *mp;
 332	s64 offset;
 333	uint page_offset;
 334	struct tlock *tlck;
 335	s64 xaddr;
 336
 337	ASSERT(DO_INDEX(ip));
 338
 339	if (jfs_ip->next_index < 2) {
 340		jfs_warn("add_index: next_index = %d.  Resetting!",
 341			   jfs_ip->next_index);
 342		jfs_ip->next_index = 2;
 343	}
 344
 345	index = jfs_ip->next_index++;
 346
 347	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
 348		/*
 349		 * i_size reflects size of index table, or 8 bytes per entry.
 350		 */
 351		ip->i_size = (loff_t) (index - 1) << 3;
 352
 353		/*
 354		 * dir table fits inline within inode
 355		 */
 356		dirtab_slot = &jfs_ip->i_dirtable[index-2];
 357		dirtab_slot->flag = DIR_INDEX_VALID;
 358		dirtab_slot->slot = slot;
 359		DTSaddress(dirtab_slot, bn);
 360
 361		set_cflag(COMMIT_Dirtable, ip);
 362
 363		return index;
 364	}
 365	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
 366		struct dir_table_slot temp_table[12];
 367
 368		/*
 369		 * It's time to move the inline table to an external
 370		 * page and begin to build the xtree
 371		 */
 372		if (dquot_alloc_block(ip, sbi->nbperpage))
 373			goto clean_up;
 374		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
 375			dquot_free_block(ip, sbi->nbperpage);
 376			goto clean_up;
 377		}
 378
 379		/*
 380		 * Save the table, we're going to overwrite it with the
 381		 * xtree root
 382		 */
 383		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
 384
 385		/*
 386		 * Initialize empty x-tree
 387		 */
 388		xtInitRoot(tid, ip);
 389
 390		/*
 391		 * Add the first block to the xtree
 392		 */
 393		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
 394			/* This really shouldn't fail */
 395			jfs_warn("add_index: xtInsert failed!");
 396			memcpy(&jfs_ip->i_dirtable, temp_table,
 397			       sizeof (temp_table));
 398			dbFree(ip, xaddr, sbi->nbperpage);
 399			dquot_free_block(ip, sbi->nbperpage);
 400			goto clean_up;
 401		}
 402		ip->i_size = PSIZE;
 403
 404		mp = get_index_page(ip, 0);
 405		if (!mp) {
 406			jfs_err("add_index: get_metapage failed!");
 407			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
 408			memcpy(&jfs_ip->i_dirtable, temp_table,
 409			       sizeof (temp_table));
 410			goto clean_up;
 411		}
 412		tlck = txLock(tid, ip, mp, tlckDATA);
 413		llck = (struct linelock *) & tlck->lock;
 414		ASSERT(llck->index == 0);
 415		lv = &llck->lv[0];
 416
 417		lv->offset = 0;
 418		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
 419		llck->index++;
 420
 421		memcpy(mp->data, temp_table, sizeof(temp_table));
 422
 423		mark_metapage_dirty(mp);
 424		release_metapage(mp);
 425
 426		/*
 427		 * Logging is now directed by xtree tlocks
 428		 */
 429		clear_cflag(COMMIT_Dirtable, ip);
 430	}
 431
 432	offset = (index - 2) * sizeof(struct dir_table_slot);
 433	page_offset = offset & (PSIZE - 1);
 434	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
 435	if (page_offset == 0) {
 436		/*
 437		 * This will be the beginning of a new page
 438		 */
 439		xaddr = 0;
 440		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
 441			jfs_warn("add_index: xtInsert failed!");
 442			goto clean_up;
 443		}
 444		ip->i_size += PSIZE;
 445
 446		if ((mp = get_index_page(ip, blkno)))
 447			memset(mp->data, 0, PSIZE);	/* Just looks better */
 448		else
 449			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
 450	} else
 451		mp = read_index_page(ip, blkno);
 452
 453	if (!mp) {
 454		jfs_err("add_index: get/read_metapage failed!");
 455		goto clean_up;
 456	}
 457
 458	lock_index(tid, ip, mp, index);
 459
 460	dirtab_slot =
 461	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
 462	dirtab_slot->flag = DIR_INDEX_VALID;
 463	dirtab_slot->slot = slot;
 464	DTSaddress(dirtab_slot, bn);
 465
 466	mark_metapage_dirty(mp);
 467	release_metapage(mp);
 468
 469	return index;
 470
 471      clean_up:
 472
 473	jfs_ip->next_index--;
 474
 475	return 0;
 476}
 477
 478/*
 479 *	free_index()
 480 *
 481 *	Marks an entry to the directory index table as free.
 482 */
 483static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
 484{
 485	struct dir_table_slot *dirtab_slot;
 486	s64 lblock;
 487	struct metapage *mp = NULL;
 488
 489	dirtab_slot = find_index(ip, index, &mp, &lblock);
 490
 491	if (!dirtab_slot)
 492		return;
 493
 494	dirtab_slot->flag = DIR_INDEX_FREE;
 495	dirtab_slot->slot = dirtab_slot->addr1 = 0;
 496	dirtab_slot->addr2 = cpu_to_le32(next);
 497
 498	if (mp) {
 499		lock_index(tid, ip, mp, index);
 500		mark_metapage_dirty(mp);
 501		release_metapage(mp);
 502	} else
 503		set_cflag(COMMIT_Dirtable, ip);
 504}
 505
 506/*
 507 *	modify_index()
 508 *
 509 *	Changes an entry in the directory index table
 510 */
 511static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
 512			 int slot, struct metapage ** mp, s64 *lblock)
 513{
 514	struct dir_table_slot *dirtab_slot;
 515
 516	dirtab_slot = find_index(ip, index, mp, lblock);
 517
 518	if (!dirtab_slot)
 519		return;
 520
 521	DTSaddress(dirtab_slot, bn);
 522	dirtab_slot->slot = slot;
 523
 524	if (*mp) {
 525		lock_index(tid, ip, *mp, index);
 526		mark_metapage_dirty(*mp);
 527	} else
 528		set_cflag(COMMIT_Dirtable, ip);
 529}
 530
 531/*
 532 *	read_index()
 533 *
 534 *	reads a directory table slot
 535 */
 536static int read_index(struct inode *ip, u32 index,
 537		     struct dir_table_slot * dirtab_slot)
 538{
 539	s64 lblock;
 540	struct metapage *mp = NULL;
 541	struct dir_table_slot *slot;
 542
 543	slot = find_index(ip, index, &mp, &lblock);
 544	if (!slot) {
 545		return -EIO;
 546	}
 547
 548	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
 549
 550	if (mp)
 551		release_metapage(mp);
 552
 553	return 0;
 554}
 555
 556/*
 557 *	dtSearch()
 558 *
 559 * function:
 560 *	Search for the entry with specified key
 561 *
 562 * parameter:
 563 *
 564 * return: 0 - search result on stack, leaf page pinned;
 565 *	   errno - I/O error
 566 */
 567int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 568	     struct btstack * btstack, int flag)
 569{
 570	int rc = 0;
 571	int cmp = 1;		/* init for empty page */
 572	s64 bn;
 573	struct metapage *mp;
 574	dtpage_t *p;
 575	s8 *stbl;
 576	int base, index, lim;
 577	struct btframe *btsp;
 578	pxd_t *pxd;
 579	int psize = 288;	/* initial in-line directory */
 580	ino_t inumber;
 581	struct component_name ciKey;
 582	struct super_block *sb = ip->i_sb;
 583
 584	ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
 585				   GFP_NOFS);
 586	if (!ciKey.name) {
 587		rc = -ENOMEM;
 588		goto dtSearch_Exit2;
 589	}
 590
 591
 592	/* uppercase search key for c-i directory */
 593	UniStrcpy(ciKey.name, key->name);
 594	ciKey.namlen = key->namlen;
 595
 596	/* only uppercase if case-insensitive support is on */
 597	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
 598		ciToUpper(&ciKey);
 599	}
 600	BT_CLR(btstack);	/* reset stack */
 601
 602	/* init level count for max pages to split */
 603	btstack->nsplit = 1;
 604
 605	/*
 606	 *	search down tree from root:
 607	 *
 608	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
 609	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
 610	 *
 611	 * if entry with search key K is not found
 612	 * internal page search find the entry with largest key Ki
 613	 * less than K which point to the child page to search;
 614	 * leaf page search find the entry with smallest key Kj
 615	 * greater than K so that the returned index is the position of
 616	 * the entry to be shifted right for insertion of new entry.
 617	 * for empty tree, search key is greater than any key of the tree.
 618	 *
 619	 * by convention, root bn = 0.
 620	 */
 621	for (bn = 0;;) {
 622		/* get/pin the page to search */
 623		DT_GETPAGE(ip, bn, mp, psize, p, rc);
 624		if (rc)
 625			goto dtSearch_Exit1;
 626
 627		/* get sorted entry table of the page */
 628		stbl = DT_GETSTBL(p);
 629
 630		/*
 631		 * binary search with search key K on the current page.
 632		 */
 633		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
 634			index = base + (lim >> 1);
 635
 636			if (p->header.flag & BT_LEAF) {
 637				/* uppercase leaf name to compare */
 638				cmp =
 639				    ciCompare(&ciKey, p, stbl[index],
 640					      JFS_SBI(sb)->mntflag);
 641			} else {
 642				/* router key is in uppercase */
 643
 644				cmp = dtCompare(&ciKey, p, stbl[index]);
 645
 646
 647			}
 648			if (cmp == 0) {
 649				/*
 650				 *	search hit
 651				 */
 652				/* search hit - leaf page:
 653				 * return the entry found
 654				 */
 655				if (p->header.flag & BT_LEAF) {
 656					inumber = le32_to_cpu(
 657			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
 658
 659					/*
 660					 * search for JFS_LOOKUP
 661					 */
 662					if (flag == JFS_LOOKUP) {
 663						*data = inumber;
 664						rc = 0;
 665						goto out;
 666					}
 667
 668					/*
 669					 * search for JFS_CREATE
 670					 */
 671					if (flag == JFS_CREATE) {
 672						*data = inumber;
 673						rc = -EEXIST;
 674						goto out;
 675					}
 676
 677					/*
 678					 * search for JFS_REMOVE or JFS_RENAME
 679					 */
 680					if ((flag == JFS_REMOVE ||
 681					     flag == JFS_RENAME) &&
 682					    *data != inumber) {
 683						rc = -ESTALE;
 684						goto out;
 685					}
 686
 687					/*
 688					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
 689					 */
 690					/* save search result */
 691					*data = inumber;
 692					btsp = btstack->top;
 693					btsp->bn = bn;
 694					btsp->index = index;
 695					btsp->mp = mp;
 696
 697					rc = 0;
 698					goto dtSearch_Exit1;
 699				}
 700
 701				/* search hit - internal page:
 702				 * descend/search its child page
 703				 */
 704				goto getChild;
 705			}
 706
 707			if (cmp > 0) {
 708				base = index + 1;
 709				--lim;
 710			}
 711		}
 712
 713		/*
 714		 *	search miss
 715		 *
 716		 * base is the smallest index with key (Kj) greater than
 717		 * search key (K) and may be zero or (maxindex + 1) index.
 718		 */
 719		/*
 720		 * search miss - leaf page
 721		 *
 722		 * return location of entry (base) where new entry with
 723		 * search key K is to be inserted.
 724		 */
 725		if (p->header.flag & BT_LEAF) {
 726			/*
 727			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
 728			 */
 729			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
 730			    flag == JFS_RENAME) {
 731				rc = -ENOENT;
 732				goto out;
 733			}
 734
 735			/*
 736			 * search for JFS_CREATE|JFS_FINDDIR:
 737			 *
 738			 * save search result
 739			 */
 740			*data = 0;
 741			btsp = btstack->top;
 742			btsp->bn = bn;
 743			btsp->index = base;
 744			btsp->mp = mp;
 745
 746			rc = 0;
 747			goto dtSearch_Exit1;
 748		}
 749
 750		/*
 751		 * search miss - internal page
 752		 *
 753		 * if base is non-zero, decrement base by one to get the parent
 754		 * entry of the child page to search.
 755		 */
 756		index = base ? base - 1 : base;
 757
 758		/*
 759		 * go down to child page
 760		 */
 761	      getChild:
 762		/* update max. number of pages to split */
 763		if (BT_STACK_FULL(btstack)) {
 764			/* Something's corrupted, mark filesystem dirty so
 765			 * chkdsk will fix it.
 766			 */
 767			jfs_error(sb, "stack overrun!\n");
 768			BT_STACK_DUMP(btstack);
 769			rc = -EIO;
 770			goto out;
 771		}
 772		btstack->nsplit++;
 773
 774		/* push (bn, index) of the parent page/entry */
 775		BT_PUSH(btstack, bn, index);
 776
 777		/* get the child page block number */
 778		pxd = (pxd_t *) & p->slot[stbl[index]];
 779		bn = addressPXD(pxd);
 780		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
 781
 782		/* unpin the parent page */
 783		DT_PUTPAGE(mp);
 784	}
 785
 786      out:
 787	DT_PUTPAGE(mp);
 788
 789      dtSearch_Exit1:
 790
 791	kfree(ciKey.name);
 792
 793      dtSearch_Exit2:
 794
 795	return rc;
 796}
 797
 798
 799/*
 800 *	dtInsert()
 801 *
 802 * function: insert an entry to directory tree
 803 *
 804 * parameter:
 805 *
 806 * return: 0 - success;
 807 *	   errno - failure;
 808 */
 809int dtInsert(tid_t tid, struct inode *ip,
 810	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
 811{
 812	int rc = 0;
 813	struct metapage *mp;	/* meta-page buffer */
 814	dtpage_t *p;		/* base B+-tree index page */
 815	s64 bn;
 816	int index;
 817	struct dtsplit split;	/* split information */
 818	ddata_t data;
 819	struct dt_lock *dtlck;
 820	int n;
 821	struct tlock *tlck;
 822	struct lv *lv;
 823
 824	/*
 825	 *	retrieve search result
 826	 *
 827	 * dtSearch() returns (leaf page pinned, index at which to insert).
 828	 * n.b. dtSearch() may return index of (maxindex + 1) of
 829	 * the full page.
 830	 */
 831	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
 832
 833	/*
 834	 *	insert entry for new key
 835	 */
 836	if (DO_INDEX(ip)) {
 837		if (JFS_IP(ip)->next_index == DIREND) {
 838			DT_PUTPAGE(mp);
 839			return -EMLINK;
 840		}
 841		n = NDTLEAF(name->namlen);
 842		data.leaf.tid = tid;
 843		data.leaf.ip = ip;
 844	} else {
 845		n = NDTLEAF_LEGACY(name->namlen);
 846		data.leaf.ip = NULL;	/* signifies legacy directory format */
 847	}
 848	data.leaf.ino = *fsn;
 849
 850	/*
 851	 *	leaf page does not have enough room for new entry:
 852	 *
 853	 *	extend/split the leaf page;
 854	 *
 855	 * dtSplitUp() will insert the entry and unpin the leaf page.
 856	 */
 857	if (n > p->header.freecnt) {
 858		split.mp = mp;
 859		split.index = index;
 860		split.nslot = n;
 861		split.key = name;
 862		split.data = &data;
 863		rc = dtSplitUp(tid, ip, &split, btstack);
 864		return rc;
 865	}
 866
 867	/*
 868	 *	leaf page does have enough room for new entry:
 869	 *
 870	 *	insert the new data entry into the leaf page;
 871	 */
 872	BT_MARK_DIRTY(mp, ip);
 873	/*
 874	 * acquire a transaction lock on the leaf page
 875	 */
 876	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
 877	dtlck = (struct dt_lock *) & tlck->lock;
 878	ASSERT(dtlck->index == 0);
 879	lv = & dtlck->lv[0];
 880
 881	/* linelock header */
 882	lv->offset = 0;
 883	lv->length = 1;
 884	dtlck->index++;
 885
 886	dtInsertEntry(p, index, name, &data, &dtlck);
 887
 888	/* linelock stbl of non-root leaf page */
 889	if (!(p->header.flag & BT_ROOT)) {
 890		if (dtlck->index >= dtlck->maxcnt)
 891			dtlck = (struct dt_lock *) txLinelock(dtlck);
 892		lv = & dtlck->lv[dtlck->index];
 893		n = index >> L2DTSLOTSIZE;
 894		lv->offset = p->header.stblindex + n;
 895		lv->length =
 896		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
 897		dtlck->index++;
 898	}
 899
 900	/* unpin the leaf page */
 901	DT_PUTPAGE(mp);
 902
 903	return 0;
 904}
 905
 906
 907/*
 908 *	dtSplitUp()
 909 *
 910 * function: propagate insertion bottom up;
 911 *
 912 * parameter:
 913 *
 914 * return: 0 - success;
 915 *	   errno - failure;
 916 *	leaf page unpinned;
 917 */
 918static int dtSplitUp(tid_t tid,
 919	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
 920{
 921	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 922	int rc = 0;
 923	struct metapage *smp;
 924	dtpage_t *sp;		/* split page */
 925	struct metapage *rmp;
 926	dtpage_t *rp;		/* new right page split from sp */
 927	pxd_t rpxd;		/* new right page extent descriptor */
 928	struct metapage *lmp;
 929	dtpage_t *lp;		/* left child page */
 930	int skip;		/* index of entry of insertion */
 931	struct btframe *parent;	/* parent page entry on traverse stack */
 932	s64 xaddr, nxaddr;
 933	int xlen, xsize;
 934	struct pxdlist pxdlist;
 935	pxd_t *pxd;
 936	struct component_name key = { 0, NULL };
 937	ddata_t *data = split->data;
 938	int n;
 939	struct dt_lock *dtlck;
 940	struct tlock *tlck;
 941	struct lv *lv;
 942	int quota_allocation = 0;
 943
 944	/* get split page */
 945	smp = split->mp;
 946	sp = DT_PAGE(ip, smp);
 947
 948	key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS);
 949	if (!key.name) {
 950		DT_PUTPAGE(smp);
 951		rc = -ENOMEM;
 952		goto dtSplitUp_Exit;
 953	}
 954
 955	/*
 956	 *	split leaf page
 957	 *
 958	 * The split routines insert the new entry, and
 959	 * acquire txLock as appropriate.
 960	 */
 961	/*
 962	 *	split root leaf page:
 963	 */
 964	if (sp->header.flag & BT_ROOT) {
 965		/*
 966		 * allocate a single extent child page
 967		 */
 968		xlen = 1;
 969		n = sbi->bsize >> L2DTSLOTSIZE;
 970		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
 971		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
 972		if (n <= split->nslot)
 973			xlen++;
 974		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
 975			DT_PUTPAGE(smp);
 976			goto freeKeyName;
 977		}
 978
 979		pxdlist.maxnpxd = 1;
 980		pxdlist.npxd = 0;
 981		pxd = &pxdlist.pxd[0];
 982		PXDaddress(pxd, xaddr);
 983		PXDlength(pxd, xlen);
 984		split->pxdlist = &pxdlist;
 985		rc = dtSplitRoot(tid, ip, split, &rmp);
 986
 987		if (rc)
 988			dbFree(ip, xaddr, xlen);
 989		else
 990			DT_PUTPAGE(rmp);
 991
 992		DT_PUTPAGE(smp);
 993
 994		if (!DO_INDEX(ip))
 995			ip->i_size = xlen << sbi->l2bsize;
 996
 997		goto freeKeyName;
 998	}
 999
1000	/*
1001	 *	extend first leaf page
1002	 *
1003	 * extend the 1st extent if less than buffer page size
1004	 * (dtExtendPage() reurns leaf page unpinned)
1005	 */
1006	pxd = &sp->header.self;
1007	xlen = lengthPXD(pxd);
1008	xsize = xlen << sbi->l2bsize;
1009	if (xsize < PSIZE) {
1010		xaddr = addressPXD(pxd);
1011		n = xsize >> L2DTSLOTSIZE;
1012		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
1013		if ((n + sp->header.freecnt) <= split->nslot)
1014			n = xlen + (xlen << 1);
1015		else
1016			n = xlen;
1017
1018		/* Allocate blocks to quota. */
1019		rc = dquot_alloc_block(ip, n);
1020		if (rc)
1021			goto extendOut;
1022		quota_allocation += n;
1023
1024		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1025				    (s64) n, &nxaddr)))
1026			goto extendOut;
1027
1028		pxdlist.maxnpxd = 1;
1029		pxdlist.npxd = 0;
1030		pxd = &pxdlist.pxd[0];
1031		PXDaddress(pxd, nxaddr);
1032		PXDlength(pxd, xlen + n);
1033		split->pxdlist = &pxdlist;
1034		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1035			nxaddr = addressPXD(pxd);
1036			if (xaddr != nxaddr) {
1037				/* free relocated extent */
1038				xlen = lengthPXD(pxd);
1039				dbFree(ip, nxaddr, (s64) xlen);
1040			} else {
1041				/* free extended delta */
1042				xlen = lengthPXD(pxd) - n;
1043				xaddr = addressPXD(pxd) + xlen;
1044				dbFree(ip, xaddr, (s64) n);
1045			}
1046		} else if (!DO_INDEX(ip))
1047			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
1048
1049
1050	      extendOut:
1051		DT_PUTPAGE(smp);
1052		goto freeKeyName;
1053	}
1054
1055	/*
1056	 *	split leaf page <sp> into <sp> and a new right page <rp>.
1057	 *
1058	 * return <rp> pinned and its extent descriptor <rpxd>
1059	 */
1060	/*
1061	 * allocate new directory page extent and
1062	 * new index page(s) to cover page split(s)
1063	 *
1064	 * allocation hint: ?
1065	 */
1066	n = btstack->nsplit;
1067	pxdlist.maxnpxd = pxdlist.npxd = 0;
1068	xlen = sbi->nbperpage;
1069	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1070		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1071			PXDaddress(pxd, xaddr);
1072			PXDlength(pxd, xlen);
1073			pxdlist.maxnpxd++;
1074			continue;
1075		}
1076
1077		DT_PUTPAGE(smp);
1078
1079		/* undo allocation */
1080		goto splitOut;
1081	}
1082
1083	split->pxdlist = &pxdlist;
1084	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1085		DT_PUTPAGE(smp);
1086
1087		/* undo allocation */
1088		goto splitOut;
1089	}
1090
1091	if (!DO_INDEX(ip))
1092		ip->i_size += PSIZE;
1093
1094	/*
1095	 * propagate up the router entry for the leaf page just split
1096	 *
1097	 * insert a router entry for the new page into the parent page,
1098	 * propagate the insert/split up the tree by walking back the stack
1099	 * of (bn of parent page, index of child page entry in parent page)
1100	 * that were traversed during the search for the page that split.
1101	 *
1102	 * the propagation of insert/split up the tree stops if the root
1103	 * splits or the page inserted into doesn't have to split to hold
1104	 * the new entry.
1105	 *
1106	 * the parent entry for the split page remains the same, and
1107	 * a new entry is inserted at its right with the first key and
1108	 * block number of the new right page.
1109	 *
1110	 * There are a maximum of 4 pages pinned at any time:
1111	 * two children, left parent and right parent (when the parent splits).
1112	 * keep the child pages pinned while working on the parent.
1113	 * make sure that all pins are released at exit.
1114	 */
1115	while ((parent = BT_POP(btstack)) != NULL) {
1116		/* parent page specified by stack frame <parent> */
1117
1118		/* keep current child pages (<lp>, <rp>) pinned */
1119		lmp = smp;
1120		lp = sp;
1121
1122		/*
1123		 * insert router entry in parent for new right child page <rp>
1124		 */
1125		/* get the parent page <sp> */
1126		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1127		if (rc) {
1128			DT_PUTPAGE(lmp);
1129			DT_PUTPAGE(rmp);
1130			goto splitOut;
1131		}
1132
1133		/*
1134		 * The new key entry goes ONE AFTER the index of parent entry,
1135		 * because the split was to the right.
1136		 */
1137		skip = parent->index + 1;
1138
1139		/*
1140		 * compute the key for the router entry
1141		 *
1142		 * key suffix compression:
1143		 * for internal pages that have leaf pages as children,
1144		 * retain only what's needed to distinguish between
1145		 * the new entry and the entry on the page to its left.
1146		 * If the keys compare equal, retain the entire key.
1147		 *
1148		 * note that compression is performed only at computing
1149		 * router key at the lowest internal level.
1150		 * further compression of the key between pairs of higher
1151		 * level internal pages loses too much information and
1152		 * the search may fail.
1153		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1154		 * results in two adjacent parent entries (a)(xx).
1155		 * if split occurs between these two entries, and
1156		 * if compression is applied, the router key of parent entry
1157		 * of right page (x) will divert search for x into right
1158		 * subtree and miss x in the left subtree.)
1159		 *
1160		 * the entire key must be retained for the next-to-leftmost
1161		 * internal key at any level of the tree, or search may fail
1162		 * (e.g., ?)
1163		 */
1164		switch (rp->header.flag & BT_TYPE) {
1165		case BT_LEAF:
1166			/*
1167			 * compute the length of prefix for suffix compression
1168			 * between last entry of left page and first entry
1169			 * of right page
1170			 */
1171			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1172			    sp->header.prev != 0 || skip > 1) {
1173				/* compute uppercase router prefix key */
1174				rc = ciGetLeafPrefixKey(lp,
1175							lp->header.nextindex-1,
1176							rp, 0, &key,
1177							sbi->mntflag);
1178				if (rc) {
1179					DT_PUTPAGE(lmp);
1180					DT_PUTPAGE(rmp);
1181					DT_PUTPAGE(smp);
1182					goto splitOut;
1183				}
1184			} else {
1185				/* next to leftmost entry of
1186				   lowest internal level */
1187
1188				/* compute uppercase router key */
1189				dtGetKey(rp, 0, &key, sbi->mntflag);
1190				key.name[key.namlen] = 0;
1191
1192				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1193					ciToUpper(&key);
1194			}
1195
1196			n = NDTINTERNAL(key.namlen);
1197			break;
1198
1199		case BT_INTERNAL:
1200			dtGetKey(rp, 0, &key, sbi->mntflag);
1201			n = NDTINTERNAL(key.namlen);
1202			break;
1203
1204		default:
1205			jfs_err("dtSplitUp(): UFO!");
1206			break;
1207		}
1208
1209		/* unpin left child page */
1210		DT_PUTPAGE(lmp);
1211
1212		/*
1213		 * compute the data for the router entry
1214		 */
1215		data->xd = rpxd;	/* child page xd */
1216
1217		/*
1218		 * parent page is full - split the parent page
1219		 */
1220		if (n > sp->header.freecnt) {
1221			/* init for parent page split */
1222			split->mp = smp;
1223			split->index = skip;	/* index at insert */
1224			split->nslot = n;
1225			split->key = &key;
1226			/* split->data = data; */
1227
1228			/* unpin right child page */
1229			DT_PUTPAGE(rmp);
1230
1231			/* The split routines insert the new entry,
1232			 * acquire txLock as appropriate.
1233			 * return <rp> pinned and its block number <rbn>.
1234			 */
1235			rc = (sp->header.flag & BT_ROOT) ?
1236			    dtSplitRoot(tid, ip, split, &rmp) :
1237			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1238			if (rc) {
1239				DT_PUTPAGE(smp);
1240				goto splitOut;
1241			}
1242
1243			/* smp and rmp are pinned */
1244		}
1245		/*
1246		 * parent page is not full - insert router entry in parent page
1247		 */
1248		else {
1249			BT_MARK_DIRTY(smp, ip);
1250			/*
1251			 * acquire a transaction lock on the parent page
1252			 */
1253			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1254			dtlck = (struct dt_lock *) & tlck->lock;
1255			ASSERT(dtlck->index == 0);
1256			lv = & dtlck->lv[0];
1257
1258			/* linelock header */
1259			lv->offset = 0;
1260			lv->length = 1;
1261			dtlck->index++;
1262
1263			/* linelock stbl of non-root parent page */
1264			if (!(sp->header.flag & BT_ROOT)) {
1265				lv++;
1266				n = skip >> L2DTSLOTSIZE;
1267				lv->offset = sp->header.stblindex + n;
1268				lv->length =
1269				    ((sp->header.nextindex -
1270				      1) >> L2DTSLOTSIZE) - n + 1;
1271				dtlck->index++;
1272			}
1273
1274			dtInsertEntry(sp, skip, &key, data, &dtlck);
1275
1276			/* exit propagate up */
1277			break;
1278		}
1279	}
1280
1281	/* unpin current split and its right page */
1282	DT_PUTPAGE(smp);
1283	DT_PUTPAGE(rmp);
1284
1285	/*
1286	 * free remaining extents allocated for split
1287	 */
1288      splitOut:
1289	n = pxdlist.npxd;
1290	pxd = &pxdlist.pxd[n];
1291	for (; n < pxdlist.maxnpxd; n++, pxd++)
1292		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1293
1294      freeKeyName:
1295	kfree(key.name);
1296
1297	/* Rollback quota allocation */
1298	if (rc && quota_allocation)
1299		dquot_free_block(ip, quota_allocation);
1300
1301      dtSplitUp_Exit:
1302
1303	return rc;
1304}
1305
1306
1307/*
1308 *	dtSplitPage()
1309 *
1310 * function: Split a non-root page of a btree.
1311 *
1312 * parameter:
1313 *
1314 * return: 0 - success;
1315 *	   errno - failure;
1316 *	return split and new page pinned;
1317 */
1318static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1319	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1320{
1321	int rc = 0;
1322	struct metapage *smp;
1323	dtpage_t *sp;
1324	struct metapage *rmp;
1325	dtpage_t *rp;		/* new right page allocated */
1326	s64 rbn;		/* new right page block number */
1327	struct metapage *mp;
1328	dtpage_t *p;
1329	s64 nextbn;
1330	struct pxdlist *pxdlist;
1331	pxd_t *pxd;
1332	int skip, nextindex, half, left, nxt, off, si;
1333	struct ldtentry *ldtentry;
1334	struct idtentry *idtentry;
1335	u8 *stbl;
1336	struct dtslot *f;
1337	int fsi, stblsize;
1338	int n;
1339	struct dt_lock *sdtlck, *rdtlck;
1340	struct tlock *tlck;
1341	struct dt_lock *dtlck;
1342	struct lv *slv, *rlv, *lv;
1343
1344	/* get split page */
1345	smp = split->mp;
1346	sp = DT_PAGE(ip, smp);
1347
1348	/*
1349	 * allocate the new right page for the split
1350	 */
1351	pxdlist = split->pxdlist;
1352	pxd = &pxdlist->pxd[pxdlist->npxd];
1353	pxdlist->npxd++;
1354	rbn = addressPXD(pxd);
1355	rmp = get_metapage(ip, rbn, PSIZE, 1);
1356	if (rmp == NULL)
1357		return -EIO;
1358
1359	/* Allocate blocks to quota. */
1360	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1361	if (rc) {
1362		release_metapage(rmp);
1363		return rc;
1364	}
1365
1366	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1367
1368	BT_MARK_DIRTY(rmp, ip);
1369	/*
1370	 * acquire a transaction lock on the new right page
1371	 */
1372	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1373	rdtlck = (struct dt_lock *) & tlck->lock;
1374
1375	rp = (dtpage_t *) rmp->data;
1376	*rpp = rp;
1377	rp->header.self = *pxd;
1378
1379	BT_MARK_DIRTY(smp, ip);
1380	/*
1381	 * acquire a transaction lock on the split page
1382	 *
1383	 * action:
1384	 */
1385	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1386	sdtlck = (struct dt_lock *) & tlck->lock;
1387
1388	/* linelock header of split page */
1389	ASSERT(sdtlck->index == 0);
1390	slv = & sdtlck->lv[0];
1391	slv->offset = 0;
1392	slv->length = 1;
1393	sdtlck->index++;
1394
1395	/*
1396	 * initialize/update sibling pointers between sp and rp
1397	 */
1398	nextbn = le64_to_cpu(sp->header.next);
1399	rp->header.next = cpu_to_le64(nextbn);
1400	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1401	sp->header.next = cpu_to_le64(rbn);
1402
1403	/*
1404	 * initialize new right page
1405	 */
1406	rp->header.flag = sp->header.flag;
1407
1408	/* compute sorted entry table at start of extent data area */
1409	rp->header.nextindex = 0;
1410	rp->header.stblindex = 1;
1411
1412	n = PSIZE >> L2DTSLOTSIZE;
1413	rp->header.maxslot = n;
1414	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1415
1416	/* init freelist */
1417	fsi = rp->header.stblindex + stblsize;
1418	rp->header.freelist = fsi;
1419	rp->header.freecnt = rp->header.maxslot - fsi;
1420
1421	/*
1422	 *	sequential append at tail: append without split
1423	 *
1424	 * If splitting the last page on a level because of appending
1425	 * a entry to it (skip is maxentry), it's likely that the access is
1426	 * sequential. Adding an empty page on the side of the level is less
1427	 * work and can push the fill factor much higher than normal.
1428	 * If we're wrong it's no big deal, we'll just do the split the right
1429	 * way next time.
1430	 * (It may look like it's equally easy to do a similar hack for
1431	 * reverse sorted data, that is, split the tree left,
1432	 * but it's not. Be my guest.)
1433	 */
1434	if (nextbn == 0 && split->index == sp->header.nextindex) {
1435		/* linelock header + stbl (first slot) of new page */
1436		rlv = & rdtlck->lv[rdtlck->index];
1437		rlv->offset = 0;
1438		rlv->length = 2;
1439		rdtlck->index++;
1440
1441		/*
1442		 * initialize freelist of new right page
1443		 */
1444		f = &rp->slot[fsi];
1445		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1446			f->next = fsi;
1447		f->next = -1;
1448
1449		/* insert entry at the first entry of the new right page */
1450		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1451
1452		goto out;
1453	}
1454
1455	/*
1456	 *	non-sequential insert (at possibly middle page)
1457	 */
1458
1459	/*
1460	 * update prev pointer of previous right sibling page;
1461	 */
1462	if (nextbn != 0) {
1463		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1464		if (rc) {
1465			discard_metapage(rmp);
1466			return rc;
1467		}
1468
1469		BT_MARK_DIRTY(mp, ip);
1470		/*
1471		 * acquire a transaction lock on the next page
1472		 */
1473		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1474		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1475			tlck, ip, mp);
1476		dtlck = (struct dt_lock *) & tlck->lock;
1477
1478		/* linelock header of previous right sibling page */
1479		lv = & dtlck->lv[dtlck->index];
1480		lv->offset = 0;
1481		lv->length = 1;
1482		dtlck->index++;
1483
1484		p->header.prev = cpu_to_le64(rbn);
1485
1486		DT_PUTPAGE(mp);
1487	}
1488
1489	/*
1490	 * split the data between the split and right pages.
1491	 */
1492	skip = split->index;
1493	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1494	left = 0;
1495
1496	/*
1497	 *	compute fill factor for split pages
1498	 *
1499	 * <nxt> traces the next entry to move to rp
1500	 * <off> traces the next entry to stay in sp
1501	 */
1502	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1503	nextindex = sp->header.nextindex;
1504	for (nxt = off = 0; nxt < nextindex; ++off) {
1505		if (off == skip)
1506			/* check for fill factor with new entry size */
1507			n = split->nslot;
1508		else {
1509			si = stbl[nxt];
1510			switch (sp->header.flag & BT_TYPE) {
1511			case BT_LEAF:
1512				ldtentry = (struct ldtentry *) & sp->slot[si];
1513				if (DO_INDEX(ip))
1514					n = NDTLEAF(ldtentry->namlen);
1515				else
1516					n = NDTLEAF_LEGACY(ldtentry->
1517							   namlen);
1518				break;
1519
1520			case BT_INTERNAL:
1521				idtentry = (struct idtentry *) & sp->slot[si];
1522				n = NDTINTERNAL(idtentry->namlen);
1523				break;
1524
1525			default:
1526				break;
1527			}
1528
1529			++nxt;	/* advance to next entry to move in sp */
1530		}
1531
1532		left += n;
1533		if (left >= half)
1534			break;
1535	}
1536
1537	/* <nxt> poins to the 1st entry to move */
1538
1539	/*
1540	 *	move entries to right page
1541	 *
1542	 * dtMoveEntry() initializes rp and reserves entry for insertion
1543	 *
1544	 * split page moved out entries are linelocked;
1545	 * new/right page moved in entries are linelocked;
1546	 */
1547	/* linelock header + stbl of new right page */
1548	rlv = & rdtlck->lv[rdtlck->index];
1549	rlv->offset = 0;
1550	rlv->length = 5;
1551	rdtlck->index++;
1552
1553	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1554
1555	sp->header.nextindex = nxt;
1556
1557	/*
1558	 * finalize freelist of new right page
1559	 */
1560	fsi = rp->header.freelist;
1561	f = &rp->slot[fsi];
1562	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1563		f->next = fsi;
1564	f->next = -1;
1565
1566	/*
1567	 * Update directory index table for entries now in right page
1568	 */
1569	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1570		s64 lblock;
1571
1572		mp = NULL;
1573		stbl = DT_GETSTBL(rp);
1574		for (n = 0; n < rp->header.nextindex; n++) {
1575			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1576			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1577				     rbn, n, &mp, &lblock);
1578		}
1579		if (mp)
1580			release_metapage(mp);
1581	}
1582
1583	/*
1584	 * the skipped index was on the left page,
1585	 */
1586	if (skip <= off) {
1587		/* insert the new entry in the split page */
1588		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1589
1590		/* linelock stbl of split page */
1591		if (sdtlck->index >= sdtlck->maxcnt)
1592			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1593		slv = & sdtlck->lv[sdtlck->index];
1594		n = skip >> L2DTSLOTSIZE;
1595		slv->offset = sp->header.stblindex + n;
1596		slv->length =
1597		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1598		sdtlck->index++;
1599	}
1600	/*
1601	 * the skipped index was on the right page,
1602	 */
1603	else {
1604		/* adjust the skip index to reflect the new position */
1605		skip -= nxt;
1606
1607		/* insert the new entry in the right page */
1608		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1609	}
1610
1611      out:
1612	*rmpp = rmp;
1613	*rpxdp = *pxd;
1614
1615	return rc;
1616}
1617
1618
1619/*
1620 *	dtExtendPage()
1621 *
1622 * function: extend 1st/only directory leaf page
1623 *
1624 * parameter:
1625 *
1626 * return: 0 - success;
1627 *	   errno - failure;
1628 *	return extended page pinned;
1629 */
1630static int dtExtendPage(tid_t tid,
1631	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1632{
1633	struct super_block *sb = ip->i_sb;
1634	int rc;
1635	struct metapage *smp, *pmp, *mp;
1636	dtpage_t *sp, *pp;
1637	struct pxdlist *pxdlist;
1638	pxd_t *pxd, *tpxd;
1639	int xlen, xsize;
1640	int newstblindex, newstblsize;
1641	int oldstblindex, oldstblsize;
1642	int fsi, last;
1643	struct dtslot *f;
1644	struct btframe *parent;
1645	int n;
1646	struct dt_lock *dtlck;
1647	s64 xaddr, txaddr;
1648	struct tlock *tlck;
1649	struct pxd_lock *pxdlock;
1650	struct lv *lv;
1651	uint type;
1652	struct ldtentry *ldtentry;
1653	u8 *stbl;
1654
1655	/* get page to extend */
1656	smp = split->mp;
1657	sp = DT_PAGE(ip, smp);
1658
1659	/* get parent/root page */
1660	parent = BT_POP(btstack);
1661	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1662	if (rc)
1663		return (rc);
1664
1665	/*
1666	 *	extend the extent
1667	 */
1668	pxdlist = split->pxdlist;
1669	pxd = &pxdlist->pxd[pxdlist->npxd];
1670	pxdlist->npxd++;
1671
1672	xaddr = addressPXD(pxd);
1673	tpxd = &sp->header.self;
1674	txaddr = addressPXD(tpxd);
1675	/* in-place extension */
1676	if (xaddr == txaddr) {
1677		type = tlckEXTEND;
1678	}
1679	/* relocation */
1680	else {
1681		type = tlckNEW;
1682
1683		/* save moved extent descriptor for later free */
1684		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1685		pxdlock = (struct pxd_lock *) & tlck->lock;
1686		pxdlock->flag = mlckFREEPXD;
1687		pxdlock->pxd = sp->header.self;
1688		pxdlock->index = 1;
1689
1690		/*
1691		 * Update directory index table to reflect new page address
1692		 */
1693		if (DO_INDEX(ip)) {
1694			s64 lblock;
1695
1696			mp = NULL;
1697			stbl = DT_GETSTBL(sp);
1698			for (n = 0; n < sp->header.nextindex; n++) {
1699				ldtentry =
1700				    (struct ldtentry *) & sp->slot[stbl[n]];
1701				modify_index(tid, ip,
1702					     le32_to_cpu(ldtentry->index),
1703					     xaddr, n, &mp, &lblock);
1704			}
1705			if (mp)
1706				release_metapage(mp);
1707		}
1708	}
1709
1710	/*
1711	 *	extend the page
1712	 */
1713	sp->header.self = *pxd;
1714
1715	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1716
1717	BT_MARK_DIRTY(smp, ip);
1718	/*
1719	 * acquire a transaction lock on the extended/leaf page
1720	 */
1721	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1722	dtlck = (struct dt_lock *) & tlck->lock;
1723	lv = & dtlck->lv[0];
1724
1725	/* update buffer extent descriptor of extended page */
1726	xlen = lengthPXD(pxd);
1727	xsize = xlen << JFS_SBI(sb)->l2bsize;
1728
1729	/*
1730	 * copy old stbl to new stbl at start of extended area
1731	 */
1732	oldstblindex = sp->header.stblindex;
1733	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1734	newstblindex = sp->header.maxslot;
1735	n = xsize >> L2DTSLOTSIZE;
1736	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1737	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1738	       sp->header.nextindex);
1739
1740	/*
1741	 * in-line extension: linelock old area of extended page
1742	 */
1743	if (type == tlckEXTEND) {
1744		/* linelock header */
1745		lv->offset = 0;
1746		lv->length = 1;
1747		dtlck->index++;
1748		lv++;
1749
1750		/* linelock new stbl of extended page */
1751		lv->offset = newstblindex;
1752		lv->length = newstblsize;
1753	}
1754	/*
1755	 * relocation: linelock whole relocated area
1756	 */
1757	else {
1758		lv->offset = 0;
1759		lv->length = sp->header.maxslot + newstblsize;
1760	}
1761
1762	dtlck->index++;
1763
1764	sp->header.maxslot = n;
1765	sp->header.stblindex = newstblindex;
1766	/* sp->header.nextindex remains the same */
1767
1768	/*
1769	 * add old stbl region at head of freelist
1770	 */
1771	fsi = oldstblindex;
1772	f = &sp->slot[fsi];
1773	last = sp->header.freelist;
1774	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1775		f->next = last;
1776		last = fsi;
1777	}
1778	sp->header.freelist = last;
1779	sp->header.freecnt += oldstblsize;
1780
1781	/*
1782	 * append free region of newly extended area at tail of freelist
1783	 */
1784	/* init free region of newly extended area */
1785	fsi = n = newstblindex + newstblsize;
1786	f = &sp->slot[fsi];
1787	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1788		f->next = fsi;
1789	f->next = -1;
1790
1791	/* append new free region at tail of old freelist */
1792	fsi = sp->header.freelist;
1793	if (fsi == -1)
1794		sp->header.freelist = n;
1795	else {
1796		do {
1797			f = &sp->slot[fsi];
1798			fsi = f->next;
1799		} while (fsi != -1);
1800
1801		f->next = n;
1802	}
1803
1804	sp->header.freecnt += sp->header.maxslot - n;
1805
1806	/*
1807	 * insert the new entry
1808	 */
1809	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1810
1811	BT_MARK_DIRTY(pmp, ip);
1812	/*
1813	 * linelock any freeslots residing in old extent
1814	 */
1815	if (type == tlckEXTEND) {
1816		n = sp->header.maxslot >> 2;
1817		if (sp->header.freelist < n)
1818			dtLinelockFreelist(sp, n, &dtlck);
1819	}
1820
1821	/*
1822	 *	update parent entry on the parent/root page
1823	 */
1824	/*
1825	 * acquire a transaction lock on the parent/root page
1826	 */
1827	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1828	dtlck = (struct dt_lock *) & tlck->lock;
1829	lv = & dtlck->lv[dtlck->index];
1830
1831	/* linelock parent entry - 1st slot */
1832	lv->offset = 1;
1833	lv->length = 1;
1834	dtlck->index++;
1835
1836	/* update the parent pxd for page extension */
1837	tpxd = (pxd_t *) & pp->slot[1];
1838	*tpxd = *pxd;
1839
1840	DT_PUTPAGE(pmp);
1841	return 0;
1842}
1843
1844
1845/*
1846 *	dtSplitRoot()
1847 *
1848 * function:
1849 *	split the full root page into
1850 *	original/root/split page and new right page
1851 *	i.e., root remains fixed in tree anchor (inode) and
1852 *	the root is copied to a single new right child page
1853 *	since root page << non-root page, and
1854 *	the split root page contains a single entry for the
1855 *	new right child page.
1856 *
1857 * parameter:
1858 *
1859 * return: 0 - success;
1860 *	   errno - failure;
1861 *	return new page pinned;
1862 */
1863static int dtSplitRoot(tid_t tid,
1864	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1865{
1866	struct super_block *sb = ip->i_sb;
1867	struct metapage *smp;
1868	dtroot_t *sp;
1869	struct metapage *rmp;
1870	dtpage_t *rp;
1871	s64 rbn;
1872	int xlen;
1873	int xsize;
1874	struct dtslot *f;
1875	s8 *stbl;
1876	int fsi, stblsize, n;
1877	struct idtentry *s;
1878	pxd_t *ppxd;
1879	struct pxdlist *pxdlist;
1880	pxd_t *pxd;
1881	struct dt_lock *dtlck;
1882	struct tlock *tlck;
1883	struct lv *lv;
1884	int rc;
1885
1886	/* get split root page */
1887	smp = split->mp;
1888	sp = &JFS_IP(ip)->i_dtroot;
1889
1890	/*
1891	 *	allocate/initialize a single (right) child page
1892	 *
1893	 * N.B. at first split, a one (or two) block to fit new entry
1894	 * is allocated; at subsequent split, a full page is allocated;
1895	 */
1896	pxdlist = split->pxdlist;
1897	pxd = &pxdlist->pxd[pxdlist->npxd];
1898	pxdlist->npxd++;
1899	rbn = addressPXD(pxd);
1900	xlen = lengthPXD(pxd);
1901	xsize = xlen << JFS_SBI(sb)->l2bsize;
1902	rmp = get_metapage(ip, rbn, xsize, 1);
1903	if (!rmp)
1904		return -EIO;
1905
1906	rp = rmp->data;
1907
1908	/* Allocate blocks to quota. */
1909	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1910	if (rc) {
1911		release_metapage(rmp);
1912		return rc;
1913	}
1914
1915	BT_MARK_DIRTY(rmp, ip);
1916	/*
1917	 * acquire a transaction lock on the new right page
1918	 */
1919	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1920	dtlck = (struct dt_lock *) & tlck->lock;
1921
1922	rp->header.flag =
1923	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1924	rp->header.self = *pxd;
1925
1926	/* initialize sibling pointers */
1927	rp->header.next = 0;
1928	rp->header.prev = 0;
1929
1930	/*
1931	 *	move in-line root page into new right page extent
1932	 */
1933	/* linelock header + copied entries + new stbl (1st slot) in new page */
1934	ASSERT(dtlck->index == 0);
1935	lv = & dtlck->lv[0];
1936	lv->offset = 0;
1937	lv->length = 10;	/* 1 + 8 + 1 */
1938	dtlck->index++;
1939
1940	n = xsize >> L2DTSLOTSIZE;
1941	rp->header.maxslot = n;
1942	stblsize = (n + 31) >> L2DTSLOTSIZE;
1943
1944	/* copy old stbl to new stbl at start of extended area */
1945	rp->header.stblindex = DTROOTMAXSLOT;
1946	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1947	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1948	rp->header.nextindex = sp->header.nextindex;
1949
1950	/* copy old data area to start of new data area */
1951	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1952
1953	/*
1954	 * append free region of newly extended area at tail of freelist
1955	 */
1956	/* init free region of newly extended area */
1957	fsi = n = DTROOTMAXSLOT + stblsize;
1958	f = &rp->slot[fsi];
1959	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1960		f->next = fsi;
1961	f->next = -1;
1962
1963	/* append new free region at tail of old freelist */
1964	fsi = sp->header.freelist;
1965	if (fsi == -1)
1966		rp->header.freelist = n;
1967	else {
1968		rp->header.freelist = fsi;
1969
1970		do {
1971			f = &rp->slot[fsi];
1972			fsi = f->next;
1973		} while (fsi != -1);
1974
1975		f->next = n;
1976	}
1977
1978	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1979
1980	/*
1981	 * Update directory index table for entries now in right page
1982	 */
1983	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1984		s64 lblock;
1985		struct metapage *mp = NULL;
1986		struct ldtentry *ldtentry;
1987
1988		stbl = DT_GETSTBL(rp);
1989		for (n = 0; n < rp->header.nextindex; n++) {
1990			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1991			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1992				     rbn, n, &mp, &lblock);
1993		}
1994		if (mp)
1995			release_metapage(mp);
1996	}
1997	/*
1998	 * insert the new entry into the new right/child page
1999	 * (skip index in the new right page will not change)
2000	 */
2001	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2002
2003	/*
2004	 *	reset parent/root page
2005	 *
2006	 * set the 1st entry offset to 0, which force the left-most key
2007	 * at any level of the tree to be less than any search key.
2008	 *
2009	 * The btree comparison code guarantees that the left-most key on any
2010	 * level of the tree is never used, so it doesn't need to be filled in.
2011	 */
2012	BT_MARK_DIRTY(smp, ip);
2013	/*
2014	 * acquire a transaction lock on the root page (in-memory inode)
2015	 */
2016	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2017	dtlck = (struct dt_lock *) & tlck->lock;
2018
2019	/* linelock root */
2020	ASSERT(dtlck->index == 0);
2021	lv = & dtlck->lv[0];
2022	lv->offset = 0;
2023	lv->length = DTROOTMAXSLOT;
2024	dtlck->index++;
2025
2026	/* update page header of root */
2027	if (sp->header.flag & BT_LEAF) {
2028		sp->header.flag &= ~BT_LEAF;
2029		sp->header.flag |= BT_INTERNAL;
2030	}
2031
2032	/* init the first entry */
2033	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2034	ppxd = (pxd_t *) s;
2035	*ppxd = *pxd;
2036	s->next = -1;
2037	s->namlen = 0;
2038
2039	stbl = sp->header.stbl;
2040	stbl[0] = DTENTRYSTART;
2041	sp->header.nextindex = 1;
2042
2043	/* init freelist */
2044	fsi = DTENTRYSTART + 1;
2045	f = &sp->slot[fsi];
2046
2047	/* init free region of remaining area */
2048	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2049		f->next = fsi;
2050	f->next = -1;
2051
2052	sp->header.freelist = DTENTRYSTART + 1;
2053	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2054
2055	*rmpp = rmp;
2056
2057	return 0;
2058}
2059
2060
2061/*
2062 *	dtDelete()
2063 *
2064 * function: delete the entry(s) referenced by a key.
2065 *
2066 * parameter:
2067 *
2068 * return:
2069 */
2070int dtDelete(tid_t tid,
2071	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2072{
2073	int rc = 0;
2074	s64 bn;
2075	struct metapage *mp, *imp;
2076	dtpage_t *p;
2077	int index;
2078	struct btstack btstack;
2079	struct dt_lock *dtlck;
2080	struct tlock *tlck;
2081	struct lv *lv;
2082	int i;
2083	struct ldtentry *ldtentry;
2084	u8 *stbl;
2085	u32 table_index, next_index;
2086	struct metapage *nmp;
2087	dtpage_t *np;
2088
2089	/*
2090	 *	search for the entry to delete:
2091	 *
2092	 * dtSearch() returns (leaf page pinned, index at which to delete).
2093	 */
2094	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2095		return rc;
2096
2097	/* retrieve search result */
2098	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2099
2100	/*
2101	 * We need to find put the index of the next entry into the
2102	 * directory index table in order to resume a readdir from this
2103	 * entry.
2104	 */
2105	if (DO_INDEX(ip)) {
2106		stbl = DT_GETSTBL(p);
2107		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2108		table_index = le32_to_cpu(ldtentry->index);
2109		if (index == (p->header.nextindex - 1)) {
2110			/*
2111			 * Last entry in this leaf page
2112			 */
2113			if ((p->header.flag & BT_ROOT)
2114			    || (p->header.next == 0))
2115				next_index = -1;
2116			else {
2117				/* Read next leaf page */
2118				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2119					   nmp, PSIZE, np, rc);
2120				if (rc)
2121					next_index = -1;
2122				else {
2123					stbl = DT_GETSTBL(np);
2124					ldtentry =
2125					    (struct ldtentry *) & np->
2126					    slot[stbl[0]];
2127					next_index =
2128					    le32_to_cpu(ldtentry->index);
2129					DT_PUTPAGE(nmp);
2130				}
2131			}
2132		} else {
2133			ldtentry =
2134			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2135			next_index = le32_to_cpu(ldtentry->index);
2136		}
2137		free_index(tid, ip, table_index, next_index);
2138	}
2139	/*
2140	 * the leaf page becomes empty, delete the page
2141	 */
2142	if (p->header.nextindex == 1) {
2143		/* delete empty page */
2144		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2145	}
2146	/*
2147	 * the leaf page has other entries remaining:
2148	 *
2149	 * delete the entry from the leaf page.
2150	 */
2151	else {
2152		BT_MARK_DIRTY(mp, ip);
2153		/*
2154		 * acquire a transaction lock on the leaf page
2155		 */
2156		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2157		dtlck = (struct dt_lock *) & tlck->lock;
2158
2159		/*
2160		 * Do not assume that dtlck->index will be zero.  During a
2161		 * rename within a directory, this transaction may have
2162		 * modified this page already when adding the new entry.
2163		 */
2164
2165		/* linelock header */
2166		if (dtlck->index >= dtlck->maxcnt)
2167			dtlck = (struct dt_lock *) txLinelock(dtlck);
2168		lv = & dtlck->lv[dtlck->index];
2169		lv->offset = 0;
2170		lv->length = 1;
2171		dtlck->index++;
2172
2173		/* linelock stbl of non-root leaf page */
2174		if (!(p->header.flag & BT_ROOT)) {
2175			if (dtlck->index >= dtlck->maxcnt)
2176				dtlck = (struct dt_lock *) txLinelock(dtlck);
2177			lv = & dtlck->lv[dtlck->index];
2178			i = index >> L2DTSLOTSIZE;
2179			lv->offset = p->header.stblindex + i;
2180			lv->length =
2181			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2182			    i + 1;
2183			dtlck->index++;
2184		}
2185
2186		/* free the leaf entry */
2187		dtDeleteEntry(p, index, &dtlck);
2188
2189		/*
2190		 * Update directory index table for entries moved in stbl
2191		 */
2192		if (DO_INDEX(ip) && index < p->header.nextindex) {
2193			s64 lblock;
2194
2195			imp = NULL;
2196			stbl = DT_GETSTBL(p);
2197			for (i = index; i < p->header.nextindex; i++) {
2198				ldtentry =
2199				    (struct ldtentry *) & p->slot[stbl[i]];
2200				modify_index(tid, ip,
2201					     le32_to_cpu(ldtentry->index),
2202					     bn, i, &imp, &lblock);
2203			}
2204			if (imp)
2205				release_metapage(imp);
2206		}
2207
2208		DT_PUTPAGE(mp);
2209	}
2210
2211	return rc;
2212}
2213
2214
2215/*
2216 *	dtDeleteUp()
2217 *
2218 * function:
2219 *	free empty pages as propagating deletion up the tree
2220 *
2221 * parameter:
2222 *
2223 * return:
2224 */
2225static int dtDeleteUp(tid_t tid, struct inode *ip,
2226	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2227{
2228	int rc = 0;
2229	struct metapage *mp;
2230	dtpage_t *p;
2231	int index, nextindex;
2232	int xlen;
2233	struct btframe *parent;
2234	struct dt_lock *dtlck;
2235	struct tlock *tlck;
2236	struct lv *lv;
2237	struct pxd_lock *pxdlock;
2238	int i;
2239
2240	/*
2241	 *	keep the root leaf page which has become empty
2242	 */
2243	if (BT_IS_ROOT(fmp)) {
2244		/*
2245		 * reset the root
2246		 *
2247		 * dtInitRoot() acquires txlock on the root
2248		 */
2249		dtInitRoot(tid, ip, PARENT(ip));
2250
2251		DT_PUTPAGE(fmp);
2252
2253		return 0;
2254	}
2255
2256	/*
2257	 *	free the non-root leaf page
2258	 */
2259	/*
2260	 * acquire a transaction lock on the page
2261	 *
2262	 * write FREEXTENT|NOREDOPAGE log record
2263	 * N.B. linelock is overlaid as freed extent descriptor, and
2264	 * the buffer page is freed;
2265	 */
2266	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2267	pxdlock = (struct pxd_lock *) & tlck->lock;
2268	pxdlock->flag = mlckFREEPXD;
2269	pxdlock->pxd = fp->header.self;
2270	pxdlock->index = 1;
2271
2272	/* update sibling pointers */
2273	if ((rc = dtRelink(tid, ip, fp))) {
2274		BT_PUTPAGE(fmp);
2275		return rc;
2276	}
2277
2278	xlen = lengthPXD(&fp->header.self);
2279
2280	/* Free quota allocation. */
2281	dquot_free_block(ip, xlen);
2282
2283	/* free/invalidate its buffer page */
2284	discard_metapage(fmp);
2285
2286	/*
2287	 *	propagate page deletion up the directory tree
2288	 *
2289	 * If the delete from the parent page makes it empty,
2290	 * continue all the way up the tree.
2291	 * stop if the root page is reached (which is never deleted) or
2292	 * if the entry deletion does not empty the page.
2293	 */
2294	while ((parent = BT_POP(btstack)) != NULL) {
2295		/* pin the parent page <sp> */
2296		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2297		if (rc)
2298			return rc;
2299
2300		/*
2301		 * free the extent of the child page deleted
2302		 */
2303		index = parent->index;
2304
2305		/*
2306		 * delete the entry for the child page from parent
2307		 */
2308		nextindex = p->header.nextindex;
2309
2310		/*
2311		 * the parent has the single entry being deleted:
2312		 *
2313		 * free the parent page which has become empty.
2314		 */
2315		if (nextindex == 1) {
2316			/*
2317			 * keep the root internal page which has become empty
2318			 */
2319			if (p->header.flag & BT_ROOT) {
2320				/*
2321				 * reset the root
2322				 *
2323				 * dtInitRoot() acquires txlock on the root
2324				 */
2325				dtInitRoot(tid, ip, PARENT(ip));
2326
2327				DT_PUTPAGE(mp);
2328
2329				return 0;
2330			}
2331			/*
2332			 * free the parent page
2333			 */
2334			else {
2335				/*
2336				 * acquire a transaction lock on the page
2337				 *
2338				 * write FREEXTENT|NOREDOPAGE log record
2339				 */
2340				tlck =
2341				    txMaplock(tid, ip,
2342					      tlckDTREE | tlckFREE);
2343				pxdlock = (struct pxd_lock *) & tlck->lock;
2344				pxdlock->flag = mlckFREEPXD;
2345				pxdlock->pxd = p->header.self;
2346				pxdlock->index = 1;
2347
2348				/* update sibling pointers */
2349				if ((rc = dtRelink(tid, ip, p))) {
2350					DT_PUTPAGE(mp);
2351					return rc;
2352				}
2353
2354				xlen = lengthPXD(&p->header.self);
2355
2356				/* Free quota allocation */
2357				dquot_free_block(ip, xlen);
2358
2359				/* free/invalidate its buffer page */
2360				discard_metapage(mp);
2361
2362				/* propagate up */
2363				continue;
2364			}
2365		}
2366
2367		/*
2368		 * the parent has other entries remaining:
2369		 *
2370		 * delete the router entry from the parent page.
2371		 */
2372		BT_MARK_DIRTY(mp, ip);
2373		/*
2374		 * acquire a transaction lock on the page
2375		 *
2376		 * action: router entry deletion
2377		 */
2378		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2379		dtlck = (struct dt_lock *) & tlck->lock;
2380
2381		/* linelock header */
2382		if (dtlck->index >= dtlck->maxcnt)
2383			dtlck = (struct dt_lock *) txLinelock(dtlck);
2384		lv = & dtlck->lv[dtlck->index];
2385		lv->offset = 0;
2386		lv->length = 1;
2387		dtlck->index++;
2388
2389		/* linelock stbl of non-root leaf page */
2390		if (!(p->header.flag & BT_ROOT)) {
2391			if (dtlck->index < dtlck->maxcnt)
2392				lv++;
2393			else {
2394				dtlck = (struct dt_lock *) txLinelock(dtlck);
2395				lv = & dtlck->lv[0];
2396			}
2397			i = index >> L2DTSLOTSIZE;
2398			lv->offset = p->header.stblindex + i;
2399			lv->length =
2400			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2401			    i + 1;
2402			dtlck->index++;
2403		}
2404
2405		/* free the router entry */
2406		dtDeleteEntry(p, index, &dtlck);
2407
2408		/* reset key of new leftmost entry of level (for consistency) */
2409		if (index == 0 &&
2410		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2411			dtTruncateEntry(p, 0, &dtlck);
2412
2413		/* unpin the parent page */
2414		DT_PUTPAGE(mp);
2415
2416		/* exit propagation up */
2417		break;
2418	}
2419
2420	if (!DO_INDEX(ip))
2421		ip->i_size -= PSIZE;
2422
2423	return 0;
2424}
2425
2426#ifdef _NOTYET
2427/*
2428 * NAME:	dtRelocate()
2429 *
2430 * FUNCTION:	relocate dtpage (internal or leaf) of directory;
2431 *		This function is mainly used by defragfs utility.
2432 */
2433int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2434	       s64 nxaddr)
2435{
2436	int rc = 0;
2437	struct metapage *mp, *pmp, *lmp, *rmp;
2438	dtpage_t *p, *pp, *rp = 0, *lp= 0;
2439	s64 bn;
2440	int index;
2441	struct btstack btstack;
2442	pxd_t *pxd;
2443	s64 oxaddr, nextbn, prevbn;
2444	int xlen, xsize;
2445	struct tlock *tlck;
2446	struct dt_lock *dtlck;
2447	struct pxd_lock *pxdlock;
2448	s8 *stbl;
2449	struct lv *lv;
2450
2451	oxaddr = addressPXD(opxd);
2452	xlen = lengthPXD(opxd);
2453
2454	jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
2455		   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
2456		   xlen);
2457
2458	/*
2459	 *	1. get the internal parent dtpage covering
2460	 *	router entry for the tartget page to be relocated;
2461	 */
2462	rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2463	if (rc)
2464		return rc;
2465
2466	/* retrieve search result */
2467	DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2468	jfs_info("dtRelocate: parent router entry validated.");
2469
2470	/*
2471	 *	2. relocate the target dtpage
2472	 */
2473	/* read in the target page from src extent */
2474	DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2475	if (rc) {
2476		/* release the pinned parent page */
2477		DT_PUTPAGE(pmp);
2478		return rc;
2479	}
2480
2481	/*
2482	 * read in sibling pages if any to update sibling pointers;
2483	 */
2484	rmp = NULL;
2485	if (p->header.next) {
2486		nextbn = le64_to_cpu(p->header.next);
2487		DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2488		if (rc) {
2489			DT_PUTPAGE(mp);
2490			DT_PUTPAGE(pmp);
2491			return (rc);
2492		}
2493	}
2494
2495	lmp = NULL;
2496	if (p->header.prev) {
2497		prevbn = le64_to_cpu(p->header.prev);
2498		DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2499		if (rc) {
2500			DT_PUTPAGE(mp);
2501			DT_PUTPAGE(pmp);
2502			if (rmp)
2503				DT_PUTPAGE(rmp);
2504			return (rc);
2505		}
2506	}
2507
2508	/* at this point, all xtpages to be updated are in memory */
2509
2510	/*
2511	 * update sibling pointers of sibling dtpages if any;
2512	 */
2513	if (lmp) {
2514		tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
2515		dtlck = (struct dt_lock *) & tlck->lock;
2516		/* linelock header */
2517		ASSERT(dtlck->index == 0);
2518		lv = & dtlck->lv[0];
2519		lv->offset = 0;
2520		lv->length = 1;
2521		dtlck->index++;
2522
2523		lp->header.next = cpu_to_le64(nxaddr);
2524		DT_PUTPAGE(lmp);
2525	}
2526
2527	if (rmp) {
2528		tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
2529		dtlck = (struct dt_lock *) & tlck->lock;
2530		/* linelock header */
2531		ASSERT(dtlck->index == 0);
2532		lv = & dtlck->lv[0];
2533		lv->offset = 0;
2534		lv->length = 1;
2535		dtlck->index++;
2536
2537		rp->header.prev = cpu_to_le64(nxaddr);
2538		DT_PUTPAGE(rmp);
2539	}
2540
2541	/*
2542	 * update the target dtpage to be relocated
2543	 *
2544	 * write LOG_REDOPAGE of LOG_NEW type for dst page
2545	 * for the whole target page (logredo() will apply
2546	 * after image and update bmap for allocation of the
2547	 * dst extent), and update bmap for allocation of
2548	 * the dst extent;
2549	 */
2550	tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
2551	dtlck = (struct dt_lock *) & tlck->lock;
2552	/* linelock header */
2553	ASSERT(dtlck->index == 0);
2554	lv = & dtlck->lv[0];
2555
2556	/* update the self address in the dtpage header */
2557	pxd = &p->header.self;
2558	PXDaddress(pxd, nxaddr);
2559
2560	/* the dst page is the same as the src page, i.e.,
2561	 * linelock for afterimage of the whole page;
2562	 */
2563	lv->offset = 0;
2564	lv->length = p->header.maxslot;
2565	dtlck->index++;
2566
2567	/* update the buffer extent descriptor of the dtpage */
2568	xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2569
2570	/* unpin the relocated page */
2571	DT_PUTPAGE(mp);
2572	jfs_info("dtRelocate: target dtpage relocated.");
2573
2574	/* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
2575	 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
2576	 * will also force a bmap update ).
2577	 */
2578
2579	/*
2580	 *	3. acquire maplock for the source extent to be freed;
2581	 */
2582	/* for dtpage relocation, write a LOG_NOREDOPAGE record
2583	 * for the source dtpage (logredo() will init NoRedoPage
2584	 * filter and will also update bmap for free of the source
2585	 * dtpage), and upadte bmap for free of the source dtpage;
2586	 */
2587	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2588	pxdlock = (struct pxd_lock *) & tlck->lock;
2589	pxdlock->flag = mlckFREEPXD;
2590	PXDaddress(&pxdlock->pxd, oxaddr);
2591	PXDlength(&pxdlock->pxd, xlen);
2592	pxdlock->index = 1;
2593
2594	/*
2595	 *	4. update the parent router entry for relocation;
2596	 *
2597	 * acquire tlck for the parent entry covering the target dtpage;
2598	 * write LOG_REDOPAGE to apply after image only;
2599	 */
2600	jfs_info("dtRelocate: update parent router entry.");
2601	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
2602	dtlck = (struct dt_lock *) & tlck->lock;
2603	lv = & dtlck->lv[dtlck->index];
2604
2605	/* update the PXD with the new address */
2606	stbl = DT_GETSTBL(pp);
2607	pxd = (pxd_t *) & pp->slot[stbl[index]];
2608	PXDaddress(pxd, nxaddr);
2609	lv->offset = stbl[index];
2610	lv->length = 1;
2611	dtlck->index++;
2612
2613	/* unpin the parent dtpage */
2614	DT_PUTPAGE(pmp);
2615
2616	return rc;
2617}
2618
2619/*
2620 * NAME:	dtSearchNode()
2621 *
2622 * FUNCTION:	Search for an dtpage containing a specified address
2623 *		This function is mainly used by defragfs utility.
2624 *
2625 * NOTE:	Search result on stack, the found page is pinned at exit.
2626 *		The result page must be an internal dtpage.
2627 *		lmxaddr give the address of the left most page of the
2628 *		dtree level, in which the required dtpage resides.
2629 */
2630static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2631			struct btstack * btstack)
2632{
2633	int rc = 0;
2634	s64 bn;
2635	struct metapage *mp;
2636	dtpage_t *p;
2637	int psize = 288;	/* initial in-line directory */
2638	s8 *stbl;
2639	int i;
2640	pxd_t *pxd;
2641	struct btframe *btsp;
2642
2643	BT_CLR(btstack);	/* reset stack */
2644
2645	/*
2646	 *	descend tree to the level with specified leftmost page
2647	 *
2648	 *  by convention, root bn = 0.
2649	 */
2650	for (bn = 0;;) {
2651		/* get/pin the page to search */
2652		DT_GETPAGE(ip, bn, mp, psize, p, rc);
2653		if (rc)
2654			return rc;
2655
2656		/* does the xaddr of leftmost page of the levevl
2657		 * matches levevl search key ?
2658		 */
2659		if (p->header.flag & BT_ROOT) {
2660			if (lmxaddr == 0)
2661				break;
2662		} else if (addressPXD(&p->header.self) == lmxaddr)
2663			break;
2664
2665		/*
2666		 * descend down to leftmost child page
2667		 */
2668		if (p->header.flag & BT_LEAF) {
2669			DT_PUTPAGE(mp);
2670			return -ESTALE;
2671		}
2672
2673		/* get the leftmost entry */
2674		stbl = DT_GETSTBL(p);
2675		pxd = (pxd_t *) & p->slot[stbl[0]];
2676
2677		/* get the child page block address */
2678		bn = addressPXD(pxd);
2679		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
2680		/* unpin the parent page */
2681		DT_PUTPAGE(mp);
2682	}
2683
2684	/*
2685	 *	search each page at the current levevl
2686	 */
2687      loop:
2688	stbl = DT_GETSTBL(p);
2689	for (i = 0; i < p->header.nextindex; i++) {
2690		pxd = (pxd_t *) & p->slot[stbl[i]];
2691
2692		/* found the specified router entry */
2693		if (addressPXD(pxd) == addressPXD(kpxd) &&
2694		    lengthPXD(pxd) == lengthPXD(kpxd)) {
2695			btsp = btstack->top;
2696			btsp->bn = bn;
2697			btsp->index = i;
2698			btsp->mp = mp;
2699
2700			return 0;
2701		}
2702	}
2703
2704	/* get the right sibling page if any */
2705	if (p->header.next)
2706		bn = le64_to_cpu(p->header.next);
2707	else {
2708		DT_PUTPAGE(mp);
2709		return -ESTALE;
2710	}
2711
2712	/* unpin current page */
2713	DT_PUTPAGE(mp);
2714
2715	/* get the right sibling page */
2716	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2717	if (rc)
2718		return rc;
2719
2720	goto loop;
2721}
2722#endif /* _NOTYET */
2723
2724/*
2725 *	dtRelink()
2726 *
2727 * function:
2728 *	link around a freed page.
2729 *
2730 * parameter:
2731 *	fp:	page to be freed
2732 *
2733 * return:
2734 */
2735static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2736{
2737	int rc;
2738	struct metapage *mp;
2739	s64 nextbn, prevbn;
2740	struct tlock *tlck;
2741	struct dt_lock *dtlck;
2742	struct lv *lv;
2743
2744	nextbn = le64_to_cpu(p->header.next);
2745	prevbn = le64_to_cpu(p->header.prev);
2746
2747	/* update prev pointer of the next page */
2748	if (nextbn != 0) {
2749		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2750		if (rc)
2751			return rc;
2752
2753		BT_MARK_DIRTY(mp, ip);
2754		/*
2755		 * acquire a transaction lock on the next page
2756		 *
2757		 * action: update prev pointer;
2758		 */
2759		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2760		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2761			tlck, ip, mp);
2762		dtlck = (struct dt_lock *) & tlck->lock;
2763
2764		/* linelock header */
2765		if (dtlck->index >= dtlck->maxcnt)
2766			dtlck = (struct dt_lock *) txLinelock(dtlck);
2767		lv = & dtlck->lv[dtlck->index];
2768		lv->offset = 0;
2769		lv->length = 1;
2770		dtlck->index++;
2771
2772		p->header.prev = cpu_to_le64(prevbn);
2773		DT_PUTPAGE(mp);
2774	}
2775
2776	/* update next pointer of the previous page */
2777	if (prevbn != 0) {
2778		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2779		if (rc)
2780			return rc;
2781
2782		BT_MARK_DIRTY(mp, ip);
2783		/*
2784		 * acquire a transaction lock on the prev page
2785		 *
2786		 * action: update next pointer;
2787		 */
2788		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2789		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2790			tlck, ip, mp);
2791		dtlck = (struct dt_lock *) & tlck->lock;
2792
2793		/* linelock header */
2794		if (dtlck->index >= dtlck->maxcnt)
2795			dtlck = (struct dt_lock *) txLinelock(dtlck);
2796		lv = & dtlck->lv[dtlck->index];
2797		lv->offset = 0;
2798		lv->length = 1;
2799		dtlck->index++;
2800
2801		p->header.next = cpu_to_le64(nextbn);
2802		DT_PUTPAGE(mp);
2803	}
2804
2805	return 0;
2806}
2807
2808
2809/*
2810 *	dtInitRoot()
2811 *
2812 * initialize directory root (inline in inode)
2813 */
2814void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2815{
2816	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2817	dtroot_t *p;
2818	int fsi;
2819	struct dtslot *f;
2820	struct tlock *tlck;
2821	struct dt_lock *dtlck;
2822	struct lv *lv;
2823	u16 xflag_save;
2824
2825	/*
2826	 * If this was previously an non-empty directory, we need to remove
2827	 * the old directory table.
2828	 */
2829	if (DO_INDEX(ip)) {
2830		if (!jfs_dirtable_inline(ip)) {
2831			struct tblock *tblk = tid_to_tblock(tid);
2832			/*
2833			 * We're playing games with the tid's xflag.  If
2834			 * we're removing a regular file, the file's xtree
2835			 * is committed with COMMIT_PMAP, but we always
2836			 * commit the directories xtree with COMMIT_PWMAP.
2837			 */
2838			xflag_save = tblk->xflag;
2839			tblk->xflag = 0;
2840			/*
2841			 * xtTruncate isn't guaranteed to fully truncate
2842			 * the xtree.  The caller needs to check i_size
2843			 * after committing the transaction to see if
2844			 * additional truncation is needed.  The
2845			 * COMMIT_Stale flag tells caller that we
2846			 * initiated the truncation.
2847			 */
2848			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2849			set_cflag(COMMIT_Stale, ip);
2850
2851			tblk->xflag = xflag_save;
2852		} else
2853			ip->i_size = 1;
2854
2855		jfs_ip->next_index = 2;
2856	} else
2857		ip->i_size = IDATASIZE;
2858
2859	/*
2860	 * acquire a transaction lock on the root
2861	 *
2862	 * action: directory initialization;
2863	 */
2864	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2865		      tlckDTREE | tlckENTRY | tlckBTROOT);
2866	dtlck = (struct dt_lock *) & tlck->lock;
2867
2868	/* linelock root */
2869	ASSERT(dtlck->index == 0);
2870	lv = & dtlck->lv[0];
2871	lv->offset = 0;
2872	lv->length = DTROOTMAXSLOT;
2873	dtlck->index++;
2874
2875	p = &jfs_ip->i_dtroot;
2876
2877	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2878
2879	p->header.nextindex = 0;
2880
2881	/* init freelist */
2882	fsi = 1;
2883	f = &p->slot[fsi];
2884
2885	/* init data area of root */
2886	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2887		f->next = fsi;
2888	f->next = -1;
2889
2890	p->header.freelist = 1;
2891	p->header.freecnt = 8;
2892
2893	/* init '..' entry */
2894	p->header.idotdot = cpu_to_le32(idotdot);
2895
2896	return;
2897}
2898
2899/*
2900 *	add_missing_indices()
2901 *
2902 * function: Fix dtree page in which one or more entries has an invalid index.
2903 *	     fsck.jfs should really fix this, but it currently does not.
2904 *	     Called from jfs_readdir when bad index is detected.
2905 */
2906static void add_missing_indices(struct inode *inode, s64 bn)
2907{
2908	struct ldtentry *d;
2909	struct dt_lock *dtlck;
2910	int i;
2911	uint index;
2912	struct lv *lv;
2913	struct metapage *mp;
2914	dtpage_t *p;
2915	int rc;
2916	s8 *stbl;
2917	tid_t tid;
2918	struct tlock *tlck;
2919
2920	tid = txBegin(inode->i_sb, 0);
2921
2922	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2923
2924	if (rc) {
2925		printk(KERN_ERR "DT_GETPAGE failed!\n");
2926		goto end;
2927	}
2928	BT_MARK_DIRTY(mp, inode);
2929
2930	ASSERT(p->header.flag & BT_LEAF);
2931
2932	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2933	if (BT_IS_ROOT(mp))
2934		tlck->type |= tlckBTROOT;
2935
2936	dtlck = (struct dt_lock *) &tlck->lock;
2937
2938	stbl = DT_GETSTBL(p);
2939	for (i = 0; i < p->header.nextindex; i++) {
2940		d = (struct ldtentry *) &p->slot[stbl[i]];
2941		index = le32_to_cpu(d->index);
2942		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2943			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2944			if (dtlck->index >= dtlck->maxcnt)
2945				dtlck = (struct dt_lock *) txLinelock(dtlck);
2946			lv = &dtlck->lv[dtlck->index];
2947			lv->offset = stbl[i];
2948			lv->length = 1;
2949			dtlck->index++;
2950		}
2951	}
2952
2953	DT_PUTPAGE(mp);
2954	(void) txCommit(tid, 1, &inode, 0);
2955end:
2956	txEnd(tid);
2957}
2958
2959/*
2960 * Buffer to hold directory entry info while traversing a dtree page
2961 * before being fed to the filldir function
2962 */
2963struct jfs_dirent {
2964	loff_t position;
2965	int ino;
2966	u16 name_len;
2967	char name[0];
2968};
2969
2970/*
2971 * function to determine next variable-sized jfs_dirent in buffer
2972 */
2973static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2974{
2975	return (struct jfs_dirent *)
2976		((char *)dirent +
2977		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2978		   sizeof (loff_t) - 1) &
2979		  ~(sizeof (loff_t) - 1)));
2980}
2981
2982/*
2983 *	jfs_readdir()
2984 *
2985 * function: read directory entries sequentially
2986 *	from the specified entry offset
2987 *
2988 * parameter:
2989 *
2990 * return: offset = (pn, index) of start entry
2991 *	of next jfs_readdir()/dtRead()
2992 */
2993int jfs_readdir(struct file *file, struct dir_context *ctx)
2994{
2995	struct inode *ip = file_inode(file);
2996	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
2997	int rc = 0;
2998	loff_t dtpos;	/* legacy OS/2 style position */
2999	struct dtoffset {
3000		s16 pn;
3001		s16 index;
3002		s32 unused;
3003	} *dtoffset = (struct dtoffset *) &dtpos;
3004	s64 bn;
3005	struct metapage *mp;
3006	dtpage_t *p;
3007	int index;
3008	s8 *stbl;
3009	struct btstack btstack;
3010	int i, next;
3011	struct ldtentry *d;
3012	struct dtslot *t;
3013	int d_namleft, len, outlen;
3014	unsigned long dirent_buf;
3015	char *name_ptr;
3016	u32 dir_index;
3017	int do_index = 0;
3018	uint loop_count = 0;
3019	struct jfs_dirent *jfs_dirent;
3020	int jfs_dirents;
3021	int overflow, fix_page, page_fixed = 0;
3022	static int unique_pos = 2;	/* If we can't fix broken index */
3023
3024	if (ctx->pos == DIREND)
3025		return 0;
3026
3027	if (DO_INDEX(ip)) {
3028		/*
3029		 * persistent index is stored in directory entries.
3030		 * Special cases:	 0 = .
3031		 *			 1 = ..
3032		 *			-1 = End of directory
3033		 */
3034		do_index = 1;
3035
3036		dir_index = (u32) ctx->pos;
3037
3038		/*
3039		 * NFSv4 reserves cookies 1 and 2 for . and .. so the value
3040		 * we return to the vfs is one greater than the one we use
3041		 * internally.
3042		 */
3043		if (dir_index)
3044			dir_index--;
3045
3046		if (dir_index > 1) {
3047			struct dir_table_slot dirtab_slot;
3048
3049			if (dtEmpty(ip) ||
3050			    (dir_index >= JFS_IP(ip)->next_index)) {
3051				/* Stale position.  Directory has shrunk */
3052				ctx->pos = DIREND;
3053				return 0;
3054			}
3055		      repeat:
3056			rc = read_index(ip, dir_index, &dirtab_slot);
3057			if (rc) {
3058				ctx->pos = DIREND;
3059				return rc;
3060			}
3061			if (dirtab_slot.flag == DIR_INDEX_FREE) {
3062				if (loop_count++ > JFS_IP(ip)->next_index) {
3063					jfs_err("jfs_readdir detected infinite loop!");
3064					ctx->pos = DIREND;
 
3065					return 0;
3066				}
3067				dir_index = le32_to_cpu(dirtab_slot.addr2);
3068				if (dir_index == -1) {
3069					ctx->pos = DIREND;
3070					return 0;
3071				}
3072				goto repeat;
3073			}
3074			bn = addressDTS(&dirtab_slot);
3075			index = dirtab_slot.slot;
3076			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3077			if (rc) {
3078				ctx->pos = DIREND;
3079				return 0;
3080			}
3081			if (p->header.flag & BT_INTERNAL) {
3082				jfs_err("jfs_readdir: bad index table");
3083				DT_PUTPAGE(mp);
3084				ctx->pos = DIREND;
3085				return 0;
3086			}
3087		} else {
3088			if (dir_index == 0) {
3089				/*
3090				 * self "."
3091				 */
3092				ctx->pos = 1;
3093				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 
3094					return 0;
3095			}
3096			/*
3097			 * parent ".."
3098			 */
3099			ctx->pos = 2;
3100			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3101				return 0;
3102
3103			/*
3104			 * Find first entry of left-most leaf
3105			 */
3106			if (dtEmpty(ip)) {
3107				ctx->pos = DIREND;
3108				return 0;
3109			}
3110
3111			if ((rc = dtReadFirst(ip, &btstack)))
3112				return rc;
3113
3114			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3115		}
3116	} else {
3117		/*
3118		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3119		 *
3120		 * pn = 0; index = 1:	First entry "."
3121		 * pn = 0; index = 2:	Second entry ".."
3122		 * pn > 0:		Real entries, pn=1 -> leftmost page
3123		 * pn = index = -1:	No more entries
3124		 */
3125		dtpos = ctx->pos;
3126		if (dtpos < 2) {
3127			/* build "." entry */
3128			ctx->pos = 1;
3129			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 
3130				return 0;
3131			dtoffset->index = 2;
3132			ctx->pos = dtpos;
3133		}
3134
3135		if (dtoffset->pn == 0) {
3136			if (dtoffset->index == 2) {
3137				/* build ".." entry */
3138				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 
 
3139					return 0;
3140			} else {
3141				jfs_err("jfs_readdir called with invalid offset!");
 
3142			}
3143			dtoffset->pn = 1;
3144			dtoffset->index = 0;
3145			ctx->pos = dtpos;
3146		}
3147
3148		if (dtEmpty(ip)) {
3149			ctx->pos = DIREND;
3150			return 0;
3151		}
3152
3153		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
3154			jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext",
3155				rc);
3156			ctx->pos = DIREND;
3157			return 0;
3158		}
3159		/* get start leaf page and index */
3160		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3161
3162		/* offset beyond directory eof ? */
3163		if (bn < 0) {
3164			ctx->pos = DIREND;
3165			return 0;
3166		}
3167	}
3168
3169	dirent_buf = __get_free_page(GFP_KERNEL);
3170	if (dirent_buf == 0) {
3171		DT_PUTPAGE(mp);
3172		jfs_warn("jfs_readdir: __get_free_page failed!");
3173		ctx->pos = DIREND;
3174		return -ENOMEM;
3175	}
3176
3177	while (1) {
3178		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3179		jfs_dirents = 0;
3180		overflow = fix_page = 0;
3181
3182		stbl = DT_GETSTBL(p);
3183
3184		for (i = index; i < p->header.nextindex; i++) {
3185			d = (struct ldtentry *) & p->slot[stbl[i]];
3186
3187			if (((long) jfs_dirent + d->namlen + 1) >
3188			    (dirent_buf + PAGE_SIZE)) {
3189				/* DBCS codepages could overrun dirent_buf */
3190				index = i;
3191				overflow = 1;
3192				break;
3193			}
3194
3195			d_namleft = d->namlen;
3196			name_ptr = jfs_dirent->name;
3197			jfs_dirent->ino = le32_to_cpu(d->inumber);
3198
3199			if (do_index) {
3200				len = min(d_namleft, DTLHDRDATALEN);
3201				jfs_dirent->position = le32_to_cpu(d->index);
3202				/*
3203				 * d->index should always be valid, but it
3204				 * isn't.  fsck.jfs doesn't create the
3205				 * directory index for the lost+found
3206				 * directory.  Rather than let it go,
3207				 * we can try to fix it.
3208				 */
3209				if ((jfs_dirent->position < 2) ||
3210				    (jfs_dirent->position >=
3211				     JFS_IP(ip)->next_index)) {
3212					if (!page_fixed && !isReadOnly(ip)) {
3213						fix_page = 1;
3214						/*
3215						 * setting overflow and setting
3216						 * index to i will cause the
3217						 * same page to be processed
3218						 * again starting here
3219						 */
3220						overflow = 1;
3221						index = i;
3222						break;
3223					}
3224					jfs_dirent->position = unique_pos++;
3225				}
3226				/*
3227				 * We add 1 to the index because we may
3228				 * use a value of 2 internally, and NFSv4
3229				 * doesn't like that.
3230				 */
3231				jfs_dirent->position++;
3232			} else {
3233				jfs_dirent->position = dtpos;
3234				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
3235			}
3236
3237			/* copy the name of head/only segment */
3238			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
3239						   codepage);
3240			jfs_dirent->name_len = outlen;
3241
3242			/* copy name in the additional segment(s) */
3243			next = d->next;
3244			while (next >= 0) {
3245				t = (struct dtslot *) & p->slot[next];
3246				name_ptr += outlen;
3247				d_namleft -= len;
3248				/* Sanity Check */
3249				if (d_namleft == 0) {
3250					jfs_error(ip->i_sb,
3251						  "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
 
3252						  (long)ip->i_ino,
3253						  (long long)bn,
3254						  i);
3255					goto skip_one;
3256				}
3257				len = min(d_namleft, DTSLOTDATALEN);
3258				outlen = jfs_strfromUCS_le(name_ptr, t->name,
3259							   len, codepage);
3260				jfs_dirent->name_len += outlen;
3261
3262				next = t->next;
3263			}
3264
3265			jfs_dirents++;
3266			jfs_dirent = next_jfs_dirent(jfs_dirent);
3267skip_one:
3268			if (!do_index)
3269				dtoffset->index++;
3270		}
3271
3272		if (!overflow) {
3273			/* Point to next leaf page */
3274			if (p->header.flag & BT_ROOT)
3275				bn = 0;
3276			else {
3277				bn = le64_to_cpu(p->header.next);
3278				index = 0;
3279				/* update offset (pn:index) for new page */
3280				if (!do_index) {
3281					dtoffset->pn++;
3282					dtoffset->index = 0;
3283				}
3284			}
3285			page_fixed = 0;
3286		}
3287
3288		/* unpin previous leaf page */
3289		DT_PUTPAGE(mp);
3290
3291		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3292		while (jfs_dirents--) {
3293			ctx->pos = jfs_dirent->position;
3294			if (!dir_emit(ctx, jfs_dirent->name,
3295				    jfs_dirent->name_len,
3296				    jfs_dirent->ino, DT_UNKNOWN))
3297				goto out;
3298			jfs_dirent = next_jfs_dirent(jfs_dirent);
3299		}
3300
3301		if (fix_page) {
3302			add_missing_indices(ip, bn);
3303			page_fixed = 1;
3304		}
3305
3306		if (!overflow && (bn == 0)) {
3307			ctx->pos = DIREND;
3308			break;
3309		}
3310
3311		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3312		if (rc) {
3313			free_page(dirent_buf);
3314			return rc;
3315		}
3316	}
3317
3318      out:
3319	free_page(dirent_buf);
3320
3321	return rc;
3322}
3323
3324
3325/*
3326 *	dtReadFirst()
3327 *
3328 * function: get the leftmost page of the directory
3329 */
3330static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3331{
3332	int rc = 0;
3333	s64 bn;
3334	int psize = 288;	/* initial in-line directory */
3335	struct metapage *mp;
3336	dtpage_t *p;
3337	s8 *stbl;
3338	struct btframe *btsp;
3339	pxd_t *xd;
3340
3341	BT_CLR(btstack);	/* reset stack */
3342
3343	/*
3344	 *	descend leftmost path of the tree
3345	 *
3346	 * by convention, root bn = 0.
3347	 */
3348	for (bn = 0;;) {
3349		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3350		if (rc)
3351			return rc;
3352
3353		/*
3354		 * leftmost leaf page
3355		 */
3356		if (p->header.flag & BT_LEAF) {
3357			/* return leftmost entry */
3358			btsp = btstack->top;
3359			btsp->bn = bn;
3360			btsp->index = 0;
3361			btsp->mp = mp;
3362
3363			return 0;
3364		}
3365
3366		/*
3367		 * descend down to leftmost child page
3368		 */
3369		if (BT_STACK_FULL(btstack)) {
3370			DT_PUTPAGE(mp);
3371			jfs_error(ip->i_sb, "btstack overrun\n");
3372			BT_STACK_DUMP(btstack);
3373			return -EIO;
3374		}
3375		/* push (bn, index) of the parent page/entry */
3376		BT_PUSH(btstack, bn, 0);
3377
3378		/* get the leftmost entry */
3379		stbl = DT_GETSTBL(p);
3380		xd = (pxd_t *) & p->slot[stbl[0]];
3381
3382		/* get the child page block address */
3383		bn = addressPXD(xd);
3384		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3385
3386		/* unpin the parent page */
3387		DT_PUTPAGE(mp);
3388	}
3389}
3390
3391
3392/*
3393 *	dtReadNext()
3394 *
3395 * function: get the page of the specified offset (pn:index)
3396 *
3397 * return: if (offset > eof), bn = -1;
3398 *
3399 * note: if index > nextindex of the target leaf page,
3400 * start with 1st entry of next leaf page;
3401 */
3402static int dtReadNext(struct inode *ip, loff_t * offset,
3403		      struct btstack * btstack)
3404{
3405	int rc = 0;
3406	struct dtoffset {
3407		s16 pn;
3408		s16 index;
3409		s32 unused;
3410	} *dtoffset = (struct dtoffset *) offset;
3411	s64 bn;
3412	struct metapage *mp;
3413	dtpage_t *p;
3414	int index;
3415	int pn;
3416	s8 *stbl;
3417	struct btframe *btsp, *parent;
3418	pxd_t *xd;
3419
3420	/*
3421	 * get leftmost leaf page pinned
3422	 */
3423	if ((rc = dtReadFirst(ip, btstack)))
3424		return rc;
3425
3426	/* get leaf page */
3427	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3428
3429	/* get the start offset (pn:index) */
3430	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3431	index = dtoffset->index;
3432
3433	/* start at leftmost page ? */
3434	if (pn == 0) {
3435		/* offset beyond eof ? */
3436		if (index < p->header.nextindex)
3437			goto out;
3438
3439		if (p->header.flag & BT_ROOT) {
3440			bn = -1;
3441			goto out;
3442		}
3443
3444		/* start with 1st entry of next leaf page */
3445		dtoffset->pn++;
3446		dtoffset->index = index = 0;
3447		goto a;
3448	}
3449
3450	/* start at non-leftmost page: scan parent pages for large pn */
3451	if (p->header.flag & BT_ROOT) {
3452		bn = -1;
3453		goto out;
3454	}
3455
3456	/* start after next leaf page ? */
3457	if (pn > 1)
3458		goto b;
3459
3460	/* get leaf page pn = 1 */
3461      a:
3462	bn = le64_to_cpu(p->header.next);
3463
3464	/* unpin leaf page */
3465	DT_PUTPAGE(mp);
3466
3467	/* offset beyond eof ? */
3468	if (bn == 0) {
3469		bn = -1;
3470		goto out;
3471	}
3472
3473	goto c;
3474
3475	/*
3476	 * scan last internal page level to get target leaf page
3477	 */
3478      b:
3479	/* unpin leftmost leaf page */
3480	DT_PUTPAGE(mp);
3481
3482	/* get left most parent page */
3483	btsp = btstack->top;
3484	parent = btsp - 1;
3485	bn = parent->bn;
3486	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3487	if (rc)
3488		return rc;
3489
3490	/* scan parent pages at last internal page level */
3491	while (pn >= p->header.nextindex) {
3492		pn -= p->header.nextindex;
3493
3494		/* get next parent page address */
3495		bn = le64_to_cpu(p->header.next);
3496
3497		/* unpin current parent page */
3498		DT_PUTPAGE(mp);
3499
3500		/* offset beyond eof ? */
3501		if (bn == 0) {
3502			bn = -1;
3503			goto out;
3504		}
3505
3506		/* get next parent page */
3507		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3508		if (rc)
3509			return rc;
3510
3511		/* update parent page stack frame */
3512		parent->bn = bn;
3513	}
3514
3515	/* get leaf page address */
3516	stbl = DT_GETSTBL(p);
3517	xd = (pxd_t *) & p->slot[stbl[pn]];
3518	bn = addressPXD(xd);
3519
3520	/* unpin parent page */
3521	DT_PUTPAGE(mp);
3522
3523	/*
3524	 * get target leaf page
3525	 */
3526      c:
3527	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3528	if (rc)
3529		return rc;
3530
3531	/*
3532	 * leaf page has been completed:
3533	 * start with 1st entry of next leaf page
3534	 */
3535	if (index >= p->header.nextindex) {
3536		bn = le64_to_cpu(p->header.next);
3537
3538		/* unpin leaf page */
3539		DT_PUTPAGE(mp);
3540
3541		/* offset beyond eof ? */
3542		if (bn == 0) {
3543			bn = -1;
3544			goto out;
3545		}
3546
3547		/* get next leaf page */
3548		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3549		if (rc)
3550			return rc;
3551
3552		/* start with 1st entry of next leaf page */
3553		dtoffset->pn++;
3554		dtoffset->index = 0;
3555	}
3556
3557      out:
3558	/* return target leaf page pinned */
3559	btsp = btstack->top;
3560	btsp->bn = bn;
3561	btsp->index = dtoffset->index;
3562	btsp->mp = mp;
3563
3564	return 0;
3565}
3566
3567
3568/*
3569 *	dtCompare()
3570 *
3571 * function: compare search key with an internal entry
3572 *
3573 * return:
3574 *	< 0 if k is < record
3575 *	= 0 if k is = record
3576 *	> 0 if k is > record
3577 */
3578static int dtCompare(struct component_name * key,	/* search key */
3579		     dtpage_t * p,	/* directory page */
3580		     int si)
3581{				/* entry slot index */
3582	wchar_t *kname;
3583	__le16 *name;
3584	int klen, namlen, len, rc;
3585	struct idtentry *ih;
3586	struct dtslot *t;
3587
3588	/*
3589	 * force the left-most key on internal pages, at any level of
3590	 * the tree, to be less than any search key.
3591	 * this obviates having to update the leftmost key on an internal
3592	 * page when the user inserts a new key in the tree smaller than
3593	 * anything that has been stored.
3594	 *
3595	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3596	 * at any internal page at any level of the tree,
3597	 * it descends to child of the entry anyway -
3598	 * ? make the entry as min size dummy entry)
3599	 *
3600	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3601	 * return (1);
3602	 */
3603
3604	kname = key->name;
3605	klen = key->namlen;
3606
3607	ih = (struct idtentry *) & p->slot[si];
3608	si = ih->next;
3609	name = ih->name;
3610	namlen = ih->namlen;
3611	len = min(namlen, DTIHDRDATALEN);
3612
3613	/* compare with head/only segment */
3614	len = min(klen, len);
3615	if ((rc = UniStrncmp_le(kname, name, len)))
3616		return rc;
3617
3618	klen -= len;
3619	namlen -= len;
3620
3621	/* compare with additional segment(s) */
3622	kname += len;
3623	while (klen > 0 && namlen > 0) {
3624		/* compare with next name segment */
3625		t = (struct dtslot *) & p->slot[si];
3626		len = min(namlen, DTSLOTDATALEN);
3627		len = min(klen, len);
3628		name = t->name;
3629		if ((rc = UniStrncmp_le(kname, name, len)))
3630			return rc;
3631
3632		klen -= len;
3633		namlen -= len;
3634		kname += len;
3635		si = t->next;
3636	}
3637
3638	return (klen - namlen);
3639}
3640
3641
3642
3643
3644/*
3645 *	ciCompare()
3646 *
3647 * function: compare search key with an (leaf/internal) entry
3648 *
3649 * return:
3650 *	< 0 if k is < record
3651 *	= 0 if k is = record
3652 *	> 0 if k is > record
3653 */
3654static int ciCompare(struct component_name * key,	/* search key */
3655		     dtpage_t * p,	/* directory page */
3656		     int si,	/* entry slot index */
3657		     int flag)
3658{
3659	wchar_t *kname, x;
3660	__le16 *name;
3661	int klen, namlen, len, rc;
3662	struct ldtentry *lh;
3663	struct idtentry *ih;
3664	struct dtslot *t;
3665	int i;
3666
3667	/*
3668	 * force the left-most key on internal pages, at any level of
3669	 * the tree, to be less than any search key.
3670	 * this obviates having to update the leftmost key on an internal
3671	 * page when the user inserts a new key in the tree smaller than
3672	 * anything that has been stored.
3673	 *
3674	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3675	 * at any internal page at any level of the tree,
3676	 * it descends to child of the entry anyway -
3677	 * ? make the entry as min size dummy entry)
3678	 *
3679	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3680	 * return (1);
3681	 */
3682
3683	kname = key->name;
3684	klen = key->namlen;
3685
3686	/*
3687	 * leaf page entry
3688	 */
3689	if (p->header.flag & BT_LEAF) {
3690		lh = (struct ldtentry *) & p->slot[si];
3691		si = lh->next;
3692		name = lh->name;
3693		namlen = lh->namlen;
3694		if (flag & JFS_DIR_INDEX)
3695			len = min(namlen, DTLHDRDATALEN);
3696		else
3697			len = min(namlen, DTLHDRDATALEN_LEGACY);
3698	}
3699	/*
3700	 * internal page entry
3701	 */
3702	else {
3703		ih = (struct idtentry *) & p->slot[si];
3704		si = ih->next;
3705		name = ih->name;
3706		namlen = ih->namlen;
3707		len = min(namlen, DTIHDRDATALEN);
3708	}
3709
3710	/* compare with head/only segment */
3711	len = min(klen, len);
3712	for (i = 0; i < len; i++, kname++, name++) {
3713		/* only uppercase if case-insensitive support is on */
3714		if ((flag & JFS_OS2) == JFS_OS2)
3715			x = UniToupper(le16_to_cpu(*name));
3716		else
3717			x = le16_to_cpu(*name);
3718		if ((rc = *kname - x))
3719			return rc;
3720	}
3721
3722	klen -= len;
3723	namlen -= len;
3724
3725	/* compare with additional segment(s) */
3726	while (klen > 0 && namlen > 0) {
3727		/* compare with next name segment */
3728		t = (struct dtslot *) & p->slot[si];
3729		len = min(namlen, DTSLOTDATALEN);
3730		len = min(klen, len);
3731		name = t->name;
3732		for (i = 0; i < len; i++, kname++, name++) {
3733			/* only uppercase if case-insensitive support is on */
3734			if ((flag & JFS_OS2) == JFS_OS2)
3735				x = UniToupper(le16_to_cpu(*name));
3736			else
3737				x = le16_to_cpu(*name);
3738
3739			if ((rc = *kname - x))
3740				return rc;
3741		}
3742
3743		klen -= len;
3744		namlen -= len;
3745		si = t->next;
3746	}
3747
3748	return (klen - namlen);
3749}
3750
3751
3752/*
3753 *	ciGetLeafPrefixKey()
3754 *
3755 * function: compute prefix of suffix compression
3756 *	     from two adjacent leaf entries
3757 *	     across page boundary
3758 *
3759 * return: non-zero on error
3760 *
3761 */
3762static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3763			       int ri, struct component_name * key, int flag)
3764{
3765	int klen, namlen;
3766	wchar_t *pl, *pr, *kname;
3767	struct component_name lkey;
3768	struct component_name rkey;
3769
3770	lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3771					GFP_KERNEL);
3772	if (lkey.name == NULL)
3773		return -ENOMEM;
3774
3775	rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3776					GFP_KERNEL);
3777	if (rkey.name == NULL) {
3778		kfree(lkey.name);
3779		return -ENOMEM;
3780	}
3781
3782	/* get left and right key */
3783	dtGetKey(lp, li, &lkey, flag);
3784	lkey.name[lkey.namlen] = 0;
3785
3786	if ((flag & JFS_OS2) == JFS_OS2)
3787		ciToUpper(&lkey);
3788
3789	dtGetKey(rp, ri, &rkey, flag);
3790	rkey.name[rkey.namlen] = 0;
3791
3792
3793	if ((flag & JFS_OS2) == JFS_OS2)
3794		ciToUpper(&rkey);
3795
3796	/* compute prefix */
3797	klen = 0;
3798	kname = key->name;
3799	namlen = min(lkey.namlen, rkey.namlen);
3800	for (pl = lkey.name, pr = rkey.name;
3801	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3802		*kname = *pr;
3803		if (*pl != *pr) {
3804			key->namlen = klen + 1;
3805			goto free_names;
3806		}
3807	}
3808
3809	/* l->namlen <= r->namlen since l <= r */
3810	if (lkey.namlen < rkey.namlen) {
3811		*kname = *pr;
3812		key->namlen = klen + 1;
3813	} else			/* l->namelen == r->namelen */
3814		key->namlen = klen;
3815
3816free_names:
3817	kfree(lkey.name);
3818	kfree(rkey.name);
3819	return 0;
3820}
3821
3822
3823
3824/*
3825 *	dtGetKey()
3826 *
3827 * function: get key of the entry
3828 */
3829static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3830		     struct component_name * key, int flag)
3831{
3832	int si;
3833	s8 *stbl;
3834	struct ldtentry *lh;
3835	struct idtentry *ih;
3836	struct dtslot *t;
3837	int namlen, len;
3838	wchar_t *kname;
3839	__le16 *name;
3840
3841	/* get entry */
3842	stbl = DT_GETSTBL(p);
3843	si = stbl[i];
3844	if (p->header.flag & BT_LEAF) {
3845		lh = (struct ldtentry *) & p->slot[si];
3846		si = lh->next;
3847		namlen = lh->namlen;
3848		name = lh->name;
3849		if (flag & JFS_DIR_INDEX)
3850			len = min(namlen, DTLHDRDATALEN);
3851		else
3852			len = min(namlen, DTLHDRDATALEN_LEGACY);
3853	} else {
3854		ih = (struct idtentry *) & p->slot[si];
3855		si = ih->next;
3856		namlen = ih->namlen;
3857		name = ih->name;
3858		len = min(namlen, DTIHDRDATALEN);
3859	}
3860
3861	key->namlen = namlen;
3862	kname = key->name;
3863
3864	/*
3865	 * move head/only segment
3866	 */
3867	UniStrncpy_from_le(kname, name, len);
3868
3869	/*
3870	 * move additional segment(s)
3871	 */
3872	while (si >= 0) {
3873		/* get next segment */
3874		t = &p->slot[si];
3875		kname += len;
3876		namlen -= len;
3877		len = min(namlen, DTSLOTDATALEN);
3878		UniStrncpy_from_le(kname, t->name, len);
3879
3880		si = t->next;
3881	}
3882}
3883
3884
3885/*
3886 *	dtInsertEntry()
3887 *
3888 * function: allocate free slot(s) and
3889 *	     write a leaf/internal entry
3890 *
3891 * return: entry slot index
3892 */
3893static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3894			  ddata_t * data, struct dt_lock ** dtlock)
3895{
3896	struct dtslot *h, *t;
3897	struct ldtentry *lh = NULL;
3898	struct idtentry *ih = NULL;
3899	int hsi, fsi, klen, len, nextindex;
3900	wchar_t *kname;
3901	__le16 *name;
3902	s8 *stbl;
3903	pxd_t *xd;
3904	struct dt_lock *dtlck = *dtlock;
3905	struct lv *lv;
3906	int xsi, n;
3907	s64 bn = 0;
3908	struct metapage *mp = NULL;
3909
3910	klen = key->namlen;
3911	kname = key->name;
3912
3913	/* allocate a free slot */
3914	hsi = fsi = p->header.freelist;
3915	h = &p->slot[fsi];
3916	p->header.freelist = h->next;
3917	--p->header.freecnt;
3918
3919	/* open new linelock */
3920	if (dtlck->index >= dtlck->maxcnt)
3921		dtlck = (struct dt_lock *) txLinelock(dtlck);
3922
3923	lv = & dtlck->lv[dtlck->index];
3924	lv->offset = hsi;
3925
3926	/* write head/only segment */
3927	if (p->header.flag & BT_LEAF) {
3928		lh = (struct ldtentry *) h;
3929		lh->next = h->next;
3930		lh->inumber = cpu_to_le32(data->leaf.ino);
3931		lh->namlen = klen;
3932		name = lh->name;
3933		if (data->leaf.ip) {
3934			len = min(klen, DTLHDRDATALEN);
3935			if (!(p->header.flag & BT_ROOT))
3936				bn = addressPXD(&p->header.self);
3937			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3938							  data->leaf.ip,
3939							  bn, index));
3940		} else
3941			len = min(klen, DTLHDRDATALEN_LEGACY);
3942	} else {
3943		ih = (struct idtentry *) h;
3944		ih->next = h->next;
3945		xd = (pxd_t *) ih;
3946		*xd = data->xd;
3947		ih->namlen = klen;
3948		name = ih->name;
3949		len = min(klen, DTIHDRDATALEN);
3950	}
3951
3952	UniStrncpy_to_le(name, kname, len);
3953
3954	n = 1;
3955	xsi = hsi;
3956
3957	/* write additional segment(s) */
3958	t = h;
3959	klen -= len;
3960	while (klen) {
3961		/* get free slot */
3962		fsi = p->header.freelist;
3963		t = &p->slot[fsi];
3964		p->header.freelist = t->next;
3965		--p->header.freecnt;
3966
3967		/* is next slot contiguous ? */
3968		if (fsi != xsi + 1) {
3969			/* close current linelock */
3970			lv->length = n;
3971			dtlck->index++;
3972
3973			/* open new linelock */
3974			if (dtlck->index < dtlck->maxcnt)
3975				lv++;
3976			else {
3977				dtlck = (struct dt_lock *) txLinelock(dtlck);
3978				lv = & dtlck->lv[0];
3979			}
3980
3981			lv->offset = fsi;
3982			n = 0;
3983		}
3984
3985		kname += len;
3986		len = min(klen, DTSLOTDATALEN);
3987		UniStrncpy_to_le(t->name, kname, len);
3988
3989		n++;
3990		xsi = fsi;
3991		klen -= len;
3992	}
3993
3994	/* close current linelock */
3995	lv->length = n;
3996	dtlck->index++;
3997
3998	*dtlock = dtlck;
3999
4000	/* terminate last/only segment */
4001	if (h == t) {
4002		/* single segment entry */
4003		if (p->header.flag & BT_LEAF)
4004			lh->next = -1;
4005		else
4006			ih->next = -1;
4007	} else
4008		/* multi-segment entry */
4009		t->next = -1;
4010
4011	/* if insert into middle, shift right succeeding entries in stbl */
4012	stbl = DT_GETSTBL(p);
4013	nextindex = p->header.nextindex;
4014	if (index < nextindex) {
4015		memmove(stbl + index + 1, stbl + index, nextindex - index);
4016
4017		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
4018			s64 lblock;
4019
4020			/*
4021			 * Need to update slot number for entries that moved
4022			 * in the stbl
4023			 */
4024			mp = NULL;
4025			for (n = index + 1; n <= nextindex; n++) {
4026				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
4027				modify_index(data->leaf.tid, data->leaf.ip,
4028					     le32_to_cpu(lh->index), bn, n,
4029					     &mp, &lblock);
4030			}
4031			if (mp)
4032				release_metapage(mp);
4033		}
4034	}
4035
4036	stbl[index] = hsi;
4037
4038	/* advance next available entry index of stbl */
4039	++p->header.nextindex;
4040}
4041
4042
4043/*
4044 *	dtMoveEntry()
4045 *
4046 * function: move entries from split/left page to new/right page
4047 *
4048 *	nextindex of dst page and freelist/freecnt of both pages
4049 *	are updated.
4050 */
4051static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
4052			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
4053			int do_index)
4054{
4055	int ssi, next;		/* src slot index */
4056	int di;			/* dst entry index */
4057	int dsi;		/* dst slot index */
4058	s8 *sstbl, *dstbl;	/* sorted entry table */
4059	int snamlen, len;
4060	struct ldtentry *slh, *dlh = NULL;
4061	struct idtentry *sih, *dih = NULL;
4062	struct dtslot *h, *s, *d;
4063	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
4064	struct lv *slv, *dlv;
4065	int xssi, ns, nd;
4066	int sfsi;
4067
4068	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
4069	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
4070
4071	dsi = dp->header.freelist;	/* first (whole page) free slot */
4072	sfsi = sp->header.freelist;
4073
4074	/* linelock destination entry slot */
4075	dlv = & ddtlck->lv[ddtlck->index];
4076	dlv->offset = dsi;
4077
4078	/* linelock source entry slot */
4079	slv = & sdtlck->lv[sdtlck->index];
4080	slv->offset = sstbl[si];
4081	xssi = slv->offset - 1;
4082
4083	/*
4084	 * move entries
4085	 */
4086	ns = nd = 0;
4087	for (di = 0; si < sp->header.nextindex; si++, di++) {
4088		ssi = sstbl[si];
4089		dstbl[di] = dsi;
4090
4091		/* is next slot contiguous ? */
4092		if (ssi != xssi + 1) {
4093			/* close current linelock */
4094			slv->length = ns;
4095			sdtlck->index++;
4096
4097			/* open new linelock */
4098			if (sdtlck->index < sdtlck->maxcnt)
4099				slv++;
4100			else {
4101				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
4102				slv = & sdtlck->lv[0];
4103			}
4104
4105			slv->offset = ssi;
4106			ns = 0;
4107		}
4108
4109		/*
4110		 * move head/only segment of an entry
4111		 */
4112		/* get dst slot */
4113		h = d = &dp->slot[dsi];
4114
4115		/* get src slot and move */
4116		s = &sp->slot[ssi];
4117		if (sp->header.flag & BT_LEAF) {
4118			/* get source entry */
4119			slh = (struct ldtentry *) s;
4120			dlh = (struct ldtentry *) h;
4121			snamlen = slh->namlen;
4122
4123			if (do_index) {
4124				len = min(snamlen, DTLHDRDATALEN);
4125				dlh->index = slh->index; /* little-endian */
4126			} else
4127				len = min(snamlen, DTLHDRDATALEN_LEGACY);
4128
4129			memcpy(dlh, slh, 6 + len * 2);
4130
4131			next = slh->next;
4132
4133			/* update dst head/only segment next field */
4134			dsi++;
4135			dlh->next = dsi;
4136		} else {
4137			sih = (struct idtentry *) s;
4138			snamlen = sih->namlen;
4139
4140			len = min(snamlen, DTIHDRDATALEN);
4141			dih = (struct idtentry *) h;
4142			memcpy(dih, sih, 10 + len * 2);
4143			next = sih->next;
4144
4145			dsi++;
4146			dih->next = dsi;
4147		}
4148
4149		/* free src head/only segment */
4150		s->next = sfsi;
4151		s->cnt = 1;
4152		sfsi = ssi;
4153
4154		ns++;
4155		nd++;
4156		xssi = ssi;
4157
4158		/*
4159		 * move additional segment(s) of the entry
4160		 */
4161		snamlen -= len;
4162		while ((ssi = next) >= 0) {
4163			/* is next slot contiguous ? */
4164			if (ssi != xssi + 1) {
4165				/* close current linelock */
4166				slv->length = ns;
4167				sdtlck->index++;
4168
4169				/* open new linelock */
4170				if (sdtlck->index < sdtlck->maxcnt)
4171					slv++;
4172				else {
4173					sdtlck =
4174					    (struct dt_lock *)
4175					    txLinelock(sdtlck);
4176					slv = & sdtlck->lv[0];
4177				}
4178
4179				slv->offset = ssi;
4180				ns = 0;
4181			}
4182
4183			/* get next source segment */
4184			s = &sp->slot[ssi];
4185
4186			/* get next destination free slot */
4187			d++;
4188
4189			len = min(snamlen, DTSLOTDATALEN);
4190			UniStrncpy_le(d->name, s->name, len);
4191
4192			ns++;
4193			nd++;
4194			xssi = ssi;
4195
4196			dsi++;
4197			d->next = dsi;
4198
4199			/* free source segment */
4200			next = s->next;
4201			s->next = sfsi;
4202			s->cnt = 1;
4203			sfsi = ssi;
4204
4205			snamlen -= len;
4206		}		/* end while */
4207
4208		/* terminate dst last/only segment */
4209		if (h == d) {
4210			/* single segment entry */
4211			if (dp->header.flag & BT_LEAF)
4212				dlh->next = -1;
4213			else
4214				dih->next = -1;
4215		} else
4216			/* multi-segment entry */
4217			d->next = -1;
4218	}			/* end for */
4219
4220	/* close current linelock */
4221	slv->length = ns;
4222	sdtlck->index++;
4223	*sdtlock = sdtlck;
4224
4225	dlv->length = nd;
4226	ddtlck->index++;
4227	*ddtlock = ddtlck;
4228
4229	/* update source header */
4230	sp->header.freelist = sfsi;
4231	sp->header.freecnt += nd;
4232
4233	/* update destination header */
4234	dp->header.nextindex = di;
4235
4236	dp->header.freelist = dsi;
4237	dp->header.freecnt -= nd;
4238}
4239
4240
4241/*
4242 *	dtDeleteEntry()
4243 *
4244 * function: free a (leaf/internal) entry
4245 *
4246 * log freelist header, stbl, and each segment slot of entry
4247 * (even though last/only segment next field is modified,
4248 * physical image logging requires all segment slots of
4249 * the entry logged to avoid applying previous updates
4250 * to the same slots)
4251 */
4252static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
4253{
4254	int fsi;		/* free entry slot index */
4255	s8 *stbl;
4256	struct dtslot *t;
4257	int si, freecnt;
4258	struct dt_lock *dtlck = *dtlock;
4259	struct lv *lv;
4260	int xsi, n;
4261
4262	/* get free entry slot index */
4263	stbl = DT_GETSTBL(p);
4264	fsi = stbl[fi];
4265
4266	/* open new linelock */
4267	if (dtlck->index >= dtlck->maxcnt)
4268		dtlck = (struct dt_lock *) txLinelock(dtlck);
4269	lv = & dtlck->lv[dtlck->index];
4270
4271	lv->offset = fsi;
4272
4273	/* get the head/only segment */
4274	t = &p->slot[fsi];
4275	if (p->header.flag & BT_LEAF)
4276		si = ((struct ldtentry *) t)->next;
4277	else
4278		si = ((struct idtentry *) t)->next;
4279	t->next = si;
4280	t->cnt = 1;
4281
4282	n = freecnt = 1;
4283	xsi = fsi;
4284
4285	/* find the last/only segment */
4286	while (si >= 0) {
4287		/* is next slot contiguous ? */
4288		if (si != xsi + 1) {
4289			/* close current linelock */
4290			lv->length = n;
4291			dtlck->index++;
4292
4293			/* open new linelock */
4294			if (dtlck->index < dtlck->maxcnt)
4295				lv++;
4296			else {
4297				dtlck = (struct dt_lock *) txLinelock(dtlck);
4298				lv = & dtlck->lv[0];
4299			}
4300
4301			lv->offset = si;
4302			n = 0;
4303		}
4304
4305		n++;
4306		xsi = si;
4307		freecnt++;
4308
4309		t = &p->slot[si];
4310		t->cnt = 1;
4311		si = t->next;
4312	}
4313
4314	/* close current linelock */
4315	lv->length = n;
4316	dtlck->index++;
4317
4318	*dtlock = dtlck;
4319
4320	/* update freelist */
4321	t->next = p->header.freelist;
4322	p->header.freelist = fsi;
4323	p->header.freecnt += freecnt;
4324
4325	/* if delete from middle,
4326	 * shift left the succedding entries in the stbl
4327	 */
4328	si = p->header.nextindex;
4329	if (fi < si - 1)
4330		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4331
4332	p->header.nextindex--;
4333}
4334
4335
4336/*
4337 *	dtTruncateEntry()
4338 *
4339 * function: truncate a (leaf/internal) entry
4340 *
4341 * log freelist header, stbl, and each segment slot of entry
4342 * (even though last/only segment next field is modified,
4343 * physical image logging requires all segment slots of
4344 * the entry logged to avoid applying previous updates
4345 * to the same slots)
4346 */
4347static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4348{
4349	int tsi;		/* truncate entry slot index */
4350	s8 *stbl;
4351	struct dtslot *t;
4352	int si, freecnt;
4353	struct dt_lock *dtlck = *dtlock;
4354	struct lv *lv;
4355	int fsi, xsi, n;
4356
4357	/* get free entry slot index */
4358	stbl = DT_GETSTBL(p);
4359	tsi = stbl[ti];
4360
4361	/* open new linelock */
4362	if (dtlck->index >= dtlck->maxcnt)
4363		dtlck = (struct dt_lock *) txLinelock(dtlck);
4364	lv = & dtlck->lv[dtlck->index];
4365
4366	lv->offset = tsi;
4367
4368	/* get the head/only segment */
4369	t = &p->slot[tsi];
4370	ASSERT(p->header.flag & BT_INTERNAL);
4371	((struct idtentry *) t)->namlen = 0;
4372	si = ((struct idtentry *) t)->next;
4373	((struct idtentry *) t)->next = -1;
4374
4375	n = 1;
4376	freecnt = 0;
4377	fsi = si;
4378	xsi = tsi;
4379
4380	/* find the last/only segment */
4381	while (si >= 0) {
4382		/* is next slot contiguous ? */
4383		if (si != xsi + 1) {
4384			/* close current linelock */
4385			lv->length = n;
4386			dtlck->index++;
4387
4388			/* open new linelock */
4389			if (dtlck->index < dtlck->maxcnt)
4390				lv++;
4391			else {
4392				dtlck = (struct dt_lock *) txLinelock(dtlck);
4393				lv = & dtlck->lv[0];
4394			}
4395
4396			lv->offset = si;
4397			n = 0;
4398		}
4399
4400		n++;
4401		xsi = si;
4402		freecnt++;
4403
4404		t = &p->slot[si];
4405		t->cnt = 1;
4406		si = t->next;
4407	}
4408
4409	/* close current linelock */
4410	lv->length = n;
4411	dtlck->index++;
4412
4413	*dtlock = dtlck;
4414
4415	/* update freelist */
4416	if (freecnt == 0)
4417		return;
4418	t->next = p->header.freelist;
4419	p->header.freelist = fsi;
4420	p->header.freecnt += freecnt;
4421}
4422
4423
4424/*
4425 *	dtLinelockFreelist()
4426 */
4427static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4428			       int m,	/* max slot index */
4429			       struct dt_lock ** dtlock)
4430{
4431	int fsi;		/* free entry slot index */
4432	struct dtslot *t;
4433	int si;
4434	struct dt_lock *dtlck = *dtlock;
4435	struct lv *lv;
4436	int xsi, n;
4437
4438	/* get free entry slot index */
4439	fsi = p->header.freelist;
4440
4441	/* open new linelock */
4442	if (dtlck->index >= dtlck->maxcnt)
4443		dtlck = (struct dt_lock *) txLinelock(dtlck);
4444	lv = & dtlck->lv[dtlck->index];
4445
4446	lv->offset = fsi;
4447
4448	n = 1;
4449	xsi = fsi;
4450
4451	t = &p->slot[fsi];
4452	si = t->next;
4453
4454	/* find the last/only segment */
4455	while (si < m && si >= 0) {
4456		/* is next slot contiguous ? */
4457		if (si != xsi + 1) {
4458			/* close current linelock */
4459			lv->length = n;
4460			dtlck->index++;
4461
4462			/* open new linelock */
4463			if (dtlck->index < dtlck->maxcnt)
4464				lv++;
4465			else {
4466				dtlck = (struct dt_lock *) txLinelock(dtlck);
4467				lv = & dtlck->lv[0];
4468			}
4469
4470			lv->offset = si;
4471			n = 0;
4472		}
4473
4474		n++;
4475		xsi = si;
4476
4477		t = &p->slot[si];
4478		si = t->next;
4479	}
4480
4481	/* close current linelock */
4482	lv->length = n;
4483	dtlck->index++;
4484
4485	*dtlock = dtlck;
4486}
4487
4488
4489/*
4490 * NAME: dtModify
4491 *
4492 * FUNCTION: Modify the inode number part of a directory entry
4493 *
4494 * PARAMETERS:
4495 *	tid	- Transaction id
4496 *	ip	- Inode of parent directory
4497 *	key	- Name of entry to be modified
4498 *	orig_ino	- Original inode number expected in entry
4499 *	new_ino	- New inode number to put into entry
4500 *	flag	- JFS_RENAME
4501 *
4502 * RETURNS:
4503 *	-ESTALE	- If entry found does not match orig_ino passed in
4504 *	-ENOENT	- If no entry can be found to match key
4505 *	0	- If successfully modified entry
4506 */
4507int dtModify(tid_t tid, struct inode *ip,
4508	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4509{
4510	int rc;
4511	s64 bn;
4512	struct metapage *mp;
4513	dtpage_t *p;
4514	int index;
4515	struct btstack btstack;
4516	struct tlock *tlck;
4517	struct dt_lock *dtlck;
4518	struct lv *lv;
4519	s8 *stbl;
4520	int entry_si;		/* entry slot index */
4521	struct ldtentry *entry;
4522
4523	/*
4524	 *	search for the entry to modify:
4525	 *
4526	 * dtSearch() returns (leaf page pinned, index at which to modify).
4527	 */
4528	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4529		return rc;
4530
4531	/* retrieve search result */
4532	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4533
4534	BT_MARK_DIRTY(mp, ip);
4535	/*
4536	 * acquire a transaction lock on the leaf page of named entry
4537	 */
4538	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4539	dtlck = (struct dt_lock *) & tlck->lock;
4540
4541	/* get slot index of the entry */
4542	stbl = DT_GETSTBL(p);
4543	entry_si = stbl[index];
4544
4545	/* linelock entry */
4546	ASSERT(dtlck->index == 0);
4547	lv = & dtlck->lv[0];
4548	lv->offset = entry_si;
4549	lv->length = 1;
4550	dtlck->index++;
4551
4552	/* get the head/only segment */
4553	entry = (struct ldtentry *) & p->slot[entry_si];
4554
4555	/* substitute the inode number of the entry */
4556	entry->inumber = cpu_to_le32(new_ino);
4557
4558	/* unpin the leaf page */
4559	DT_PUTPAGE(mp);
4560
4561	return 0;
4562}
v3.1
 
   1/*
   2 *   Copyright (C) International Business Machines Corp., 2000-2004
   3 *
   4 *   This program is free software;  you can redistribute it and/or modify
   5 *   it under the terms of the GNU General Public License as published by
   6 *   the Free Software Foundation; either version 2 of the License, or
   7 *   (at your option) any later version.
   8 *
   9 *   This program is distributed in the hope that it will be useful,
  10 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  12 *   the GNU General Public License for more details.
  13 *
  14 *   You should have received a copy of the GNU General Public License
  15 *   along with this program;  if not, write to the Free Software
  16 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  17 */
  18
  19/*
  20 *	jfs_dtree.c: directory B+-tree manager
  21 *
  22 * B+-tree with variable length key directory:
  23 *
  24 * each directory page is structured as an array of 32-byte
  25 * directory entry slots initialized as a freelist
  26 * to avoid search/compaction of free space at insertion.
  27 * when an entry is inserted, a number of slots are allocated
  28 * from the freelist as required to store variable length data
  29 * of the entry; when the entry is deleted, slots of the entry
  30 * are returned to freelist.
  31 *
  32 * leaf entry stores full name as key and file serial number
  33 * (aka inode number) as data.
  34 * internal/router entry stores sufffix compressed name
  35 * as key and simple extent descriptor as data.
  36 *
  37 * each directory page maintains a sorted entry index table
  38 * which stores the start slot index of sorted entries
  39 * to allow binary search on the table.
  40 *
  41 * directory starts as a root/leaf page in on-disk inode
  42 * inline data area.
  43 * when it becomes full, it starts a leaf of a external extent
  44 * of length of 1 block. each time the first leaf becomes full,
  45 * it is extended rather than split (its size is doubled),
  46 * until its length becoms 4 KBytes, from then the extent is split
  47 * with new 4 Kbyte extent when it becomes full
  48 * to reduce external fragmentation of small directories.
  49 *
  50 * blah, blah, blah, for linear scan of directory in pieces by
  51 * readdir().
  52 *
  53 *
  54 *	case-insensitive directory file system
  55 *
  56 * names are stored in case-sensitive way in leaf entry.
  57 * but stored, searched and compared in case-insensitive (uppercase) order
  58 * (i.e., both search key and entry key are folded for search/compare):
  59 * (note that case-sensitive order is BROKEN in storage, e.g.,
  60 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
  61 *
  62 *  entries which folds to the same key makes up a equivalent class
  63 *  whose members are stored as contiguous cluster (may cross page boundary)
  64 *  but whose order is arbitrary and acts as duplicate, e.g.,
  65 *  abc, Abc, aBc, abC)
  66 *
  67 * once match is found at leaf, requires scan forward/backward
  68 * either for, in case-insensitive search, duplicate
  69 * or for, in case-sensitive search, for exact match
  70 *
  71 * router entry must be created/stored in case-insensitive way
  72 * in internal entry:
  73 * (right most key of left page and left most key of right page
  74 * are folded, and its suffix compression is propagated as router
  75 * key in parent)
  76 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
  77 * should be made the router key for the split)
  78 *
  79 * case-insensitive search:
  80 *
  81 *	fold search key;
  82 *
  83 *	case-insensitive search of B-tree:
  84 *	for internal entry, router key is already folded;
  85 *	for leaf entry, fold the entry key before comparison.
  86 *
  87 *	if (leaf entry case-insensitive match found)
  88 *		if (next entry satisfies case-insensitive match)
  89 *			return EDUPLICATE;
  90 *		if (prev entry satisfies case-insensitive match)
  91 *			return EDUPLICATE;
  92 *		return match;
  93 *	else
  94 *		return no match;
  95 *
  96 *	serialization:
  97 * target directory inode lock is being held on entry/exit
  98 * of all main directory service routines.
  99 *
 100 *	log based recovery:
 101 */
 102
 103#include <linux/fs.h>
 104#include <linux/quotaops.h>
 105#include <linux/slab.h>
 106#include "jfs_incore.h"
 107#include "jfs_superblock.h"
 108#include "jfs_filsys.h"
 109#include "jfs_metapage.h"
 110#include "jfs_dmap.h"
 111#include "jfs_unicode.h"
 112#include "jfs_debug.h"
 113
 114/* dtree split parameter */
 115struct dtsplit {
 116	struct metapage *mp;
 117	s16 index;
 118	s16 nslot;
 119	struct component_name *key;
 120	ddata_t *data;
 121	struct pxdlist *pxdlist;
 122};
 123
 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
 125
 126/* get page buffer for specified block address */
 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
 128{\
 129	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
 130	if (!(RC))\
 131	{\
 132		if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
 133		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
 134		{\
 135			BT_PUTPAGE(MP);\
 136			jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
 137			MP = NULL;\
 138			RC = -EIO;\
 139		}\
 140	}\
 141}
 142
 143/* for consistency */
 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
 145
 146#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
 147	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
 148
 149/*
 150 * forward references
 151 */
 152static int dtSplitUp(tid_t tid, struct inode *ip,
 153		     struct dtsplit * split, struct btstack * btstack);
 154
 155static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 156		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
 157
 158static int dtExtendPage(tid_t tid, struct inode *ip,
 159			struct dtsplit * split, struct btstack * btstack);
 160
 161static int dtSplitRoot(tid_t tid, struct inode *ip,
 162		       struct dtsplit * split, struct metapage ** rmpp);
 163
 164static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
 165		      dtpage_t * fp, struct btstack * btstack);
 166
 167static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
 168
 169static int dtReadFirst(struct inode *ip, struct btstack * btstack);
 170
 171static int dtReadNext(struct inode *ip,
 172		      loff_t * offset, struct btstack * btstack);
 173
 174static int dtCompare(struct component_name * key, dtpage_t * p, int si);
 175
 176static int ciCompare(struct component_name * key, dtpage_t * p, int si,
 177		     int flag);
 178
 179static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
 180		     int flag);
 181
 182static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 183			      int ri, struct component_name * key, int flag);
 184
 185static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
 186			  ddata_t * data, struct dt_lock **);
 187
 188static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
 189			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
 190			int do_index);
 191
 192static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
 193
 194static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
 195
 196static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
 197
 198#define ciToUpper(c)	UniStrupr((c)->name)
 199
 200/*
 201 *	read_index_page()
 202 *
 203 *	Reads a page of a directory's index table.
 204 *	Having metadata mapped into the directory inode's address space
 205 *	presents a multitude of problems.  We avoid this by mapping to
 206 *	the absolute address space outside of the *_metapage routines
 207 */
 208static struct metapage *read_index_page(struct inode *inode, s64 blkno)
 209{
 210	int rc;
 211	s64 xaddr;
 212	int xflag;
 213	s32 xlen;
 214
 215	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 216	if (rc || (xaddr == 0))
 217		return NULL;
 218
 219	return read_metapage(inode, xaddr, PSIZE, 1);
 220}
 221
 222/*
 223 *	get_index_page()
 224 *
 225 *	Same as get_index_page(), but get's a new page without reading
 226 */
 227static struct metapage *get_index_page(struct inode *inode, s64 blkno)
 228{
 229	int rc;
 230	s64 xaddr;
 231	int xflag;
 232	s32 xlen;
 233
 234	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 235	if (rc || (xaddr == 0))
 236		return NULL;
 237
 238	return get_metapage(inode, xaddr, PSIZE, 1);
 239}
 240
 241/*
 242 *	find_index()
 243 *
 244 *	Returns dtree page containing directory table entry for specified
 245 *	index and pointer to its entry.
 246 *
 247 *	mp must be released by caller.
 248 */
 249static struct dir_table_slot *find_index(struct inode *ip, u32 index,
 250					 struct metapage ** mp, s64 *lblock)
 251{
 252	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 253	s64 blkno;
 254	s64 offset;
 255	int page_offset;
 256	struct dir_table_slot *slot;
 257	static int maxWarnings = 10;
 258
 259	if (index < 2) {
 260		if (maxWarnings) {
 261			jfs_warn("find_entry called with index = %d", index);
 262			maxWarnings--;
 263		}
 264		return NULL;
 265	}
 266
 267	if (index >= jfs_ip->next_index) {
 268		jfs_warn("find_entry called with index >= next_index");
 269		return NULL;
 270	}
 271
 272	if (jfs_dirtable_inline(ip)) {
 273		/*
 274		 * Inline directory table
 275		 */
 276		*mp = NULL;
 277		slot = &jfs_ip->i_dirtable[index - 2];
 278	} else {
 279		offset = (index - 2) * sizeof(struct dir_table_slot);
 280		page_offset = offset & (PSIZE - 1);
 281		blkno = ((offset + 1) >> L2PSIZE) <<
 282		    JFS_SBI(ip->i_sb)->l2nbperpage;
 283
 284		if (*mp && (*lblock != blkno)) {
 285			release_metapage(*mp);
 286			*mp = NULL;
 287		}
 288		if (!(*mp)) {
 289			*lblock = blkno;
 290			*mp = read_index_page(ip, blkno);
 291		}
 292		if (!(*mp)) {
 293			jfs_err("free_index: error reading directory table");
 294			return NULL;
 295		}
 296
 297		slot =
 298		    (struct dir_table_slot *) ((char *) (*mp)->data +
 299					       page_offset);
 300	}
 301	return slot;
 302}
 303
 304static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
 305			      u32 index)
 306{
 307	struct tlock *tlck;
 308	struct linelock *llck;
 309	struct lv *lv;
 310
 311	tlck = txLock(tid, ip, mp, tlckDATA);
 312	llck = (struct linelock *) tlck->lock;
 313
 314	if (llck->index >= llck->maxcnt)
 315		llck = txLinelock(llck);
 316	lv = &llck->lv[llck->index];
 317
 318	/*
 319	 *	Linelock slot size is twice the size of directory table
 320	 *	slot size.  512 entries per page.
 321	 */
 322	lv->offset = ((index - 2) & 511) >> 1;
 323	lv->length = 1;
 324	llck->index++;
 325}
 326
 327/*
 328 *	add_index()
 329 *
 330 *	Adds an entry to the directory index table.  This is used to provide
 331 *	each directory entry with a persistent index in which to resume
 332 *	directory traversals
 333 */
 334static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 335{
 336	struct super_block *sb = ip->i_sb;
 337	struct jfs_sb_info *sbi = JFS_SBI(sb);
 338	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 339	u64 blkno;
 340	struct dir_table_slot *dirtab_slot;
 341	u32 index;
 342	struct linelock *llck;
 343	struct lv *lv;
 344	struct metapage *mp;
 345	s64 offset;
 346	uint page_offset;
 347	struct tlock *tlck;
 348	s64 xaddr;
 349
 350	ASSERT(DO_INDEX(ip));
 351
 352	if (jfs_ip->next_index < 2) {
 353		jfs_warn("add_index: next_index = %d.  Resetting!",
 354			   jfs_ip->next_index);
 355		jfs_ip->next_index = 2;
 356	}
 357
 358	index = jfs_ip->next_index++;
 359
 360	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
 361		/*
 362		 * i_size reflects size of index table, or 8 bytes per entry.
 363		 */
 364		ip->i_size = (loff_t) (index - 1) << 3;
 365
 366		/*
 367		 * dir table fits inline within inode
 368		 */
 369		dirtab_slot = &jfs_ip->i_dirtable[index-2];
 370		dirtab_slot->flag = DIR_INDEX_VALID;
 371		dirtab_slot->slot = slot;
 372		DTSaddress(dirtab_slot, bn);
 373
 374		set_cflag(COMMIT_Dirtable, ip);
 375
 376		return index;
 377	}
 378	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
 379		struct dir_table_slot temp_table[12];
 380
 381		/*
 382		 * It's time to move the inline table to an external
 383		 * page and begin to build the xtree
 384		 */
 385		if (dquot_alloc_block(ip, sbi->nbperpage))
 386			goto clean_up;
 387		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
 388			dquot_free_block(ip, sbi->nbperpage);
 389			goto clean_up;
 390		}
 391
 392		/*
 393		 * Save the table, we're going to overwrite it with the
 394		 * xtree root
 395		 */
 396		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
 397
 398		/*
 399		 * Initialize empty x-tree
 400		 */
 401		xtInitRoot(tid, ip);
 402
 403		/*
 404		 * Add the first block to the xtree
 405		 */
 406		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
 407			/* This really shouldn't fail */
 408			jfs_warn("add_index: xtInsert failed!");
 409			memcpy(&jfs_ip->i_dirtable, temp_table,
 410			       sizeof (temp_table));
 411			dbFree(ip, xaddr, sbi->nbperpage);
 412			dquot_free_block(ip, sbi->nbperpage);
 413			goto clean_up;
 414		}
 415		ip->i_size = PSIZE;
 416
 417		mp = get_index_page(ip, 0);
 418		if (!mp) {
 419			jfs_err("add_index: get_metapage failed!");
 420			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
 421			memcpy(&jfs_ip->i_dirtable, temp_table,
 422			       sizeof (temp_table));
 423			goto clean_up;
 424		}
 425		tlck = txLock(tid, ip, mp, tlckDATA);
 426		llck = (struct linelock *) & tlck->lock;
 427		ASSERT(llck->index == 0);
 428		lv = &llck->lv[0];
 429
 430		lv->offset = 0;
 431		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
 432		llck->index++;
 433
 434		memcpy(mp->data, temp_table, sizeof(temp_table));
 435
 436		mark_metapage_dirty(mp);
 437		release_metapage(mp);
 438
 439		/*
 440		 * Logging is now directed by xtree tlocks
 441		 */
 442		clear_cflag(COMMIT_Dirtable, ip);
 443	}
 444
 445	offset = (index - 2) * sizeof(struct dir_table_slot);
 446	page_offset = offset & (PSIZE - 1);
 447	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
 448	if (page_offset == 0) {
 449		/*
 450		 * This will be the beginning of a new page
 451		 */
 452		xaddr = 0;
 453		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
 454			jfs_warn("add_index: xtInsert failed!");
 455			goto clean_up;
 456		}
 457		ip->i_size += PSIZE;
 458
 459		if ((mp = get_index_page(ip, blkno)))
 460			memset(mp->data, 0, PSIZE);	/* Just looks better */
 461		else
 462			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
 463	} else
 464		mp = read_index_page(ip, blkno);
 465
 466	if (!mp) {
 467		jfs_err("add_index: get/read_metapage failed!");
 468		goto clean_up;
 469	}
 470
 471	lock_index(tid, ip, mp, index);
 472
 473	dirtab_slot =
 474	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
 475	dirtab_slot->flag = DIR_INDEX_VALID;
 476	dirtab_slot->slot = slot;
 477	DTSaddress(dirtab_slot, bn);
 478
 479	mark_metapage_dirty(mp);
 480	release_metapage(mp);
 481
 482	return index;
 483
 484      clean_up:
 485
 486	jfs_ip->next_index--;
 487
 488	return 0;
 489}
 490
 491/*
 492 *	free_index()
 493 *
 494 *	Marks an entry to the directory index table as free.
 495 */
 496static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
 497{
 498	struct dir_table_slot *dirtab_slot;
 499	s64 lblock;
 500	struct metapage *mp = NULL;
 501
 502	dirtab_slot = find_index(ip, index, &mp, &lblock);
 503
 504	if (!dirtab_slot)
 505		return;
 506
 507	dirtab_slot->flag = DIR_INDEX_FREE;
 508	dirtab_slot->slot = dirtab_slot->addr1 = 0;
 509	dirtab_slot->addr2 = cpu_to_le32(next);
 510
 511	if (mp) {
 512		lock_index(tid, ip, mp, index);
 513		mark_metapage_dirty(mp);
 514		release_metapage(mp);
 515	} else
 516		set_cflag(COMMIT_Dirtable, ip);
 517}
 518
 519/*
 520 *	modify_index()
 521 *
 522 *	Changes an entry in the directory index table
 523 */
 524static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
 525			 int slot, struct metapage ** mp, s64 *lblock)
 526{
 527	struct dir_table_slot *dirtab_slot;
 528
 529	dirtab_slot = find_index(ip, index, mp, lblock);
 530
 531	if (!dirtab_slot)
 532		return;
 533
 534	DTSaddress(dirtab_slot, bn);
 535	dirtab_slot->slot = slot;
 536
 537	if (*mp) {
 538		lock_index(tid, ip, *mp, index);
 539		mark_metapage_dirty(*mp);
 540	} else
 541		set_cflag(COMMIT_Dirtable, ip);
 542}
 543
 544/*
 545 *	read_index()
 546 *
 547 *	reads a directory table slot
 548 */
 549static int read_index(struct inode *ip, u32 index,
 550		     struct dir_table_slot * dirtab_slot)
 551{
 552	s64 lblock;
 553	struct metapage *mp = NULL;
 554	struct dir_table_slot *slot;
 555
 556	slot = find_index(ip, index, &mp, &lblock);
 557	if (!slot) {
 558		return -EIO;
 559	}
 560
 561	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
 562
 563	if (mp)
 564		release_metapage(mp);
 565
 566	return 0;
 567}
 568
 569/*
 570 *	dtSearch()
 571 *
 572 * function:
 573 *	Search for the entry with specified key
 574 *
 575 * parameter:
 576 *
 577 * return: 0 - search result on stack, leaf page pinned;
 578 *	   errno - I/O error
 579 */
 580int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 581	     struct btstack * btstack, int flag)
 582{
 583	int rc = 0;
 584	int cmp = 1;		/* init for empty page */
 585	s64 bn;
 586	struct metapage *mp;
 587	dtpage_t *p;
 588	s8 *stbl;
 589	int base, index, lim;
 590	struct btframe *btsp;
 591	pxd_t *pxd;
 592	int psize = 288;	/* initial in-line directory */
 593	ino_t inumber;
 594	struct component_name ciKey;
 595	struct super_block *sb = ip->i_sb;
 596
 597	ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
 
 598	if (!ciKey.name) {
 599		rc = -ENOMEM;
 600		goto dtSearch_Exit2;
 601	}
 602
 603
 604	/* uppercase search key for c-i directory */
 605	UniStrcpy(ciKey.name, key->name);
 606	ciKey.namlen = key->namlen;
 607
 608	/* only uppercase if case-insensitive support is on */
 609	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
 610		ciToUpper(&ciKey);
 611	}
 612	BT_CLR(btstack);	/* reset stack */
 613
 614	/* init level count for max pages to split */
 615	btstack->nsplit = 1;
 616
 617	/*
 618	 *	search down tree from root:
 619	 *
 620	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
 621	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
 622	 *
 623	 * if entry with search key K is not found
 624	 * internal page search find the entry with largest key Ki
 625	 * less than K which point to the child page to search;
 626	 * leaf page search find the entry with smallest key Kj
 627	 * greater than K so that the returned index is the position of
 628	 * the entry to be shifted right for insertion of new entry.
 629	 * for empty tree, search key is greater than any key of the tree.
 630	 *
 631	 * by convention, root bn = 0.
 632	 */
 633	for (bn = 0;;) {
 634		/* get/pin the page to search */
 635		DT_GETPAGE(ip, bn, mp, psize, p, rc);
 636		if (rc)
 637			goto dtSearch_Exit1;
 638
 639		/* get sorted entry table of the page */
 640		stbl = DT_GETSTBL(p);
 641
 642		/*
 643		 * binary search with search key K on the current page.
 644		 */
 645		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
 646			index = base + (lim >> 1);
 647
 648			if (p->header.flag & BT_LEAF) {
 649				/* uppercase leaf name to compare */
 650				cmp =
 651				    ciCompare(&ciKey, p, stbl[index],
 652					      JFS_SBI(sb)->mntflag);
 653			} else {
 654				/* router key is in uppercase */
 655
 656				cmp = dtCompare(&ciKey, p, stbl[index]);
 657
 658
 659			}
 660			if (cmp == 0) {
 661				/*
 662				 *	search hit
 663				 */
 664				/* search hit - leaf page:
 665				 * return the entry found
 666				 */
 667				if (p->header.flag & BT_LEAF) {
 668					inumber = le32_to_cpu(
 669			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
 670
 671					/*
 672					 * search for JFS_LOOKUP
 673					 */
 674					if (flag == JFS_LOOKUP) {
 675						*data = inumber;
 676						rc = 0;
 677						goto out;
 678					}
 679
 680					/*
 681					 * search for JFS_CREATE
 682					 */
 683					if (flag == JFS_CREATE) {
 684						*data = inumber;
 685						rc = -EEXIST;
 686						goto out;
 687					}
 688
 689					/*
 690					 * search for JFS_REMOVE or JFS_RENAME
 691					 */
 692					if ((flag == JFS_REMOVE ||
 693					     flag == JFS_RENAME) &&
 694					    *data != inumber) {
 695						rc = -ESTALE;
 696						goto out;
 697					}
 698
 699					/*
 700					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
 701					 */
 702					/* save search result */
 703					*data = inumber;
 704					btsp = btstack->top;
 705					btsp->bn = bn;
 706					btsp->index = index;
 707					btsp->mp = mp;
 708
 709					rc = 0;
 710					goto dtSearch_Exit1;
 711				}
 712
 713				/* search hit - internal page:
 714				 * descend/search its child page
 715				 */
 716				goto getChild;
 717			}
 718
 719			if (cmp > 0) {
 720				base = index + 1;
 721				--lim;
 722			}
 723		}
 724
 725		/*
 726		 *	search miss
 727		 *
 728		 * base is the smallest index with key (Kj) greater than
 729		 * search key (K) and may be zero or (maxindex + 1) index.
 730		 */
 731		/*
 732		 * search miss - leaf page
 733		 *
 734		 * return location of entry (base) where new entry with
 735		 * search key K is to be inserted.
 736		 */
 737		if (p->header.flag & BT_LEAF) {
 738			/*
 739			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
 740			 */
 741			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
 742			    flag == JFS_RENAME) {
 743				rc = -ENOENT;
 744				goto out;
 745			}
 746
 747			/*
 748			 * search for JFS_CREATE|JFS_FINDDIR:
 749			 *
 750			 * save search result
 751			 */
 752			*data = 0;
 753			btsp = btstack->top;
 754			btsp->bn = bn;
 755			btsp->index = base;
 756			btsp->mp = mp;
 757
 758			rc = 0;
 759			goto dtSearch_Exit1;
 760		}
 761
 762		/*
 763		 * search miss - internal page
 764		 *
 765		 * if base is non-zero, decrement base by one to get the parent
 766		 * entry of the child page to search.
 767		 */
 768		index = base ? base - 1 : base;
 769
 770		/*
 771		 * go down to child page
 772		 */
 773	      getChild:
 774		/* update max. number of pages to split */
 775		if (BT_STACK_FULL(btstack)) {
 776			/* Something's corrupted, mark filesystem dirty so
 777			 * chkdsk will fix it.
 778			 */
 779			jfs_error(sb, "stack overrun in dtSearch!");
 780			BT_STACK_DUMP(btstack);
 781			rc = -EIO;
 782			goto out;
 783		}
 784		btstack->nsplit++;
 785
 786		/* push (bn, index) of the parent page/entry */
 787		BT_PUSH(btstack, bn, index);
 788
 789		/* get the child page block number */
 790		pxd = (pxd_t *) & p->slot[stbl[index]];
 791		bn = addressPXD(pxd);
 792		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
 793
 794		/* unpin the parent page */
 795		DT_PUTPAGE(mp);
 796	}
 797
 798      out:
 799	DT_PUTPAGE(mp);
 800
 801      dtSearch_Exit1:
 802
 803	kfree(ciKey.name);
 804
 805      dtSearch_Exit2:
 806
 807	return rc;
 808}
 809
 810
 811/*
 812 *	dtInsert()
 813 *
 814 * function: insert an entry to directory tree
 815 *
 816 * parameter:
 817 *
 818 * return: 0 - success;
 819 *	   errno - failure;
 820 */
 821int dtInsert(tid_t tid, struct inode *ip,
 822	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
 823{
 824	int rc = 0;
 825	struct metapage *mp;	/* meta-page buffer */
 826	dtpage_t *p;		/* base B+-tree index page */
 827	s64 bn;
 828	int index;
 829	struct dtsplit split;	/* split information */
 830	ddata_t data;
 831	struct dt_lock *dtlck;
 832	int n;
 833	struct tlock *tlck;
 834	struct lv *lv;
 835
 836	/*
 837	 *	retrieve search result
 838	 *
 839	 * dtSearch() returns (leaf page pinned, index at which to insert).
 840	 * n.b. dtSearch() may return index of (maxindex + 1) of
 841	 * the full page.
 842	 */
 843	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
 844
 845	/*
 846	 *	insert entry for new key
 847	 */
 848	if (DO_INDEX(ip)) {
 849		if (JFS_IP(ip)->next_index == DIREND) {
 850			DT_PUTPAGE(mp);
 851			return -EMLINK;
 852		}
 853		n = NDTLEAF(name->namlen);
 854		data.leaf.tid = tid;
 855		data.leaf.ip = ip;
 856	} else {
 857		n = NDTLEAF_LEGACY(name->namlen);
 858		data.leaf.ip = NULL;	/* signifies legacy directory format */
 859	}
 860	data.leaf.ino = *fsn;
 861
 862	/*
 863	 *	leaf page does not have enough room for new entry:
 864	 *
 865	 *	extend/split the leaf page;
 866	 *
 867	 * dtSplitUp() will insert the entry and unpin the leaf page.
 868	 */
 869	if (n > p->header.freecnt) {
 870		split.mp = mp;
 871		split.index = index;
 872		split.nslot = n;
 873		split.key = name;
 874		split.data = &data;
 875		rc = dtSplitUp(tid, ip, &split, btstack);
 876		return rc;
 877	}
 878
 879	/*
 880	 *	leaf page does have enough room for new entry:
 881	 *
 882	 *	insert the new data entry into the leaf page;
 883	 */
 884	BT_MARK_DIRTY(mp, ip);
 885	/*
 886	 * acquire a transaction lock on the leaf page
 887	 */
 888	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
 889	dtlck = (struct dt_lock *) & tlck->lock;
 890	ASSERT(dtlck->index == 0);
 891	lv = & dtlck->lv[0];
 892
 893	/* linelock header */
 894	lv->offset = 0;
 895	lv->length = 1;
 896	dtlck->index++;
 897
 898	dtInsertEntry(p, index, name, &data, &dtlck);
 899
 900	/* linelock stbl of non-root leaf page */
 901	if (!(p->header.flag & BT_ROOT)) {
 902		if (dtlck->index >= dtlck->maxcnt)
 903			dtlck = (struct dt_lock *) txLinelock(dtlck);
 904		lv = & dtlck->lv[dtlck->index];
 905		n = index >> L2DTSLOTSIZE;
 906		lv->offset = p->header.stblindex + n;
 907		lv->length =
 908		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
 909		dtlck->index++;
 910	}
 911
 912	/* unpin the leaf page */
 913	DT_PUTPAGE(mp);
 914
 915	return 0;
 916}
 917
 918
 919/*
 920 *	dtSplitUp()
 921 *
 922 * function: propagate insertion bottom up;
 923 *
 924 * parameter:
 925 *
 926 * return: 0 - success;
 927 *	   errno - failure;
 928 *	leaf page unpinned;
 929 */
 930static int dtSplitUp(tid_t tid,
 931	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
 932{
 933	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 934	int rc = 0;
 935	struct metapage *smp;
 936	dtpage_t *sp;		/* split page */
 937	struct metapage *rmp;
 938	dtpage_t *rp;		/* new right page split from sp */
 939	pxd_t rpxd;		/* new right page extent descriptor */
 940	struct metapage *lmp;
 941	dtpage_t *lp;		/* left child page */
 942	int skip;		/* index of entry of insertion */
 943	struct btframe *parent;	/* parent page entry on traverse stack */
 944	s64 xaddr, nxaddr;
 945	int xlen, xsize;
 946	struct pxdlist pxdlist;
 947	pxd_t *pxd;
 948	struct component_name key = { 0, NULL };
 949	ddata_t *data = split->data;
 950	int n;
 951	struct dt_lock *dtlck;
 952	struct tlock *tlck;
 953	struct lv *lv;
 954	int quota_allocation = 0;
 955
 956	/* get split page */
 957	smp = split->mp;
 958	sp = DT_PAGE(ip, smp);
 959
 960	key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
 961	if (!key.name) {
 962		DT_PUTPAGE(smp);
 963		rc = -ENOMEM;
 964		goto dtSplitUp_Exit;
 965	}
 966
 967	/*
 968	 *	split leaf page
 969	 *
 970	 * The split routines insert the new entry, and
 971	 * acquire txLock as appropriate.
 972	 */
 973	/*
 974	 *	split root leaf page:
 975	 */
 976	if (sp->header.flag & BT_ROOT) {
 977		/*
 978		 * allocate a single extent child page
 979		 */
 980		xlen = 1;
 981		n = sbi->bsize >> L2DTSLOTSIZE;
 982		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
 983		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
 984		if (n <= split->nslot)
 985			xlen++;
 986		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
 987			DT_PUTPAGE(smp);
 988			goto freeKeyName;
 989		}
 990
 991		pxdlist.maxnpxd = 1;
 992		pxdlist.npxd = 0;
 993		pxd = &pxdlist.pxd[0];
 994		PXDaddress(pxd, xaddr);
 995		PXDlength(pxd, xlen);
 996		split->pxdlist = &pxdlist;
 997		rc = dtSplitRoot(tid, ip, split, &rmp);
 998
 999		if (rc)
1000			dbFree(ip, xaddr, xlen);
1001		else
1002			DT_PUTPAGE(rmp);
1003
1004		DT_PUTPAGE(smp);
1005
1006		if (!DO_INDEX(ip))
1007			ip->i_size = xlen << sbi->l2bsize;
1008
1009		goto freeKeyName;
1010	}
1011
1012	/*
1013	 *	extend first leaf page
1014	 *
1015	 * extend the 1st extent if less than buffer page size
1016	 * (dtExtendPage() reurns leaf page unpinned)
1017	 */
1018	pxd = &sp->header.self;
1019	xlen = lengthPXD(pxd);
1020	xsize = xlen << sbi->l2bsize;
1021	if (xsize < PSIZE) {
1022		xaddr = addressPXD(pxd);
1023		n = xsize >> L2DTSLOTSIZE;
1024		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
1025		if ((n + sp->header.freecnt) <= split->nslot)
1026			n = xlen + (xlen << 1);
1027		else
1028			n = xlen;
1029
1030		/* Allocate blocks to quota. */
1031		rc = dquot_alloc_block(ip, n);
1032		if (rc)
1033			goto extendOut;
1034		quota_allocation += n;
1035
1036		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1037				    (s64) n, &nxaddr)))
1038			goto extendOut;
1039
1040		pxdlist.maxnpxd = 1;
1041		pxdlist.npxd = 0;
1042		pxd = &pxdlist.pxd[0];
1043		PXDaddress(pxd, nxaddr)
1044		    PXDlength(pxd, xlen + n);
1045		split->pxdlist = &pxdlist;
1046		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047			nxaddr = addressPXD(pxd);
1048			if (xaddr != nxaddr) {
1049				/* free relocated extent */
1050				xlen = lengthPXD(pxd);
1051				dbFree(ip, nxaddr, (s64) xlen);
1052			} else {
1053				/* free extended delta */
1054				xlen = lengthPXD(pxd) - n;
1055				xaddr = addressPXD(pxd) + xlen;
1056				dbFree(ip, xaddr, (s64) n);
1057			}
1058		} else if (!DO_INDEX(ip))
1059			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
1060
1061
1062	      extendOut:
1063		DT_PUTPAGE(smp);
1064		goto freeKeyName;
1065	}
1066
1067	/*
1068	 *	split leaf page <sp> into <sp> and a new right page <rp>.
1069	 *
1070	 * return <rp> pinned and its extent descriptor <rpxd>
1071	 */
1072	/*
1073	 * allocate new directory page extent and
1074	 * new index page(s) to cover page split(s)
1075	 *
1076	 * allocation hint: ?
1077	 */
1078	n = btstack->nsplit;
1079	pxdlist.maxnpxd = pxdlist.npxd = 0;
1080	xlen = sbi->nbperpage;
1081	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1082		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1083			PXDaddress(pxd, xaddr);
1084			PXDlength(pxd, xlen);
1085			pxdlist.maxnpxd++;
1086			continue;
1087		}
1088
1089		DT_PUTPAGE(smp);
1090
1091		/* undo allocation */
1092		goto splitOut;
1093	}
1094
1095	split->pxdlist = &pxdlist;
1096	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1097		DT_PUTPAGE(smp);
1098
1099		/* undo allocation */
1100		goto splitOut;
1101	}
1102
1103	if (!DO_INDEX(ip))
1104		ip->i_size += PSIZE;
1105
1106	/*
1107	 * propagate up the router entry for the leaf page just split
1108	 *
1109	 * insert a router entry for the new page into the parent page,
1110	 * propagate the insert/split up the tree by walking back the stack
1111	 * of (bn of parent page, index of child page entry in parent page)
1112	 * that were traversed during the search for the page that split.
1113	 *
1114	 * the propagation of insert/split up the tree stops if the root
1115	 * splits or the page inserted into doesn't have to split to hold
1116	 * the new entry.
1117	 *
1118	 * the parent entry for the split page remains the same, and
1119	 * a new entry is inserted at its right with the first key and
1120	 * block number of the new right page.
1121	 *
1122	 * There are a maximum of 4 pages pinned at any time:
1123	 * two children, left parent and right parent (when the parent splits).
1124	 * keep the child pages pinned while working on the parent.
1125	 * make sure that all pins are released at exit.
1126	 */
1127	while ((parent = BT_POP(btstack)) != NULL) {
1128		/* parent page specified by stack frame <parent> */
1129
1130		/* keep current child pages (<lp>, <rp>) pinned */
1131		lmp = smp;
1132		lp = sp;
1133
1134		/*
1135		 * insert router entry in parent for new right child page <rp>
1136		 */
1137		/* get the parent page <sp> */
1138		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1139		if (rc) {
1140			DT_PUTPAGE(lmp);
1141			DT_PUTPAGE(rmp);
1142			goto splitOut;
1143		}
1144
1145		/*
1146		 * The new key entry goes ONE AFTER the index of parent entry,
1147		 * because the split was to the right.
1148		 */
1149		skip = parent->index + 1;
1150
1151		/*
1152		 * compute the key for the router entry
1153		 *
1154		 * key suffix compression:
1155		 * for internal pages that have leaf pages as children,
1156		 * retain only what's needed to distinguish between
1157		 * the new entry and the entry on the page to its left.
1158		 * If the keys compare equal, retain the entire key.
1159		 *
1160		 * note that compression is performed only at computing
1161		 * router key at the lowest internal level.
1162		 * further compression of the key between pairs of higher
1163		 * level internal pages loses too much information and
1164		 * the search may fail.
1165		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1166		 * results in two adjacent parent entries (a)(xx).
1167		 * if split occurs between these two entries, and
1168		 * if compression is applied, the router key of parent entry
1169		 * of right page (x) will divert search for x into right
1170		 * subtree and miss x in the left subtree.)
1171		 *
1172		 * the entire key must be retained for the next-to-leftmost
1173		 * internal key at any level of the tree, or search may fail
1174		 * (e.g., ?)
1175		 */
1176		switch (rp->header.flag & BT_TYPE) {
1177		case BT_LEAF:
1178			/*
1179			 * compute the length of prefix for suffix compression
1180			 * between last entry of left page and first entry
1181			 * of right page
1182			 */
1183			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1184			    sp->header.prev != 0 || skip > 1) {
1185				/* compute uppercase router prefix key */
1186				rc = ciGetLeafPrefixKey(lp,
1187							lp->header.nextindex-1,
1188							rp, 0, &key,
1189							sbi->mntflag);
1190				if (rc) {
1191					DT_PUTPAGE(lmp);
1192					DT_PUTPAGE(rmp);
1193					DT_PUTPAGE(smp);
1194					goto splitOut;
1195				}
1196			} else {
1197				/* next to leftmost entry of
1198				   lowest internal level */
1199
1200				/* compute uppercase router key */
1201				dtGetKey(rp, 0, &key, sbi->mntflag);
1202				key.name[key.namlen] = 0;
1203
1204				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1205					ciToUpper(&key);
1206			}
1207
1208			n = NDTINTERNAL(key.namlen);
1209			break;
1210
1211		case BT_INTERNAL:
1212			dtGetKey(rp, 0, &key, sbi->mntflag);
1213			n = NDTINTERNAL(key.namlen);
1214			break;
1215
1216		default:
1217			jfs_err("dtSplitUp(): UFO!");
1218			break;
1219		}
1220
1221		/* unpin left child page */
1222		DT_PUTPAGE(lmp);
1223
1224		/*
1225		 * compute the data for the router entry
1226		 */
1227		data->xd = rpxd;	/* child page xd */
1228
1229		/*
1230		 * parent page is full - split the parent page
1231		 */
1232		if (n > sp->header.freecnt) {
1233			/* init for parent page split */
1234			split->mp = smp;
1235			split->index = skip;	/* index at insert */
1236			split->nslot = n;
1237			split->key = &key;
1238			/* split->data = data; */
1239
1240			/* unpin right child page */
1241			DT_PUTPAGE(rmp);
1242
1243			/* The split routines insert the new entry,
1244			 * acquire txLock as appropriate.
1245			 * return <rp> pinned and its block number <rbn>.
1246			 */
1247			rc = (sp->header.flag & BT_ROOT) ?
1248			    dtSplitRoot(tid, ip, split, &rmp) :
1249			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1250			if (rc) {
1251				DT_PUTPAGE(smp);
1252				goto splitOut;
1253			}
1254
1255			/* smp and rmp are pinned */
1256		}
1257		/*
1258		 * parent page is not full - insert router entry in parent page
1259		 */
1260		else {
1261			BT_MARK_DIRTY(smp, ip);
1262			/*
1263			 * acquire a transaction lock on the parent page
1264			 */
1265			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1266			dtlck = (struct dt_lock *) & tlck->lock;
1267			ASSERT(dtlck->index == 0);
1268			lv = & dtlck->lv[0];
1269
1270			/* linelock header */
1271			lv->offset = 0;
1272			lv->length = 1;
1273			dtlck->index++;
1274
1275			/* linelock stbl of non-root parent page */
1276			if (!(sp->header.flag & BT_ROOT)) {
1277				lv++;
1278				n = skip >> L2DTSLOTSIZE;
1279				lv->offset = sp->header.stblindex + n;
1280				lv->length =
1281				    ((sp->header.nextindex -
1282				      1) >> L2DTSLOTSIZE) - n + 1;
1283				dtlck->index++;
1284			}
1285
1286			dtInsertEntry(sp, skip, &key, data, &dtlck);
1287
1288			/* exit propagate up */
1289			break;
1290		}
1291	}
1292
1293	/* unpin current split and its right page */
1294	DT_PUTPAGE(smp);
1295	DT_PUTPAGE(rmp);
1296
1297	/*
1298	 * free remaining extents allocated for split
1299	 */
1300      splitOut:
1301	n = pxdlist.npxd;
1302	pxd = &pxdlist.pxd[n];
1303	for (; n < pxdlist.maxnpxd; n++, pxd++)
1304		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1305
1306      freeKeyName:
1307	kfree(key.name);
1308
1309	/* Rollback quota allocation */
1310	if (rc && quota_allocation)
1311		dquot_free_block(ip, quota_allocation);
1312
1313      dtSplitUp_Exit:
1314
1315	return rc;
1316}
1317
1318
1319/*
1320 *	dtSplitPage()
1321 *
1322 * function: Split a non-root page of a btree.
1323 *
1324 * parameter:
1325 *
1326 * return: 0 - success;
1327 *	   errno - failure;
1328 *	return split and new page pinned;
1329 */
1330static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1331	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1332{
1333	int rc = 0;
1334	struct metapage *smp;
1335	dtpage_t *sp;
1336	struct metapage *rmp;
1337	dtpage_t *rp;		/* new right page allocated */
1338	s64 rbn;		/* new right page block number */
1339	struct metapage *mp;
1340	dtpage_t *p;
1341	s64 nextbn;
1342	struct pxdlist *pxdlist;
1343	pxd_t *pxd;
1344	int skip, nextindex, half, left, nxt, off, si;
1345	struct ldtentry *ldtentry;
1346	struct idtentry *idtentry;
1347	u8 *stbl;
1348	struct dtslot *f;
1349	int fsi, stblsize;
1350	int n;
1351	struct dt_lock *sdtlck, *rdtlck;
1352	struct tlock *tlck;
1353	struct dt_lock *dtlck;
1354	struct lv *slv, *rlv, *lv;
1355
1356	/* get split page */
1357	smp = split->mp;
1358	sp = DT_PAGE(ip, smp);
1359
1360	/*
1361	 * allocate the new right page for the split
1362	 */
1363	pxdlist = split->pxdlist;
1364	pxd = &pxdlist->pxd[pxdlist->npxd];
1365	pxdlist->npxd++;
1366	rbn = addressPXD(pxd);
1367	rmp = get_metapage(ip, rbn, PSIZE, 1);
1368	if (rmp == NULL)
1369		return -EIO;
1370
1371	/* Allocate blocks to quota. */
1372	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1373	if (rc) {
1374		release_metapage(rmp);
1375		return rc;
1376	}
1377
1378	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1379
1380	BT_MARK_DIRTY(rmp, ip);
1381	/*
1382	 * acquire a transaction lock on the new right page
1383	 */
1384	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1385	rdtlck = (struct dt_lock *) & tlck->lock;
1386
1387	rp = (dtpage_t *) rmp->data;
1388	*rpp = rp;
1389	rp->header.self = *pxd;
1390
1391	BT_MARK_DIRTY(smp, ip);
1392	/*
1393	 * acquire a transaction lock on the split page
1394	 *
1395	 * action:
1396	 */
1397	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1398	sdtlck = (struct dt_lock *) & tlck->lock;
1399
1400	/* linelock header of split page */
1401	ASSERT(sdtlck->index == 0);
1402	slv = & sdtlck->lv[0];
1403	slv->offset = 0;
1404	slv->length = 1;
1405	sdtlck->index++;
1406
1407	/*
1408	 * initialize/update sibling pointers between sp and rp
1409	 */
1410	nextbn = le64_to_cpu(sp->header.next);
1411	rp->header.next = cpu_to_le64(nextbn);
1412	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1413	sp->header.next = cpu_to_le64(rbn);
1414
1415	/*
1416	 * initialize new right page
1417	 */
1418	rp->header.flag = sp->header.flag;
1419
1420	/* compute sorted entry table at start of extent data area */
1421	rp->header.nextindex = 0;
1422	rp->header.stblindex = 1;
1423
1424	n = PSIZE >> L2DTSLOTSIZE;
1425	rp->header.maxslot = n;
1426	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1427
1428	/* init freelist */
1429	fsi = rp->header.stblindex + stblsize;
1430	rp->header.freelist = fsi;
1431	rp->header.freecnt = rp->header.maxslot - fsi;
1432
1433	/*
1434	 *	sequential append at tail: append without split
1435	 *
1436	 * If splitting the last page on a level because of appending
1437	 * a entry to it (skip is maxentry), it's likely that the access is
1438	 * sequential. Adding an empty page on the side of the level is less
1439	 * work and can push the fill factor much higher than normal.
1440	 * If we're wrong it's no big deal, we'll just do the split the right
1441	 * way next time.
1442	 * (It may look like it's equally easy to do a similar hack for
1443	 * reverse sorted data, that is, split the tree left,
1444	 * but it's not. Be my guest.)
1445	 */
1446	if (nextbn == 0 && split->index == sp->header.nextindex) {
1447		/* linelock header + stbl (first slot) of new page */
1448		rlv = & rdtlck->lv[rdtlck->index];
1449		rlv->offset = 0;
1450		rlv->length = 2;
1451		rdtlck->index++;
1452
1453		/*
1454		 * initialize freelist of new right page
1455		 */
1456		f = &rp->slot[fsi];
1457		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1458			f->next = fsi;
1459		f->next = -1;
1460
1461		/* insert entry at the first entry of the new right page */
1462		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1463
1464		goto out;
1465	}
1466
1467	/*
1468	 *	non-sequential insert (at possibly middle page)
1469	 */
1470
1471	/*
1472	 * update prev pointer of previous right sibling page;
1473	 */
1474	if (nextbn != 0) {
1475		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1476		if (rc) {
1477			discard_metapage(rmp);
1478			return rc;
1479		}
1480
1481		BT_MARK_DIRTY(mp, ip);
1482		/*
1483		 * acquire a transaction lock on the next page
1484		 */
1485		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1486		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1487			tlck, ip, mp);
1488		dtlck = (struct dt_lock *) & tlck->lock;
1489
1490		/* linelock header of previous right sibling page */
1491		lv = & dtlck->lv[dtlck->index];
1492		lv->offset = 0;
1493		lv->length = 1;
1494		dtlck->index++;
1495
1496		p->header.prev = cpu_to_le64(rbn);
1497
1498		DT_PUTPAGE(mp);
1499	}
1500
1501	/*
1502	 * split the data between the split and right pages.
1503	 */
1504	skip = split->index;
1505	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1506	left = 0;
1507
1508	/*
1509	 *	compute fill factor for split pages
1510	 *
1511	 * <nxt> traces the next entry to move to rp
1512	 * <off> traces the next entry to stay in sp
1513	 */
1514	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1515	nextindex = sp->header.nextindex;
1516	for (nxt = off = 0; nxt < nextindex; ++off) {
1517		if (off == skip)
1518			/* check for fill factor with new entry size */
1519			n = split->nslot;
1520		else {
1521			si = stbl[nxt];
1522			switch (sp->header.flag & BT_TYPE) {
1523			case BT_LEAF:
1524				ldtentry = (struct ldtentry *) & sp->slot[si];
1525				if (DO_INDEX(ip))
1526					n = NDTLEAF(ldtentry->namlen);
1527				else
1528					n = NDTLEAF_LEGACY(ldtentry->
1529							   namlen);
1530				break;
1531
1532			case BT_INTERNAL:
1533				idtentry = (struct idtentry *) & sp->slot[si];
1534				n = NDTINTERNAL(idtentry->namlen);
1535				break;
1536
1537			default:
1538				break;
1539			}
1540
1541			++nxt;	/* advance to next entry to move in sp */
1542		}
1543
1544		left += n;
1545		if (left >= half)
1546			break;
1547	}
1548
1549	/* <nxt> poins to the 1st entry to move */
1550
1551	/*
1552	 *	move entries to right page
1553	 *
1554	 * dtMoveEntry() initializes rp and reserves entry for insertion
1555	 *
1556	 * split page moved out entries are linelocked;
1557	 * new/right page moved in entries are linelocked;
1558	 */
1559	/* linelock header + stbl of new right page */
1560	rlv = & rdtlck->lv[rdtlck->index];
1561	rlv->offset = 0;
1562	rlv->length = 5;
1563	rdtlck->index++;
1564
1565	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1566
1567	sp->header.nextindex = nxt;
1568
1569	/*
1570	 * finalize freelist of new right page
1571	 */
1572	fsi = rp->header.freelist;
1573	f = &rp->slot[fsi];
1574	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1575		f->next = fsi;
1576	f->next = -1;
1577
1578	/*
1579	 * Update directory index table for entries now in right page
1580	 */
1581	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1582		s64 lblock;
1583
1584		mp = NULL;
1585		stbl = DT_GETSTBL(rp);
1586		for (n = 0; n < rp->header.nextindex; n++) {
1587			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1588			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1589				     rbn, n, &mp, &lblock);
1590		}
1591		if (mp)
1592			release_metapage(mp);
1593	}
1594
1595	/*
1596	 * the skipped index was on the left page,
1597	 */
1598	if (skip <= off) {
1599		/* insert the new entry in the split page */
1600		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1601
1602		/* linelock stbl of split page */
1603		if (sdtlck->index >= sdtlck->maxcnt)
1604			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1605		slv = & sdtlck->lv[sdtlck->index];
1606		n = skip >> L2DTSLOTSIZE;
1607		slv->offset = sp->header.stblindex + n;
1608		slv->length =
1609		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1610		sdtlck->index++;
1611	}
1612	/*
1613	 * the skipped index was on the right page,
1614	 */
1615	else {
1616		/* adjust the skip index to reflect the new position */
1617		skip -= nxt;
1618
1619		/* insert the new entry in the right page */
1620		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1621	}
1622
1623      out:
1624	*rmpp = rmp;
1625	*rpxdp = *pxd;
1626
1627	return rc;
1628}
1629
1630
1631/*
1632 *	dtExtendPage()
1633 *
1634 * function: extend 1st/only directory leaf page
1635 *
1636 * parameter:
1637 *
1638 * return: 0 - success;
1639 *	   errno - failure;
1640 *	return extended page pinned;
1641 */
1642static int dtExtendPage(tid_t tid,
1643	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1644{
1645	struct super_block *sb = ip->i_sb;
1646	int rc;
1647	struct metapage *smp, *pmp, *mp;
1648	dtpage_t *sp, *pp;
1649	struct pxdlist *pxdlist;
1650	pxd_t *pxd, *tpxd;
1651	int xlen, xsize;
1652	int newstblindex, newstblsize;
1653	int oldstblindex, oldstblsize;
1654	int fsi, last;
1655	struct dtslot *f;
1656	struct btframe *parent;
1657	int n;
1658	struct dt_lock *dtlck;
1659	s64 xaddr, txaddr;
1660	struct tlock *tlck;
1661	struct pxd_lock *pxdlock;
1662	struct lv *lv;
1663	uint type;
1664	struct ldtentry *ldtentry;
1665	u8 *stbl;
1666
1667	/* get page to extend */
1668	smp = split->mp;
1669	sp = DT_PAGE(ip, smp);
1670
1671	/* get parent/root page */
1672	parent = BT_POP(btstack);
1673	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1674	if (rc)
1675		return (rc);
1676
1677	/*
1678	 *	extend the extent
1679	 */
1680	pxdlist = split->pxdlist;
1681	pxd = &pxdlist->pxd[pxdlist->npxd];
1682	pxdlist->npxd++;
1683
1684	xaddr = addressPXD(pxd);
1685	tpxd = &sp->header.self;
1686	txaddr = addressPXD(tpxd);
1687	/* in-place extension */
1688	if (xaddr == txaddr) {
1689		type = tlckEXTEND;
1690	}
1691	/* relocation */
1692	else {
1693		type = tlckNEW;
1694
1695		/* save moved extent descriptor for later free */
1696		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1697		pxdlock = (struct pxd_lock *) & tlck->lock;
1698		pxdlock->flag = mlckFREEPXD;
1699		pxdlock->pxd = sp->header.self;
1700		pxdlock->index = 1;
1701
1702		/*
1703		 * Update directory index table to reflect new page address
1704		 */
1705		if (DO_INDEX(ip)) {
1706			s64 lblock;
1707
1708			mp = NULL;
1709			stbl = DT_GETSTBL(sp);
1710			for (n = 0; n < sp->header.nextindex; n++) {
1711				ldtentry =
1712				    (struct ldtentry *) & sp->slot[stbl[n]];
1713				modify_index(tid, ip,
1714					     le32_to_cpu(ldtentry->index),
1715					     xaddr, n, &mp, &lblock);
1716			}
1717			if (mp)
1718				release_metapage(mp);
1719		}
1720	}
1721
1722	/*
1723	 *	extend the page
1724	 */
1725	sp->header.self = *pxd;
1726
1727	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1728
1729	BT_MARK_DIRTY(smp, ip);
1730	/*
1731	 * acquire a transaction lock on the extended/leaf page
1732	 */
1733	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1734	dtlck = (struct dt_lock *) & tlck->lock;
1735	lv = & dtlck->lv[0];
1736
1737	/* update buffer extent descriptor of extended page */
1738	xlen = lengthPXD(pxd);
1739	xsize = xlen << JFS_SBI(sb)->l2bsize;
1740
1741	/*
1742	 * copy old stbl to new stbl at start of extended area
1743	 */
1744	oldstblindex = sp->header.stblindex;
1745	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1746	newstblindex = sp->header.maxslot;
1747	n = xsize >> L2DTSLOTSIZE;
1748	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1749	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1750	       sp->header.nextindex);
1751
1752	/*
1753	 * in-line extension: linelock old area of extended page
1754	 */
1755	if (type == tlckEXTEND) {
1756		/* linelock header */
1757		lv->offset = 0;
1758		lv->length = 1;
1759		dtlck->index++;
1760		lv++;
1761
1762		/* linelock new stbl of extended page */
1763		lv->offset = newstblindex;
1764		lv->length = newstblsize;
1765	}
1766	/*
1767	 * relocation: linelock whole relocated area
1768	 */
1769	else {
1770		lv->offset = 0;
1771		lv->length = sp->header.maxslot + newstblsize;
1772	}
1773
1774	dtlck->index++;
1775
1776	sp->header.maxslot = n;
1777	sp->header.stblindex = newstblindex;
1778	/* sp->header.nextindex remains the same */
1779
1780	/*
1781	 * add old stbl region at head of freelist
1782	 */
1783	fsi = oldstblindex;
1784	f = &sp->slot[fsi];
1785	last = sp->header.freelist;
1786	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1787		f->next = last;
1788		last = fsi;
1789	}
1790	sp->header.freelist = last;
1791	sp->header.freecnt += oldstblsize;
1792
1793	/*
1794	 * append free region of newly extended area at tail of freelist
1795	 */
1796	/* init free region of newly extended area */
1797	fsi = n = newstblindex + newstblsize;
1798	f = &sp->slot[fsi];
1799	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1800		f->next = fsi;
1801	f->next = -1;
1802
1803	/* append new free region at tail of old freelist */
1804	fsi = sp->header.freelist;
1805	if (fsi == -1)
1806		sp->header.freelist = n;
1807	else {
1808		do {
1809			f = &sp->slot[fsi];
1810			fsi = f->next;
1811		} while (fsi != -1);
1812
1813		f->next = n;
1814	}
1815
1816	sp->header.freecnt += sp->header.maxslot - n;
1817
1818	/*
1819	 * insert the new entry
1820	 */
1821	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1822
1823	BT_MARK_DIRTY(pmp, ip);
1824	/*
1825	 * linelock any freeslots residing in old extent
1826	 */
1827	if (type == tlckEXTEND) {
1828		n = sp->header.maxslot >> 2;
1829		if (sp->header.freelist < n)
1830			dtLinelockFreelist(sp, n, &dtlck);
1831	}
1832
1833	/*
1834	 *	update parent entry on the parent/root page
1835	 */
1836	/*
1837	 * acquire a transaction lock on the parent/root page
1838	 */
1839	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1840	dtlck = (struct dt_lock *) & tlck->lock;
1841	lv = & dtlck->lv[dtlck->index];
1842
1843	/* linelock parent entry - 1st slot */
1844	lv->offset = 1;
1845	lv->length = 1;
1846	dtlck->index++;
1847
1848	/* update the parent pxd for page extension */
1849	tpxd = (pxd_t *) & pp->slot[1];
1850	*tpxd = *pxd;
1851
1852	DT_PUTPAGE(pmp);
1853	return 0;
1854}
1855
1856
1857/*
1858 *	dtSplitRoot()
1859 *
1860 * function:
1861 *	split the full root page into
1862 *	original/root/split page and new right page
1863 *	i.e., root remains fixed in tree anchor (inode) and
1864 *	the root is copied to a single new right child page
1865 *	since root page << non-root page, and
1866 *	the split root page contains a single entry for the
1867 *	new right child page.
1868 *
1869 * parameter:
1870 *
1871 * return: 0 - success;
1872 *	   errno - failure;
1873 *	return new page pinned;
1874 */
1875static int dtSplitRoot(tid_t tid,
1876	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1877{
1878	struct super_block *sb = ip->i_sb;
1879	struct metapage *smp;
1880	dtroot_t *sp;
1881	struct metapage *rmp;
1882	dtpage_t *rp;
1883	s64 rbn;
1884	int xlen;
1885	int xsize;
1886	struct dtslot *f;
1887	s8 *stbl;
1888	int fsi, stblsize, n;
1889	struct idtentry *s;
1890	pxd_t *ppxd;
1891	struct pxdlist *pxdlist;
1892	pxd_t *pxd;
1893	struct dt_lock *dtlck;
1894	struct tlock *tlck;
1895	struct lv *lv;
1896	int rc;
1897
1898	/* get split root page */
1899	smp = split->mp;
1900	sp = &JFS_IP(ip)->i_dtroot;
1901
1902	/*
1903	 *	allocate/initialize a single (right) child page
1904	 *
1905	 * N.B. at first split, a one (or two) block to fit new entry
1906	 * is allocated; at subsequent split, a full page is allocated;
1907	 */
1908	pxdlist = split->pxdlist;
1909	pxd = &pxdlist->pxd[pxdlist->npxd];
1910	pxdlist->npxd++;
1911	rbn = addressPXD(pxd);
1912	xlen = lengthPXD(pxd);
1913	xsize = xlen << JFS_SBI(sb)->l2bsize;
1914	rmp = get_metapage(ip, rbn, xsize, 1);
1915	if (!rmp)
1916		return -EIO;
1917
1918	rp = rmp->data;
1919
1920	/* Allocate blocks to quota. */
1921	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1922	if (rc) {
1923		release_metapage(rmp);
1924		return rc;
1925	}
1926
1927	BT_MARK_DIRTY(rmp, ip);
1928	/*
1929	 * acquire a transaction lock on the new right page
1930	 */
1931	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1932	dtlck = (struct dt_lock *) & tlck->lock;
1933
1934	rp->header.flag =
1935	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1936	rp->header.self = *pxd;
1937
1938	/* initialize sibling pointers */
1939	rp->header.next = 0;
1940	rp->header.prev = 0;
1941
1942	/*
1943	 *	move in-line root page into new right page extent
1944	 */
1945	/* linelock header + copied entries + new stbl (1st slot) in new page */
1946	ASSERT(dtlck->index == 0);
1947	lv = & dtlck->lv[0];
1948	lv->offset = 0;
1949	lv->length = 10;	/* 1 + 8 + 1 */
1950	dtlck->index++;
1951
1952	n = xsize >> L2DTSLOTSIZE;
1953	rp->header.maxslot = n;
1954	stblsize = (n + 31) >> L2DTSLOTSIZE;
1955
1956	/* copy old stbl to new stbl at start of extended area */
1957	rp->header.stblindex = DTROOTMAXSLOT;
1958	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1959	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1960	rp->header.nextindex = sp->header.nextindex;
1961
1962	/* copy old data area to start of new data area */
1963	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1964
1965	/*
1966	 * append free region of newly extended area at tail of freelist
1967	 */
1968	/* init free region of newly extended area */
1969	fsi = n = DTROOTMAXSLOT + stblsize;
1970	f = &rp->slot[fsi];
1971	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1972		f->next = fsi;
1973	f->next = -1;
1974
1975	/* append new free region at tail of old freelist */
1976	fsi = sp->header.freelist;
1977	if (fsi == -1)
1978		rp->header.freelist = n;
1979	else {
1980		rp->header.freelist = fsi;
1981
1982		do {
1983			f = &rp->slot[fsi];
1984			fsi = f->next;
1985		} while (fsi != -1);
1986
1987		f->next = n;
1988	}
1989
1990	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1991
1992	/*
1993	 * Update directory index table for entries now in right page
1994	 */
1995	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1996		s64 lblock;
1997		struct metapage *mp = NULL;
1998		struct ldtentry *ldtentry;
1999
2000		stbl = DT_GETSTBL(rp);
2001		for (n = 0; n < rp->header.nextindex; n++) {
2002			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
2003			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
2004				     rbn, n, &mp, &lblock);
2005		}
2006		if (mp)
2007			release_metapage(mp);
2008	}
2009	/*
2010	 * insert the new entry into the new right/child page
2011	 * (skip index in the new right page will not change)
2012	 */
2013	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2014
2015	/*
2016	 *	reset parent/root page
2017	 *
2018	 * set the 1st entry offset to 0, which force the left-most key
2019	 * at any level of the tree to be less than any search key.
2020	 *
2021	 * The btree comparison code guarantees that the left-most key on any
2022	 * level of the tree is never used, so it doesn't need to be filled in.
2023	 */
2024	BT_MARK_DIRTY(smp, ip);
2025	/*
2026	 * acquire a transaction lock on the root page (in-memory inode)
2027	 */
2028	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2029	dtlck = (struct dt_lock *) & tlck->lock;
2030
2031	/* linelock root */
2032	ASSERT(dtlck->index == 0);
2033	lv = & dtlck->lv[0];
2034	lv->offset = 0;
2035	lv->length = DTROOTMAXSLOT;
2036	dtlck->index++;
2037
2038	/* update page header of root */
2039	if (sp->header.flag & BT_LEAF) {
2040		sp->header.flag &= ~BT_LEAF;
2041		sp->header.flag |= BT_INTERNAL;
2042	}
2043
2044	/* init the first entry */
2045	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2046	ppxd = (pxd_t *) s;
2047	*ppxd = *pxd;
2048	s->next = -1;
2049	s->namlen = 0;
2050
2051	stbl = sp->header.stbl;
2052	stbl[0] = DTENTRYSTART;
2053	sp->header.nextindex = 1;
2054
2055	/* init freelist */
2056	fsi = DTENTRYSTART + 1;
2057	f = &sp->slot[fsi];
2058
2059	/* init free region of remaining area */
2060	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2061		f->next = fsi;
2062	f->next = -1;
2063
2064	sp->header.freelist = DTENTRYSTART + 1;
2065	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2066
2067	*rmpp = rmp;
2068
2069	return 0;
2070}
2071
2072
2073/*
2074 *	dtDelete()
2075 *
2076 * function: delete the entry(s) referenced by a key.
2077 *
2078 * parameter:
2079 *
2080 * return:
2081 */
2082int dtDelete(tid_t tid,
2083	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2084{
2085	int rc = 0;
2086	s64 bn;
2087	struct metapage *mp, *imp;
2088	dtpage_t *p;
2089	int index;
2090	struct btstack btstack;
2091	struct dt_lock *dtlck;
2092	struct tlock *tlck;
2093	struct lv *lv;
2094	int i;
2095	struct ldtentry *ldtentry;
2096	u8 *stbl;
2097	u32 table_index, next_index;
2098	struct metapage *nmp;
2099	dtpage_t *np;
2100
2101	/*
2102	 *	search for the entry to delete:
2103	 *
2104	 * dtSearch() returns (leaf page pinned, index at which to delete).
2105	 */
2106	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2107		return rc;
2108
2109	/* retrieve search result */
2110	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2111
2112	/*
2113	 * We need to find put the index of the next entry into the
2114	 * directory index table in order to resume a readdir from this
2115	 * entry.
2116	 */
2117	if (DO_INDEX(ip)) {
2118		stbl = DT_GETSTBL(p);
2119		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2120		table_index = le32_to_cpu(ldtentry->index);
2121		if (index == (p->header.nextindex - 1)) {
2122			/*
2123			 * Last entry in this leaf page
2124			 */
2125			if ((p->header.flag & BT_ROOT)
2126			    || (p->header.next == 0))
2127				next_index = -1;
2128			else {
2129				/* Read next leaf page */
2130				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2131					   nmp, PSIZE, np, rc);
2132				if (rc)
2133					next_index = -1;
2134				else {
2135					stbl = DT_GETSTBL(np);
2136					ldtentry =
2137					    (struct ldtentry *) & np->
2138					    slot[stbl[0]];
2139					next_index =
2140					    le32_to_cpu(ldtentry->index);
2141					DT_PUTPAGE(nmp);
2142				}
2143			}
2144		} else {
2145			ldtentry =
2146			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2147			next_index = le32_to_cpu(ldtentry->index);
2148		}
2149		free_index(tid, ip, table_index, next_index);
2150	}
2151	/*
2152	 * the leaf page becomes empty, delete the page
2153	 */
2154	if (p->header.nextindex == 1) {
2155		/* delete empty page */
2156		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2157	}
2158	/*
2159	 * the leaf page has other entries remaining:
2160	 *
2161	 * delete the entry from the leaf page.
2162	 */
2163	else {
2164		BT_MARK_DIRTY(mp, ip);
2165		/*
2166		 * acquire a transaction lock on the leaf page
2167		 */
2168		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2169		dtlck = (struct dt_lock *) & tlck->lock;
2170
2171		/*
2172		 * Do not assume that dtlck->index will be zero.  During a
2173		 * rename within a directory, this transaction may have
2174		 * modified this page already when adding the new entry.
2175		 */
2176
2177		/* linelock header */
2178		if (dtlck->index >= dtlck->maxcnt)
2179			dtlck = (struct dt_lock *) txLinelock(dtlck);
2180		lv = & dtlck->lv[dtlck->index];
2181		lv->offset = 0;
2182		lv->length = 1;
2183		dtlck->index++;
2184
2185		/* linelock stbl of non-root leaf page */
2186		if (!(p->header.flag & BT_ROOT)) {
2187			if (dtlck->index >= dtlck->maxcnt)
2188				dtlck = (struct dt_lock *) txLinelock(dtlck);
2189			lv = & dtlck->lv[dtlck->index];
2190			i = index >> L2DTSLOTSIZE;
2191			lv->offset = p->header.stblindex + i;
2192			lv->length =
2193			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2194			    i + 1;
2195			dtlck->index++;
2196		}
2197
2198		/* free the leaf entry */
2199		dtDeleteEntry(p, index, &dtlck);
2200
2201		/*
2202		 * Update directory index table for entries moved in stbl
2203		 */
2204		if (DO_INDEX(ip) && index < p->header.nextindex) {
2205			s64 lblock;
2206
2207			imp = NULL;
2208			stbl = DT_GETSTBL(p);
2209			for (i = index; i < p->header.nextindex; i++) {
2210				ldtentry =
2211				    (struct ldtentry *) & p->slot[stbl[i]];
2212				modify_index(tid, ip,
2213					     le32_to_cpu(ldtentry->index),
2214					     bn, i, &imp, &lblock);
2215			}
2216			if (imp)
2217				release_metapage(imp);
2218		}
2219
2220		DT_PUTPAGE(mp);
2221	}
2222
2223	return rc;
2224}
2225
2226
2227/*
2228 *	dtDeleteUp()
2229 *
2230 * function:
2231 *	free empty pages as propagating deletion up the tree
2232 *
2233 * parameter:
2234 *
2235 * return:
2236 */
2237static int dtDeleteUp(tid_t tid, struct inode *ip,
2238	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2239{
2240	int rc = 0;
2241	struct metapage *mp;
2242	dtpage_t *p;
2243	int index, nextindex;
2244	int xlen;
2245	struct btframe *parent;
2246	struct dt_lock *dtlck;
2247	struct tlock *tlck;
2248	struct lv *lv;
2249	struct pxd_lock *pxdlock;
2250	int i;
2251
2252	/*
2253	 *	keep the root leaf page which has become empty
2254	 */
2255	if (BT_IS_ROOT(fmp)) {
2256		/*
2257		 * reset the root
2258		 *
2259		 * dtInitRoot() acquires txlock on the root
2260		 */
2261		dtInitRoot(tid, ip, PARENT(ip));
2262
2263		DT_PUTPAGE(fmp);
2264
2265		return 0;
2266	}
2267
2268	/*
2269	 *	free the non-root leaf page
2270	 */
2271	/*
2272	 * acquire a transaction lock on the page
2273	 *
2274	 * write FREEXTENT|NOREDOPAGE log record
2275	 * N.B. linelock is overlaid as freed extent descriptor, and
2276	 * the buffer page is freed;
2277	 */
2278	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2279	pxdlock = (struct pxd_lock *) & tlck->lock;
2280	pxdlock->flag = mlckFREEPXD;
2281	pxdlock->pxd = fp->header.self;
2282	pxdlock->index = 1;
2283
2284	/* update sibling pointers */
2285	if ((rc = dtRelink(tid, ip, fp))) {
2286		BT_PUTPAGE(fmp);
2287		return rc;
2288	}
2289
2290	xlen = lengthPXD(&fp->header.self);
2291
2292	/* Free quota allocation. */
2293	dquot_free_block(ip, xlen);
2294
2295	/* free/invalidate its buffer page */
2296	discard_metapage(fmp);
2297
2298	/*
2299	 *	propagate page deletion up the directory tree
2300	 *
2301	 * If the delete from the parent page makes it empty,
2302	 * continue all the way up the tree.
2303	 * stop if the root page is reached (which is never deleted) or
2304	 * if the entry deletion does not empty the page.
2305	 */
2306	while ((parent = BT_POP(btstack)) != NULL) {
2307		/* pin the parent page <sp> */
2308		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2309		if (rc)
2310			return rc;
2311
2312		/*
2313		 * free the extent of the child page deleted
2314		 */
2315		index = parent->index;
2316
2317		/*
2318		 * delete the entry for the child page from parent
2319		 */
2320		nextindex = p->header.nextindex;
2321
2322		/*
2323		 * the parent has the single entry being deleted:
2324		 *
2325		 * free the parent page which has become empty.
2326		 */
2327		if (nextindex == 1) {
2328			/*
2329			 * keep the root internal page which has become empty
2330			 */
2331			if (p->header.flag & BT_ROOT) {
2332				/*
2333				 * reset the root
2334				 *
2335				 * dtInitRoot() acquires txlock on the root
2336				 */
2337				dtInitRoot(tid, ip, PARENT(ip));
2338
2339				DT_PUTPAGE(mp);
2340
2341				return 0;
2342			}
2343			/*
2344			 * free the parent page
2345			 */
2346			else {
2347				/*
2348				 * acquire a transaction lock on the page
2349				 *
2350				 * write FREEXTENT|NOREDOPAGE log record
2351				 */
2352				tlck =
2353				    txMaplock(tid, ip,
2354					      tlckDTREE | tlckFREE);
2355				pxdlock = (struct pxd_lock *) & tlck->lock;
2356				pxdlock->flag = mlckFREEPXD;
2357				pxdlock->pxd = p->header.self;
2358				pxdlock->index = 1;
2359
2360				/* update sibling pointers */
2361				if ((rc = dtRelink(tid, ip, p))) {
2362					DT_PUTPAGE(mp);
2363					return rc;
2364				}
2365
2366				xlen = lengthPXD(&p->header.self);
2367
2368				/* Free quota allocation */
2369				dquot_free_block(ip, xlen);
2370
2371				/* free/invalidate its buffer page */
2372				discard_metapage(mp);
2373
2374				/* propagate up */
2375				continue;
2376			}
2377		}
2378
2379		/*
2380		 * the parent has other entries remaining:
2381		 *
2382		 * delete the router entry from the parent page.
2383		 */
2384		BT_MARK_DIRTY(mp, ip);
2385		/*
2386		 * acquire a transaction lock on the page
2387		 *
2388		 * action: router entry deletion
2389		 */
2390		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2391		dtlck = (struct dt_lock *) & tlck->lock;
2392
2393		/* linelock header */
2394		if (dtlck->index >= dtlck->maxcnt)
2395			dtlck = (struct dt_lock *) txLinelock(dtlck);
2396		lv = & dtlck->lv[dtlck->index];
2397		lv->offset = 0;
2398		lv->length = 1;
2399		dtlck->index++;
2400
2401		/* linelock stbl of non-root leaf page */
2402		if (!(p->header.flag & BT_ROOT)) {
2403			if (dtlck->index < dtlck->maxcnt)
2404				lv++;
2405			else {
2406				dtlck = (struct dt_lock *) txLinelock(dtlck);
2407				lv = & dtlck->lv[0];
2408			}
2409			i = index >> L2DTSLOTSIZE;
2410			lv->offset = p->header.stblindex + i;
2411			lv->length =
2412			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2413			    i + 1;
2414			dtlck->index++;
2415		}
2416
2417		/* free the router entry */
2418		dtDeleteEntry(p, index, &dtlck);
2419
2420		/* reset key of new leftmost entry of level (for consistency) */
2421		if (index == 0 &&
2422		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2423			dtTruncateEntry(p, 0, &dtlck);
2424
2425		/* unpin the parent page */
2426		DT_PUTPAGE(mp);
2427
2428		/* exit propagation up */
2429		break;
2430	}
2431
2432	if (!DO_INDEX(ip))
2433		ip->i_size -= PSIZE;
2434
2435	return 0;
2436}
2437
2438#ifdef _NOTYET
2439/*
2440 * NAME:	dtRelocate()
2441 *
2442 * FUNCTION:	relocate dtpage (internal or leaf) of directory;
2443 *		This function is mainly used by defragfs utility.
2444 */
2445int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2446	       s64 nxaddr)
2447{
2448	int rc = 0;
2449	struct metapage *mp, *pmp, *lmp, *rmp;
2450	dtpage_t *p, *pp, *rp = 0, *lp= 0;
2451	s64 bn;
2452	int index;
2453	struct btstack btstack;
2454	pxd_t *pxd;
2455	s64 oxaddr, nextbn, prevbn;
2456	int xlen, xsize;
2457	struct tlock *tlck;
2458	struct dt_lock *dtlck;
2459	struct pxd_lock *pxdlock;
2460	s8 *stbl;
2461	struct lv *lv;
2462
2463	oxaddr = addressPXD(opxd);
2464	xlen = lengthPXD(opxd);
2465
2466	jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
2467		   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
2468		   xlen);
2469
2470	/*
2471	 *	1. get the internal parent dtpage covering
2472	 *	router entry for the tartget page to be relocated;
2473	 */
2474	rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2475	if (rc)
2476		return rc;
2477
2478	/* retrieve search result */
2479	DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2480	jfs_info("dtRelocate: parent router entry validated.");
2481
2482	/*
2483	 *	2. relocate the target dtpage
2484	 */
2485	/* read in the target page from src extent */
2486	DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2487	if (rc) {
2488		/* release the pinned parent page */
2489		DT_PUTPAGE(pmp);
2490		return rc;
2491	}
2492
2493	/*
2494	 * read in sibling pages if any to update sibling pointers;
2495	 */
2496	rmp = NULL;
2497	if (p->header.next) {
2498		nextbn = le64_to_cpu(p->header.next);
2499		DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2500		if (rc) {
2501			DT_PUTPAGE(mp);
2502			DT_PUTPAGE(pmp);
2503			return (rc);
2504		}
2505	}
2506
2507	lmp = NULL;
2508	if (p->header.prev) {
2509		prevbn = le64_to_cpu(p->header.prev);
2510		DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2511		if (rc) {
2512			DT_PUTPAGE(mp);
2513			DT_PUTPAGE(pmp);
2514			if (rmp)
2515				DT_PUTPAGE(rmp);
2516			return (rc);
2517		}
2518	}
2519
2520	/* at this point, all xtpages to be updated are in memory */
2521
2522	/*
2523	 * update sibling pointers of sibling dtpages if any;
2524	 */
2525	if (lmp) {
2526		tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
2527		dtlck = (struct dt_lock *) & tlck->lock;
2528		/* linelock header */
2529		ASSERT(dtlck->index == 0);
2530		lv = & dtlck->lv[0];
2531		lv->offset = 0;
2532		lv->length = 1;
2533		dtlck->index++;
2534
2535		lp->header.next = cpu_to_le64(nxaddr);
2536		DT_PUTPAGE(lmp);
2537	}
2538
2539	if (rmp) {
2540		tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
2541		dtlck = (struct dt_lock *) & tlck->lock;
2542		/* linelock header */
2543		ASSERT(dtlck->index == 0);
2544		lv = & dtlck->lv[0];
2545		lv->offset = 0;
2546		lv->length = 1;
2547		dtlck->index++;
2548
2549		rp->header.prev = cpu_to_le64(nxaddr);
2550		DT_PUTPAGE(rmp);
2551	}
2552
2553	/*
2554	 * update the target dtpage to be relocated
2555	 *
2556	 * write LOG_REDOPAGE of LOG_NEW type for dst page
2557	 * for the whole target page (logredo() will apply
2558	 * after image and update bmap for allocation of the
2559	 * dst extent), and update bmap for allocation of
2560	 * the dst extent;
2561	 */
2562	tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
2563	dtlck = (struct dt_lock *) & tlck->lock;
2564	/* linelock header */
2565	ASSERT(dtlck->index == 0);
2566	lv = & dtlck->lv[0];
2567
2568	/* update the self address in the dtpage header */
2569	pxd = &p->header.self;
2570	PXDaddress(pxd, nxaddr);
2571
2572	/* the dst page is the same as the src page, i.e.,
2573	 * linelock for afterimage of the whole page;
2574	 */
2575	lv->offset = 0;
2576	lv->length = p->header.maxslot;
2577	dtlck->index++;
2578
2579	/* update the buffer extent descriptor of the dtpage */
2580	xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2581
2582	/* unpin the relocated page */
2583	DT_PUTPAGE(mp);
2584	jfs_info("dtRelocate: target dtpage relocated.");
2585
2586	/* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
2587	 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
2588	 * will also force a bmap update ).
2589	 */
2590
2591	/*
2592	 *	3. acquire maplock for the source extent to be freed;
2593	 */
2594	/* for dtpage relocation, write a LOG_NOREDOPAGE record
2595	 * for the source dtpage (logredo() will init NoRedoPage
2596	 * filter and will also update bmap for free of the source
2597	 * dtpage), and upadte bmap for free of the source dtpage;
2598	 */
2599	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2600	pxdlock = (struct pxd_lock *) & tlck->lock;
2601	pxdlock->flag = mlckFREEPXD;
2602	PXDaddress(&pxdlock->pxd, oxaddr);
2603	PXDlength(&pxdlock->pxd, xlen);
2604	pxdlock->index = 1;
2605
2606	/*
2607	 *	4. update the parent router entry for relocation;
2608	 *
2609	 * acquire tlck for the parent entry covering the target dtpage;
2610	 * write LOG_REDOPAGE to apply after image only;
2611	 */
2612	jfs_info("dtRelocate: update parent router entry.");
2613	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
2614	dtlck = (struct dt_lock *) & tlck->lock;
2615	lv = & dtlck->lv[dtlck->index];
2616
2617	/* update the PXD with the new address */
2618	stbl = DT_GETSTBL(pp);
2619	pxd = (pxd_t *) & pp->slot[stbl[index]];
2620	PXDaddress(pxd, nxaddr);
2621	lv->offset = stbl[index];
2622	lv->length = 1;
2623	dtlck->index++;
2624
2625	/* unpin the parent dtpage */
2626	DT_PUTPAGE(pmp);
2627
2628	return rc;
2629}
2630
2631/*
2632 * NAME:	dtSearchNode()
2633 *
2634 * FUNCTION:	Search for an dtpage containing a specified address
2635 *		This function is mainly used by defragfs utility.
2636 *
2637 * NOTE:	Search result on stack, the found page is pinned at exit.
2638 *		The result page must be an internal dtpage.
2639 *		lmxaddr give the address of the left most page of the
2640 *		dtree level, in which the required dtpage resides.
2641 */
2642static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2643			struct btstack * btstack)
2644{
2645	int rc = 0;
2646	s64 bn;
2647	struct metapage *mp;
2648	dtpage_t *p;
2649	int psize = 288;	/* initial in-line directory */
2650	s8 *stbl;
2651	int i;
2652	pxd_t *pxd;
2653	struct btframe *btsp;
2654
2655	BT_CLR(btstack);	/* reset stack */
2656
2657	/*
2658	 *	descend tree to the level with specified leftmost page
2659	 *
2660	 *  by convention, root bn = 0.
2661	 */
2662	for (bn = 0;;) {
2663		/* get/pin the page to search */
2664		DT_GETPAGE(ip, bn, mp, psize, p, rc);
2665		if (rc)
2666			return rc;
2667
2668		/* does the xaddr of leftmost page of the levevl
2669		 * matches levevl search key ?
2670		 */
2671		if (p->header.flag & BT_ROOT) {
2672			if (lmxaddr == 0)
2673				break;
2674		} else if (addressPXD(&p->header.self) == lmxaddr)
2675			break;
2676
2677		/*
2678		 * descend down to leftmost child page
2679		 */
2680		if (p->header.flag & BT_LEAF) {
2681			DT_PUTPAGE(mp);
2682			return -ESTALE;
2683		}
2684
2685		/* get the leftmost entry */
2686		stbl = DT_GETSTBL(p);
2687		pxd = (pxd_t *) & p->slot[stbl[0]];
2688
2689		/* get the child page block address */
2690		bn = addressPXD(pxd);
2691		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
2692		/* unpin the parent page */
2693		DT_PUTPAGE(mp);
2694	}
2695
2696	/*
2697	 *	search each page at the current levevl
2698	 */
2699      loop:
2700	stbl = DT_GETSTBL(p);
2701	for (i = 0; i < p->header.nextindex; i++) {
2702		pxd = (pxd_t *) & p->slot[stbl[i]];
2703
2704		/* found the specified router entry */
2705		if (addressPXD(pxd) == addressPXD(kpxd) &&
2706		    lengthPXD(pxd) == lengthPXD(kpxd)) {
2707			btsp = btstack->top;
2708			btsp->bn = bn;
2709			btsp->index = i;
2710			btsp->mp = mp;
2711
2712			return 0;
2713		}
2714	}
2715
2716	/* get the right sibling page if any */
2717	if (p->header.next)
2718		bn = le64_to_cpu(p->header.next);
2719	else {
2720		DT_PUTPAGE(mp);
2721		return -ESTALE;
2722	}
2723
2724	/* unpin current page */
2725	DT_PUTPAGE(mp);
2726
2727	/* get the right sibling page */
2728	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2729	if (rc)
2730		return rc;
2731
2732	goto loop;
2733}
2734#endif /* _NOTYET */
2735
2736/*
2737 *	dtRelink()
2738 *
2739 * function:
2740 *	link around a freed page.
2741 *
2742 * parameter:
2743 *	fp:	page to be freed
2744 *
2745 * return:
2746 */
2747static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2748{
2749	int rc;
2750	struct metapage *mp;
2751	s64 nextbn, prevbn;
2752	struct tlock *tlck;
2753	struct dt_lock *dtlck;
2754	struct lv *lv;
2755
2756	nextbn = le64_to_cpu(p->header.next);
2757	prevbn = le64_to_cpu(p->header.prev);
2758
2759	/* update prev pointer of the next page */
2760	if (nextbn != 0) {
2761		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2762		if (rc)
2763			return rc;
2764
2765		BT_MARK_DIRTY(mp, ip);
2766		/*
2767		 * acquire a transaction lock on the next page
2768		 *
2769		 * action: update prev pointer;
2770		 */
2771		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2772		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2773			tlck, ip, mp);
2774		dtlck = (struct dt_lock *) & tlck->lock;
2775
2776		/* linelock header */
2777		if (dtlck->index >= dtlck->maxcnt)
2778			dtlck = (struct dt_lock *) txLinelock(dtlck);
2779		lv = & dtlck->lv[dtlck->index];
2780		lv->offset = 0;
2781		lv->length = 1;
2782		dtlck->index++;
2783
2784		p->header.prev = cpu_to_le64(prevbn);
2785		DT_PUTPAGE(mp);
2786	}
2787
2788	/* update next pointer of the previous page */
2789	if (prevbn != 0) {
2790		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2791		if (rc)
2792			return rc;
2793
2794		BT_MARK_DIRTY(mp, ip);
2795		/*
2796		 * acquire a transaction lock on the prev page
2797		 *
2798		 * action: update next pointer;
2799		 */
2800		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2801		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2802			tlck, ip, mp);
2803		dtlck = (struct dt_lock *) & tlck->lock;
2804
2805		/* linelock header */
2806		if (dtlck->index >= dtlck->maxcnt)
2807			dtlck = (struct dt_lock *) txLinelock(dtlck);
2808		lv = & dtlck->lv[dtlck->index];
2809		lv->offset = 0;
2810		lv->length = 1;
2811		dtlck->index++;
2812
2813		p->header.next = cpu_to_le64(nextbn);
2814		DT_PUTPAGE(mp);
2815	}
2816
2817	return 0;
2818}
2819
2820
2821/*
2822 *	dtInitRoot()
2823 *
2824 * initialize directory root (inline in inode)
2825 */
2826void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2827{
2828	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2829	dtroot_t *p;
2830	int fsi;
2831	struct dtslot *f;
2832	struct tlock *tlck;
2833	struct dt_lock *dtlck;
2834	struct lv *lv;
2835	u16 xflag_save;
2836
2837	/*
2838	 * If this was previously an non-empty directory, we need to remove
2839	 * the old directory table.
2840	 */
2841	if (DO_INDEX(ip)) {
2842		if (!jfs_dirtable_inline(ip)) {
2843			struct tblock *tblk = tid_to_tblock(tid);
2844			/*
2845			 * We're playing games with the tid's xflag.  If
2846			 * we're removing a regular file, the file's xtree
2847			 * is committed with COMMIT_PMAP, but we always
2848			 * commit the directories xtree with COMMIT_PWMAP.
2849			 */
2850			xflag_save = tblk->xflag;
2851			tblk->xflag = 0;
2852			/*
2853			 * xtTruncate isn't guaranteed to fully truncate
2854			 * the xtree.  The caller needs to check i_size
2855			 * after committing the transaction to see if
2856			 * additional truncation is needed.  The
2857			 * COMMIT_Stale flag tells caller that we
2858			 * initiated the truncation.
2859			 */
2860			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2861			set_cflag(COMMIT_Stale, ip);
2862
2863			tblk->xflag = xflag_save;
2864		} else
2865			ip->i_size = 1;
2866
2867		jfs_ip->next_index = 2;
2868	} else
2869		ip->i_size = IDATASIZE;
2870
2871	/*
2872	 * acquire a transaction lock on the root
2873	 *
2874	 * action: directory initialization;
2875	 */
2876	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2877		      tlckDTREE | tlckENTRY | tlckBTROOT);
2878	dtlck = (struct dt_lock *) & tlck->lock;
2879
2880	/* linelock root */
2881	ASSERT(dtlck->index == 0);
2882	lv = & dtlck->lv[0];
2883	lv->offset = 0;
2884	lv->length = DTROOTMAXSLOT;
2885	dtlck->index++;
2886
2887	p = &jfs_ip->i_dtroot;
2888
2889	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2890
2891	p->header.nextindex = 0;
2892
2893	/* init freelist */
2894	fsi = 1;
2895	f = &p->slot[fsi];
2896
2897	/* init data area of root */
2898	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2899		f->next = fsi;
2900	f->next = -1;
2901
2902	p->header.freelist = 1;
2903	p->header.freecnt = 8;
2904
2905	/* init '..' entry */
2906	p->header.idotdot = cpu_to_le32(idotdot);
2907
2908	return;
2909}
2910
2911/*
2912 *	add_missing_indices()
2913 *
2914 * function: Fix dtree page in which one or more entries has an invalid index.
2915 *	     fsck.jfs should really fix this, but it currently does not.
2916 *	     Called from jfs_readdir when bad index is detected.
2917 */
2918static void add_missing_indices(struct inode *inode, s64 bn)
2919{
2920	struct ldtentry *d;
2921	struct dt_lock *dtlck;
2922	int i;
2923	uint index;
2924	struct lv *lv;
2925	struct metapage *mp;
2926	dtpage_t *p;
2927	int rc;
2928	s8 *stbl;
2929	tid_t tid;
2930	struct tlock *tlck;
2931
2932	tid = txBegin(inode->i_sb, 0);
2933
2934	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2935
2936	if (rc) {
2937		printk(KERN_ERR "DT_GETPAGE failed!\n");
2938		goto end;
2939	}
2940	BT_MARK_DIRTY(mp, inode);
2941
2942	ASSERT(p->header.flag & BT_LEAF);
2943
2944	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2945	if (BT_IS_ROOT(mp))
2946		tlck->type |= tlckBTROOT;
2947
2948	dtlck = (struct dt_lock *) &tlck->lock;
2949
2950	stbl = DT_GETSTBL(p);
2951	for (i = 0; i < p->header.nextindex; i++) {
2952		d = (struct ldtentry *) &p->slot[stbl[i]];
2953		index = le32_to_cpu(d->index);
2954		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2955			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2956			if (dtlck->index >= dtlck->maxcnt)
2957				dtlck = (struct dt_lock *) txLinelock(dtlck);
2958			lv = &dtlck->lv[dtlck->index];
2959			lv->offset = stbl[i];
2960			lv->length = 1;
2961			dtlck->index++;
2962		}
2963	}
2964
2965	DT_PUTPAGE(mp);
2966	(void) txCommit(tid, 1, &inode, 0);
2967end:
2968	txEnd(tid);
2969}
2970
2971/*
2972 * Buffer to hold directory entry info while traversing a dtree page
2973 * before being fed to the filldir function
2974 */
2975struct jfs_dirent {
2976	loff_t position;
2977	int ino;
2978	u16 name_len;
2979	char name[0];
2980};
2981
2982/*
2983 * function to determine next variable-sized jfs_dirent in buffer
2984 */
2985static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2986{
2987	return (struct jfs_dirent *)
2988		((char *)dirent +
2989		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2990		   sizeof (loff_t) - 1) &
2991		  ~(sizeof (loff_t) - 1)));
2992}
2993
2994/*
2995 *	jfs_readdir()
2996 *
2997 * function: read directory entries sequentially
2998 *	from the specified entry offset
2999 *
3000 * parameter:
3001 *
3002 * return: offset = (pn, index) of start entry
3003 *	of next jfs_readdir()/dtRead()
3004 */
3005int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3006{
3007	struct inode *ip = filp->f_path.dentry->d_inode;
3008	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
3009	int rc = 0;
3010	loff_t dtpos;	/* legacy OS/2 style position */
3011	struct dtoffset {
3012		s16 pn;
3013		s16 index;
3014		s32 unused;
3015	} *dtoffset = (struct dtoffset *) &dtpos;
3016	s64 bn;
3017	struct metapage *mp;
3018	dtpage_t *p;
3019	int index;
3020	s8 *stbl;
3021	struct btstack btstack;
3022	int i, next;
3023	struct ldtentry *d;
3024	struct dtslot *t;
3025	int d_namleft, len, outlen;
3026	unsigned long dirent_buf;
3027	char *name_ptr;
3028	u32 dir_index;
3029	int do_index = 0;
3030	uint loop_count = 0;
3031	struct jfs_dirent *jfs_dirent;
3032	int jfs_dirents;
3033	int overflow, fix_page, page_fixed = 0;
3034	static int unique_pos = 2;	/* If we can't fix broken index */
3035
3036	if (filp->f_pos == DIREND)
3037		return 0;
3038
3039	if (DO_INDEX(ip)) {
3040		/*
3041		 * persistent index is stored in directory entries.
3042		 * Special cases:	 0 = .
3043		 *			 1 = ..
3044		 *			-1 = End of directory
3045		 */
3046		do_index = 1;
3047
3048		dir_index = (u32) filp->f_pos;
 
 
 
 
 
 
 
 
3049
3050		if (dir_index > 1) {
3051			struct dir_table_slot dirtab_slot;
3052
3053			if (dtEmpty(ip) ||
3054			    (dir_index >= JFS_IP(ip)->next_index)) {
3055				/* Stale position.  Directory has shrunk */
3056				filp->f_pos = DIREND;
3057				return 0;
3058			}
3059		      repeat:
3060			rc = read_index(ip, dir_index, &dirtab_slot);
3061			if (rc) {
3062				filp->f_pos = DIREND;
3063				return rc;
3064			}
3065			if (dirtab_slot.flag == DIR_INDEX_FREE) {
3066				if (loop_count++ > JFS_IP(ip)->next_index) {
3067					jfs_err("jfs_readdir detected "
3068						   "infinite loop!");
3069					filp->f_pos = DIREND;
3070					return 0;
3071				}
3072				dir_index = le32_to_cpu(dirtab_slot.addr2);
3073				if (dir_index == -1) {
3074					filp->f_pos = DIREND;
3075					return 0;
3076				}
3077				goto repeat;
3078			}
3079			bn = addressDTS(&dirtab_slot);
3080			index = dirtab_slot.slot;
3081			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3082			if (rc) {
3083				filp->f_pos = DIREND;
3084				return 0;
3085			}
3086			if (p->header.flag & BT_INTERNAL) {
3087				jfs_err("jfs_readdir: bad index table");
3088				DT_PUTPAGE(mp);
3089				filp->f_pos = -1;
3090				return 0;
3091			}
3092		} else {
3093			if (dir_index == 0) {
3094				/*
3095				 * self "."
3096				 */
3097				filp->f_pos = 0;
3098				if (filldir(dirent, ".", 1, 0, ip->i_ino,
3099					    DT_DIR))
3100					return 0;
3101			}
3102			/*
3103			 * parent ".."
3104			 */
3105			filp->f_pos = 1;
3106			if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
3107				return 0;
3108
3109			/*
3110			 * Find first entry of left-most leaf
3111			 */
3112			if (dtEmpty(ip)) {
3113				filp->f_pos = DIREND;
3114				return 0;
3115			}
3116
3117			if ((rc = dtReadFirst(ip, &btstack)))
3118				return rc;
3119
3120			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3121		}
3122	} else {
3123		/*
3124		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3125		 *
3126		 * pn = index = 0:	First entry "."
3127		 * pn = 0; index = 1:	Second entry ".."
3128		 * pn > 0:		Real entries, pn=1 -> leftmost page
3129		 * pn = index = -1:	No more entries
3130		 */
3131		dtpos = filp->f_pos;
3132		if (dtpos == 0) {
3133			/* build "." entry */
3134
3135			if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
3136				    DT_DIR))
3137				return 0;
3138			dtoffset->index = 1;
3139			filp->f_pos = dtpos;
3140		}
3141
3142		if (dtoffset->pn == 0) {
3143			if (dtoffset->index == 1) {
3144				/* build ".." entry */
3145
3146				if (filldir(dirent, "..", 2, filp->f_pos,
3147					    PARENT(ip), DT_DIR))
3148					return 0;
3149			} else {
3150				jfs_err("jfs_readdir called with "
3151					"invalid offset!");
3152			}
3153			dtoffset->pn = 1;
3154			dtoffset->index = 0;
3155			filp->f_pos = dtpos;
3156		}
3157
3158		if (dtEmpty(ip)) {
3159			filp->f_pos = DIREND;
3160			return 0;
3161		}
3162
3163		if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
3164			jfs_err("jfs_readdir: unexpected rc = %d "
3165				"from dtReadNext", rc);
3166			filp->f_pos = DIREND;
3167			return 0;
3168		}
3169		/* get start leaf page and index */
3170		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3171
3172		/* offset beyond directory eof ? */
3173		if (bn < 0) {
3174			filp->f_pos = DIREND;
3175			return 0;
3176		}
3177	}
3178
3179	dirent_buf = __get_free_page(GFP_KERNEL);
3180	if (dirent_buf == 0) {
3181		DT_PUTPAGE(mp);
3182		jfs_warn("jfs_readdir: __get_free_page failed!");
3183		filp->f_pos = DIREND;
3184		return -ENOMEM;
3185	}
3186
3187	while (1) {
3188		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3189		jfs_dirents = 0;
3190		overflow = fix_page = 0;
3191
3192		stbl = DT_GETSTBL(p);
3193
3194		for (i = index; i < p->header.nextindex; i++) {
3195			d = (struct ldtentry *) & p->slot[stbl[i]];
3196
3197			if (((long) jfs_dirent + d->namlen + 1) >
3198			    (dirent_buf + PAGE_SIZE)) {
3199				/* DBCS codepages could overrun dirent_buf */
3200				index = i;
3201				overflow = 1;
3202				break;
3203			}
3204
3205			d_namleft = d->namlen;
3206			name_ptr = jfs_dirent->name;
3207			jfs_dirent->ino = le32_to_cpu(d->inumber);
3208
3209			if (do_index) {
3210				len = min(d_namleft, DTLHDRDATALEN);
3211				jfs_dirent->position = le32_to_cpu(d->index);
3212				/*
3213				 * d->index should always be valid, but it
3214				 * isn't.  fsck.jfs doesn't create the
3215				 * directory index for the lost+found
3216				 * directory.  Rather than let it go,
3217				 * we can try to fix it.
3218				 */
3219				if ((jfs_dirent->position < 2) ||
3220				    (jfs_dirent->position >=
3221				     JFS_IP(ip)->next_index)) {
3222					if (!page_fixed && !isReadOnly(ip)) {
3223						fix_page = 1;
3224						/*
3225						 * setting overflow and setting
3226						 * index to i will cause the
3227						 * same page to be processed
3228						 * again starting here
3229						 */
3230						overflow = 1;
3231						index = i;
3232						break;
3233					}
3234					jfs_dirent->position = unique_pos++;
3235				}
 
 
 
 
 
 
3236			} else {
3237				jfs_dirent->position = dtpos;
3238				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
3239			}
3240
3241			/* copy the name of head/only segment */
3242			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
3243						   codepage);
3244			jfs_dirent->name_len = outlen;
3245
3246			/* copy name in the additional segment(s) */
3247			next = d->next;
3248			while (next >= 0) {
3249				t = (struct dtslot *) & p->slot[next];
3250				name_ptr += outlen;
3251				d_namleft -= len;
3252				/* Sanity Check */
3253				if (d_namleft == 0) {
3254					jfs_error(ip->i_sb,
3255						  "JFS:Dtree error: ino = "
3256						  "%ld, bn=%Ld, index = %d",
3257						  (long)ip->i_ino,
3258						  (long long)bn,
3259						  i);
3260					goto skip_one;
3261				}
3262				len = min(d_namleft, DTSLOTDATALEN);
3263				outlen = jfs_strfromUCS_le(name_ptr, t->name,
3264							   len, codepage);
3265				jfs_dirent->name_len += outlen;
3266
3267				next = t->next;
3268			}
3269
3270			jfs_dirents++;
3271			jfs_dirent = next_jfs_dirent(jfs_dirent);
3272skip_one:
3273			if (!do_index)
3274				dtoffset->index++;
3275		}
3276
3277		if (!overflow) {
3278			/* Point to next leaf page */
3279			if (p->header.flag & BT_ROOT)
3280				bn = 0;
3281			else {
3282				bn = le64_to_cpu(p->header.next);
3283				index = 0;
3284				/* update offset (pn:index) for new page */
3285				if (!do_index) {
3286					dtoffset->pn++;
3287					dtoffset->index = 0;
3288				}
3289			}
3290			page_fixed = 0;
3291		}
3292
3293		/* unpin previous leaf page */
3294		DT_PUTPAGE(mp);
3295
3296		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3297		while (jfs_dirents--) {
3298			filp->f_pos = jfs_dirent->position;
3299			if (filldir(dirent, jfs_dirent->name,
3300				    jfs_dirent->name_len, filp->f_pos,
3301				    jfs_dirent->ino, DT_UNKNOWN))
3302				goto out;
3303			jfs_dirent = next_jfs_dirent(jfs_dirent);
3304		}
3305
3306		if (fix_page) {
3307			add_missing_indices(ip, bn);
3308			page_fixed = 1;
3309		}
3310
3311		if (!overflow && (bn == 0)) {
3312			filp->f_pos = DIREND;
3313			break;
3314		}
3315
3316		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3317		if (rc) {
3318			free_page(dirent_buf);
3319			return rc;
3320		}
3321	}
3322
3323      out:
3324	free_page(dirent_buf);
3325
3326	return rc;
3327}
3328
3329
3330/*
3331 *	dtReadFirst()
3332 *
3333 * function: get the leftmost page of the directory
3334 */
3335static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3336{
3337	int rc = 0;
3338	s64 bn;
3339	int psize = 288;	/* initial in-line directory */
3340	struct metapage *mp;
3341	dtpage_t *p;
3342	s8 *stbl;
3343	struct btframe *btsp;
3344	pxd_t *xd;
3345
3346	BT_CLR(btstack);	/* reset stack */
3347
3348	/*
3349	 *	descend leftmost path of the tree
3350	 *
3351	 * by convention, root bn = 0.
3352	 */
3353	for (bn = 0;;) {
3354		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3355		if (rc)
3356			return rc;
3357
3358		/*
3359		 * leftmost leaf page
3360		 */
3361		if (p->header.flag & BT_LEAF) {
3362			/* return leftmost entry */
3363			btsp = btstack->top;
3364			btsp->bn = bn;
3365			btsp->index = 0;
3366			btsp->mp = mp;
3367
3368			return 0;
3369		}
3370
3371		/*
3372		 * descend down to leftmost child page
3373		 */
3374		if (BT_STACK_FULL(btstack)) {
3375			DT_PUTPAGE(mp);
3376			jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
3377			BT_STACK_DUMP(btstack);
3378			return -EIO;
3379		}
3380		/* push (bn, index) of the parent page/entry */
3381		BT_PUSH(btstack, bn, 0);
3382
3383		/* get the leftmost entry */
3384		stbl = DT_GETSTBL(p);
3385		xd = (pxd_t *) & p->slot[stbl[0]];
3386
3387		/* get the child page block address */
3388		bn = addressPXD(xd);
3389		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3390
3391		/* unpin the parent page */
3392		DT_PUTPAGE(mp);
3393	}
3394}
3395
3396
3397/*
3398 *	dtReadNext()
3399 *
3400 * function: get the page of the specified offset (pn:index)
3401 *
3402 * return: if (offset > eof), bn = -1;
3403 *
3404 * note: if index > nextindex of the target leaf page,
3405 * start with 1st entry of next leaf page;
3406 */
3407static int dtReadNext(struct inode *ip, loff_t * offset,
3408		      struct btstack * btstack)
3409{
3410	int rc = 0;
3411	struct dtoffset {
3412		s16 pn;
3413		s16 index;
3414		s32 unused;
3415	} *dtoffset = (struct dtoffset *) offset;
3416	s64 bn;
3417	struct metapage *mp;
3418	dtpage_t *p;
3419	int index;
3420	int pn;
3421	s8 *stbl;
3422	struct btframe *btsp, *parent;
3423	pxd_t *xd;
3424
3425	/*
3426	 * get leftmost leaf page pinned
3427	 */
3428	if ((rc = dtReadFirst(ip, btstack)))
3429		return rc;
3430
3431	/* get leaf page */
3432	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3433
3434	/* get the start offset (pn:index) */
3435	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3436	index = dtoffset->index;
3437
3438	/* start at leftmost page ? */
3439	if (pn == 0) {
3440		/* offset beyond eof ? */
3441		if (index < p->header.nextindex)
3442			goto out;
3443
3444		if (p->header.flag & BT_ROOT) {
3445			bn = -1;
3446			goto out;
3447		}
3448
3449		/* start with 1st entry of next leaf page */
3450		dtoffset->pn++;
3451		dtoffset->index = index = 0;
3452		goto a;
3453	}
3454
3455	/* start at non-leftmost page: scan parent pages for large pn */
3456	if (p->header.flag & BT_ROOT) {
3457		bn = -1;
3458		goto out;
3459	}
3460
3461	/* start after next leaf page ? */
3462	if (pn > 1)
3463		goto b;
3464
3465	/* get leaf page pn = 1 */
3466      a:
3467	bn = le64_to_cpu(p->header.next);
3468
3469	/* unpin leaf page */
3470	DT_PUTPAGE(mp);
3471
3472	/* offset beyond eof ? */
3473	if (bn == 0) {
3474		bn = -1;
3475		goto out;
3476	}
3477
3478	goto c;
3479
3480	/*
3481	 * scan last internal page level to get target leaf page
3482	 */
3483      b:
3484	/* unpin leftmost leaf page */
3485	DT_PUTPAGE(mp);
3486
3487	/* get left most parent page */
3488	btsp = btstack->top;
3489	parent = btsp - 1;
3490	bn = parent->bn;
3491	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3492	if (rc)
3493		return rc;
3494
3495	/* scan parent pages at last internal page level */
3496	while (pn >= p->header.nextindex) {
3497		pn -= p->header.nextindex;
3498
3499		/* get next parent page address */
3500		bn = le64_to_cpu(p->header.next);
3501
3502		/* unpin current parent page */
3503		DT_PUTPAGE(mp);
3504
3505		/* offset beyond eof ? */
3506		if (bn == 0) {
3507			bn = -1;
3508			goto out;
3509		}
3510
3511		/* get next parent page */
3512		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3513		if (rc)
3514			return rc;
3515
3516		/* update parent page stack frame */
3517		parent->bn = bn;
3518	}
3519
3520	/* get leaf page address */
3521	stbl = DT_GETSTBL(p);
3522	xd = (pxd_t *) & p->slot[stbl[pn]];
3523	bn = addressPXD(xd);
3524
3525	/* unpin parent page */
3526	DT_PUTPAGE(mp);
3527
3528	/*
3529	 * get target leaf page
3530	 */
3531      c:
3532	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3533	if (rc)
3534		return rc;
3535
3536	/*
3537	 * leaf page has been completed:
3538	 * start with 1st entry of next leaf page
3539	 */
3540	if (index >= p->header.nextindex) {
3541		bn = le64_to_cpu(p->header.next);
3542
3543		/* unpin leaf page */
3544		DT_PUTPAGE(mp);
3545
3546		/* offset beyond eof ? */
3547		if (bn == 0) {
3548			bn = -1;
3549			goto out;
3550		}
3551
3552		/* get next leaf page */
3553		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3554		if (rc)
3555			return rc;
3556
3557		/* start with 1st entry of next leaf page */
3558		dtoffset->pn++;
3559		dtoffset->index = 0;
3560	}
3561
3562      out:
3563	/* return target leaf page pinned */
3564	btsp = btstack->top;
3565	btsp->bn = bn;
3566	btsp->index = dtoffset->index;
3567	btsp->mp = mp;
3568
3569	return 0;
3570}
3571
3572
3573/*
3574 *	dtCompare()
3575 *
3576 * function: compare search key with an internal entry
3577 *
3578 * return:
3579 *	< 0 if k is < record
3580 *	= 0 if k is = record
3581 *	> 0 if k is > record
3582 */
3583static int dtCompare(struct component_name * key,	/* search key */
3584		     dtpage_t * p,	/* directory page */
3585		     int si)
3586{				/* entry slot index */
3587	wchar_t *kname;
3588	__le16 *name;
3589	int klen, namlen, len, rc;
3590	struct idtentry *ih;
3591	struct dtslot *t;
3592
3593	/*
3594	 * force the left-most key on internal pages, at any level of
3595	 * the tree, to be less than any search key.
3596	 * this obviates having to update the leftmost key on an internal
3597	 * page when the user inserts a new key in the tree smaller than
3598	 * anything that has been stored.
3599	 *
3600	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3601	 * at any internal page at any level of the tree,
3602	 * it descends to child of the entry anyway -
3603	 * ? make the entry as min size dummy entry)
3604	 *
3605	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3606	 * return (1);
3607	 */
3608
3609	kname = key->name;
3610	klen = key->namlen;
3611
3612	ih = (struct idtentry *) & p->slot[si];
3613	si = ih->next;
3614	name = ih->name;
3615	namlen = ih->namlen;
3616	len = min(namlen, DTIHDRDATALEN);
3617
3618	/* compare with head/only segment */
3619	len = min(klen, len);
3620	if ((rc = UniStrncmp_le(kname, name, len)))
3621		return rc;
3622
3623	klen -= len;
3624	namlen -= len;
3625
3626	/* compare with additional segment(s) */
3627	kname += len;
3628	while (klen > 0 && namlen > 0) {
3629		/* compare with next name segment */
3630		t = (struct dtslot *) & p->slot[si];
3631		len = min(namlen, DTSLOTDATALEN);
3632		len = min(klen, len);
3633		name = t->name;
3634		if ((rc = UniStrncmp_le(kname, name, len)))
3635			return rc;
3636
3637		klen -= len;
3638		namlen -= len;
3639		kname += len;
3640		si = t->next;
3641	}
3642
3643	return (klen - namlen);
3644}
3645
3646
3647
3648
3649/*
3650 *	ciCompare()
3651 *
3652 * function: compare search key with an (leaf/internal) entry
3653 *
3654 * return:
3655 *	< 0 if k is < record
3656 *	= 0 if k is = record
3657 *	> 0 if k is > record
3658 */
3659static int ciCompare(struct component_name * key,	/* search key */
3660		     dtpage_t * p,	/* directory page */
3661		     int si,	/* entry slot index */
3662		     int flag)
3663{
3664	wchar_t *kname, x;
3665	__le16 *name;
3666	int klen, namlen, len, rc;
3667	struct ldtentry *lh;
3668	struct idtentry *ih;
3669	struct dtslot *t;
3670	int i;
3671
3672	/*
3673	 * force the left-most key on internal pages, at any level of
3674	 * the tree, to be less than any search key.
3675	 * this obviates having to update the leftmost key on an internal
3676	 * page when the user inserts a new key in the tree smaller than
3677	 * anything that has been stored.
3678	 *
3679	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3680	 * at any internal page at any level of the tree,
3681	 * it descends to child of the entry anyway -
3682	 * ? make the entry as min size dummy entry)
3683	 *
3684	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3685	 * return (1);
3686	 */
3687
3688	kname = key->name;
3689	klen = key->namlen;
3690
3691	/*
3692	 * leaf page entry
3693	 */
3694	if (p->header.flag & BT_LEAF) {
3695		lh = (struct ldtentry *) & p->slot[si];
3696		si = lh->next;
3697		name = lh->name;
3698		namlen = lh->namlen;
3699		if (flag & JFS_DIR_INDEX)
3700			len = min(namlen, DTLHDRDATALEN);
3701		else
3702			len = min(namlen, DTLHDRDATALEN_LEGACY);
3703	}
3704	/*
3705	 * internal page entry
3706	 */
3707	else {
3708		ih = (struct idtentry *) & p->slot[si];
3709		si = ih->next;
3710		name = ih->name;
3711		namlen = ih->namlen;
3712		len = min(namlen, DTIHDRDATALEN);
3713	}
3714
3715	/* compare with head/only segment */
3716	len = min(klen, len);
3717	for (i = 0; i < len; i++, kname++, name++) {
3718		/* only uppercase if case-insensitive support is on */
3719		if ((flag & JFS_OS2) == JFS_OS2)
3720			x = UniToupper(le16_to_cpu(*name));
3721		else
3722			x = le16_to_cpu(*name);
3723		if ((rc = *kname - x))
3724			return rc;
3725	}
3726
3727	klen -= len;
3728	namlen -= len;
3729
3730	/* compare with additional segment(s) */
3731	while (klen > 0 && namlen > 0) {
3732		/* compare with next name segment */
3733		t = (struct dtslot *) & p->slot[si];
3734		len = min(namlen, DTSLOTDATALEN);
3735		len = min(klen, len);
3736		name = t->name;
3737		for (i = 0; i < len; i++, kname++, name++) {
3738			/* only uppercase if case-insensitive support is on */
3739			if ((flag & JFS_OS2) == JFS_OS2)
3740				x = UniToupper(le16_to_cpu(*name));
3741			else
3742				x = le16_to_cpu(*name);
3743
3744			if ((rc = *kname - x))
3745				return rc;
3746		}
3747
3748		klen -= len;
3749		namlen -= len;
3750		si = t->next;
3751	}
3752
3753	return (klen - namlen);
3754}
3755
3756
3757/*
3758 *	ciGetLeafPrefixKey()
3759 *
3760 * function: compute prefix of suffix compression
3761 *	     from two adjacent leaf entries
3762 *	     across page boundary
3763 *
3764 * return: non-zero on error
3765 *
3766 */
3767static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3768			       int ri, struct component_name * key, int flag)
3769{
3770	int klen, namlen;
3771	wchar_t *pl, *pr, *kname;
3772	struct component_name lkey;
3773	struct component_name rkey;
3774
3775	lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3776					GFP_KERNEL);
3777	if (lkey.name == NULL)
3778		return -ENOMEM;
3779
3780	rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3781					GFP_KERNEL);
3782	if (rkey.name == NULL) {
3783		kfree(lkey.name);
3784		return -ENOMEM;
3785	}
3786
3787	/* get left and right key */
3788	dtGetKey(lp, li, &lkey, flag);
3789	lkey.name[lkey.namlen] = 0;
3790
3791	if ((flag & JFS_OS2) == JFS_OS2)
3792		ciToUpper(&lkey);
3793
3794	dtGetKey(rp, ri, &rkey, flag);
3795	rkey.name[rkey.namlen] = 0;
3796
3797
3798	if ((flag & JFS_OS2) == JFS_OS2)
3799		ciToUpper(&rkey);
3800
3801	/* compute prefix */
3802	klen = 0;
3803	kname = key->name;
3804	namlen = min(lkey.namlen, rkey.namlen);
3805	for (pl = lkey.name, pr = rkey.name;
3806	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3807		*kname = *pr;
3808		if (*pl != *pr) {
3809			key->namlen = klen + 1;
3810			goto free_names;
3811		}
3812	}
3813
3814	/* l->namlen <= r->namlen since l <= r */
3815	if (lkey.namlen < rkey.namlen) {
3816		*kname = *pr;
3817		key->namlen = klen + 1;
3818	} else			/* l->namelen == r->namelen */
3819		key->namlen = klen;
3820
3821free_names:
3822	kfree(lkey.name);
3823	kfree(rkey.name);
3824	return 0;
3825}
3826
3827
3828
3829/*
3830 *	dtGetKey()
3831 *
3832 * function: get key of the entry
3833 */
3834static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3835		     struct component_name * key, int flag)
3836{
3837	int si;
3838	s8 *stbl;
3839	struct ldtentry *lh;
3840	struct idtentry *ih;
3841	struct dtslot *t;
3842	int namlen, len;
3843	wchar_t *kname;
3844	__le16 *name;
3845
3846	/* get entry */
3847	stbl = DT_GETSTBL(p);
3848	si = stbl[i];
3849	if (p->header.flag & BT_LEAF) {
3850		lh = (struct ldtentry *) & p->slot[si];
3851		si = lh->next;
3852		namlen = lh->namlen;
3853		name = lh->name;
3854		if (flag & JFS_DIR_INDEX)
3855			len = min(namlen, DTLHDRDATALEN);
3856		else
3857			len = min(namlen, DTLHDRDATALEN_LEGACY);
3858	} else {
3859		ih = (struct idtentry *) & p->slot[si];
3860		si = ih->next;
3861		namlen = ih->namlen;
3862		name = ih->name;
3863		len = min(namlen, DTIHDRDATALEN);
3864	}
3865
3866	key->namlen = namlen;
3867	kname = key->name;
3868
3869	/*
3870	 * move head/only segment
3871	 */
3872	UniStrncpy_from_le(kname, name, len);
3873
3874	/*
3875	 * move additional segment(s)
3876	 */
3877	while (si >= 0) {
3878		/* get next segment */
3879		t = &p->slot[si];
3880		kname += len;
3881		namlen -= len;
3882		len = min(namlen, DTSLOTDATALEN);
3883		UniStrncpy_from_le(kname, t->name, len);
3884
3885		si = t->next;
3886	}
3887}
3888
3889
3890/*
3891 *	dtInsertEntry()
3892 *
3893 * function: allocate free slot(s) and
3894 *	     write a leaf/internal entry
3895 *
3896 * return: entry slot index
3897 */
3898static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3899			  ddata_t * data, struct dt_lock ** dtlock)
3900{
3901	struct dtslot *h, *t;
3902	struct ldtentry *lh = NULL;
3903	struct idtentry *ih = NULL;
3904	int hsi, fsi, klen, len, nextindex;
3905	wchar_t *kname;
3906	__le16 *name;
3907	s8 *stbl;
3908	pxd_t *xd;
3909	struct dt_lock *dtlck = *dtlock;
3910	struct lv *lv;
3911	int xsi, n;
3912	s64 bn = 0;
3913	struct metapage *mp = NULL;
3914
3915	klen = key->namlen;
3916	kname = key->name;
3917
3918	/* allocate a free slot */
3919	hsi = fsi = p->header.freelist;
3920	h = &p->slot[fsi];
3921	p->header.freelist = h->next;
3922	--p->header.freecnt;
3923
3924	/* open new linelock */
3925	if (dtlck->index >= dtlck->maxcnt)
3926		dtlck = (struct dt_lock *) txLinelock(dtlck);
3927
3928	lv = & dtlck->lv[dtlck->index];
3929	lv->offset = hsi;
3930
3931	/* write head/only segment */
3932	if (p->header.flag & BT_LEAF) {
3933		lh = (struct ldtentry *) h;
3934		lh->next = h->next;
3935		lh->inumber = cpu_to_le32(data->leaf.ino);
3936		lh->namlen = klen;
3937		name = lh->name;
3938		if (data->leaf.ip) {
3939			len = min(klen, DTLHDRDATALEN);
3940			if (!(p->header.flag & BT_ROOT))
3941				bn = addressPXD(&p->header.self);
3942			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3943							  data->leaf.ip,
3944							  bn, index));
3945		} else
3946			len = min(klen, DTLHDRDATALEN_LEGACY);
3947	} else {
3948		ih = (struct idtentry *) h;
3949		ih->next = h->next;
3950		xd = (pxd_t *) ih;
3951		*xd = data->xd;
3952		ih->namlen = klen;
3953		name = ih->name;
3954		len = min(klen, DTIHDRDATALEN);
3955	}
3956
3957	UniStrncpy_to_le(name, kname, len);
3958
3959	n = 1;
3960	xsi = hsi;
3961
3962	/* write additional segment(s) */
3963	t = h;
3964	klen -= len;
3965	while (klen) {
3966		/* get free slot */
3967		fsi = p->header.freelist;
3968		t = &p->slot[fsi];
3969		p->header.freelist = t->next;
3970		--p->header.freecnt;
3971
3972		/* is next slot contiguous ? */
3973		if (fsi != xsi + 1) {
3974			/* close current linelock */
3975			lv->length = n;
3976			dtlck->index++;
3977
3978			/* open new linelock */
3979			if (dtlck->index < dtlck->maxcnt)
3980				lv++;
3981			else {
3982				dtlck = (struct dt_lock *) txLinelock(dtlck);
3983				lv = & dtlck->lv[0];
3984			}
3985
3986			lv->offset = fsi;
3987			n = 0;
3988		}
3989
3990		kname += len;
3991		len = min(klen, DTSLOTDATALEN);
3992		UniStrncpy_to_le(t->name, kname, len);
3993
3994		n++;
3995		xsi = fsi;
3996		klen -= len;
3997	}
3998
3999	/* close current linelock */
4000	lv->length = n;
4001	dtlck->index++;
4002
4003	*dtlock = dtlck;
4004
4005	/* terminate last/only segment */
4006	if (h == t) {
4007		/* single segment entry */
4008		if (p->header.flag & BT_LEAF)
4009			lh->next = -1;
4010		else
4011			ih->next = -1;
4012	} else
4013		/* multi-segment entry */
4014		t->next = -1;
4015
4016	/* if insert into middle, shift right succeeding entries in stbl */
4017	stbl = DT_GETSTBL(p);
4018	nextindex = p->header.nextindex;
4019	if (index < nextindex) {
4020		memmove(stbl + index + 1, stbl + index, nextindex - index);
4021
4022		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
4023			s64 lblock;
4024
4025			/*
4026			 * Need to update slot number for entries that moved
4027			 * in the stbl
4028			 */
4029			mp = NULL;
4030			for (n = index + 1; n <= nextindex; n++) {
4031				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
4032				modify_index(data->leaf.tid, data->leaf.ip,
4033					     le32_to_cpu(lh->index), bn, n,
4034					     &mp, &lblock);
4035			}
4036			if (mp)
4037				release_metapage(mp);
4038		}
4039	}
4040
4041	stbl[index] = hsi;
4042
4043	/* advance next available entry index of stbl */
4044	++p->header.nextindex;
4045}
4046
4047
4048/*
4049 *	dtMoveEntry()
4050 *
4051 * function: move entries from split/left page to new/right page
4052 *
4053 *	nextindex of dst page and freelist/freecnt of both pages
4054 *	are updated.
4055 */
4056static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
4057			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
4058			int do_index)
4059{
4060	int ssi, next;		/* src slot index */
4061	int di;			/* dst entry index */
4062	int dsi;		/* dst slot index */
4063	s8 *sstbl, *dstbl;	/* sorted entry table */
4064	int snamlen, len;
4065	struct ldtentry *slh, *dlh = NULL;
4066	struct idtentry *sih, *dih = NULL;
4067	struct dtslot *h, *s, *d;
4068	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
4069	struct lv *slv, *dlv;
4070	int xssi, ns, nd;
4071	int sfsi;
4072
4073	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
4074	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
4075
4076	dsi = dp->header.freelist;	/* first (whole page) free slot */
4077	sfsi = sp->header.freelist;
4078
4079	/* linelock destination entry slot */
4080	dlv = & ddtlck->lv[ddtlck->index];
4081	dlv->offset = dsi;
4082
4083	/* linelock source entry slot */
4084	slv = & sdtlck->lv[sdtlck->index];
4085	slv->offset = sstbl[si];
4086	xssi = slv->offset - 1;
4087
4088	/*
4089	 * move entries
4090	 */
4091	ns = nd = 0;
4092	for (di = 0; si < sp->header.nextindex; si++, di++) {
4093		ssi = sstbl[si];
4094		dstbl[di] = dsi;
4095
4096		/* is next slot contiguous ? */
4097		if (ssi != xssi + 1) {
4098			/* close current linelock */
4099			slv->length = ns;
4100			sdtlck->index++;
4101
4102			/* open new linelock */
4103			if (sdtlck->index < sdtlck->maxcnt)
4104				slv++;
4105			else {
4106				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
4107				slv = & sdtlck->lv[0];
4108			}
4109
4110			slv->offset = ssi;
4111			ns = 0;
4112		}
4113
4114		/*
4115		 * move head/only segment of an entry
4116		 */
4117		/* get dst slot */
4118		h = d = &dp->slot[dsi];
4119
4120		/* get src slot and move */
4121		s = &sp->slot[ssi];
4122		if (sp->header.flag & BT_LEAF) {
4123			/* get source entry */
4124			slh = (struct ldtentry *) s;
4125			dlh = (struct ldtentry *) h;
4126			snamlen = slh->namlen;
4127
4128			if (do_index) {
4129				len = min(snamlen, DTLHDRDATALEN);
4130				dlh->index = slh->index; /* little-endian */
4131			} else
4132				len = min(snamlen, DTLHDRDATALEN_LEGACY);
4133
4134			memcpy(dlh, slh, 6 + len * 2);
4135
4136			next = slh->next;
4137
4138			/* update dst head/only segment next field */
4139			dsi++;
4140			dlh->next = dsi;
4141		} else {
4142			sih = (struct idtentry *) s;
4143			snamlen = sih->namlen;
4144
4145			len = min(snamlen, DTIHDRDATALEN);
4146			dih = (struct idtentry *) h;
4147			memcpy(dih, sih, 10 + len * 2);
4148			next = sih->next;
4149
4150			dsi++;
4151			dih->next = dsi;
4152		}
4153
4154		/* free src head/only segment */
4155		s->next = sfsi;
4156		s->cnt = 1;
4157		sfsi = ssi;
4158
4159		ns++;
4160		nd++;
4161		xssi = ssi;
4162
4163		/*
4164		 * move additional segment(s) of the entry
4165		 */
4166		snamlen -= len;
4167		while ((ssi = next) >= 0) {
4168			/* is next slot contiguous ? */
4169			if (ssi != xssi + 1) {
4170				/* close current linelock */
4171				slv->length = ns;
4172				sdtlck->index++;
4173
4174				/* open new linelock */
4175				if (sdtlck->index < sdtlck->maxcnt)
4176					slv++;
4177				else {
4178					sdtlck =
4179					    (struct dt_lock *)
4180					    txLinelock(sdtlck);
4181					slv = & sdtlck->lv[0];
4182				}
4183
4184				slv->offset = ssi;
4185				ns = 0;
4186			}
4187
4188			/* get next source segment */
4189			s = &sp->slot[ssi];
4190
4191			/* get next destination free slot */
4192			d++;
4193
4194			len = min(snamlen, DTSLOTDATALEN);
4195			UniStrncpy_le(d->name, s->name, len);
4196
4197			ns++;
4198			nd++;
4199			xssi = ssi;
4200
4201			dsi++;
4202			d->next = dsi;
4203
4204			/* free source segment */
4205			next = s->next;
4206			s->next = sfsi;
4207			s->cnt = 1;
4208			sfsi = ssi;
4209
4210			snamlen -= len;
4211		}		/* end while */
4212
4213		/* terminate dst last/only segment */
4214		if (h == d) {
4215			/* single segment entry */
4216			if (dp->header.flag & BT_LEAF)
4217				dlh->next = -1;
4218			else
4219				dih->next = -1;
4220		} else
4221			/* multi-segment entry */
4222			d->next = -1;
4223	}			/* end for */
4224
4225	/* close current linelock */
4226	slv->length = ns;
4227	sdtlck->index++;
4228	*sdtlock = sdtlck;
4229
4230	dlv->length = nd;
4231	ddtlck->index++;
4232	*ddtlock = ddtlck;
4233
4234	/* update source header */
4235	sp->header.freelist = sfsi;
4236	sp->header.freecnt += nd;
4237
4238	/* update destination header */
4239	dp->header.nextindex = di;
4240
4241	dp->header.freelist = dsi;
4242	dp->header.freecnt -= nd;
4243}
4244
4245
4246/*
4247 *	dtDeleteEntry()
4248 *
4249 * function: free a (leaf/internal) entry
4250 *
4251 * log freelist header, stbl, and each segment slot of entry
4252 * (even though last/only segment next field is modified,
4253 * physical image logging requires all segment slots of
4254 * the entry logged to avoid applying previous updates
4255 * to the same slots)
4256 */
4257static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
4258{
4259	int fsi;		/* free entry slot index */
4260	s8 *stbl;
4261	struct dtslot *t;
4262	int si, freecnt;
4263	struct dt_lock *dtlck = *dtlock;
4264	struct lv *lv;
4265	int xsi, n;
4266
4267	/* get free entry slot index */
4268	stbl = DT_GETSTBL(p);
4269	fsi = stbl[fi];
4270
4271	/* open new linelock */
4272	if (dtlck->index >= dtlck->maxcnt)
4273		dtlck = (struct dt_lock *) txLinelock(dtlck);
4274	lv = & dtlck->lv[dtlck->index];
4275
4276	lv->offset = fsi;
4277
4278	/* get the head/only segment */
4279	t = &p->slot[fsi];
4280	if (p->header.flag & BT_LEAF)
4281		si = ((struct ldtentry *) t)->next;
4282	else
4283		si = ((struct idtentry *) t)->next;
4284	t->next = si;
4285	t->cnt = 1;
4286
4287	n = freecnt = 1;
4288	xsi = fsi;
4289
4290	/* find the last/only segment */
4291	while (si >= 0) {
4292		/* is next slot contiguous ? */
4293		if (si != xsi + 1) {
4294			/* close current linelock */
4295			lv->length = n;
4296			dtlck->index++;
4297
4298			/* open new linelock */
4299			if (dtlck->index < dtlck->maxcnt)
4300				lv++;
4301			else {
4302				dtlck = (struct dt_lock *) txLinelock(dtlck);
4303				lv = & dtlck->lv[0];
4304			}
4305
4306			lv->offset = si;
4307			n = 0;
4308		}
4309
4310		n++;
4311		xsi = si;
4312		freecnt++;
4313
4314		t = &p->slot[si];
4315		t->cnt = 1;
4316		si = t->next;
4317	}
4318
4319	/* close current linelock */
4320	lv->length = n;
4321	dtlck->index++;
4322
4323	*dtlock = dtlck;
4324
4325	/* update freelist */
4326	t->next = p->header.freelist;
4327	p->header.freelist = fsi;
4328	p->header.freecnt += freecnt;
4329
4330	/* if delete from middle,
4331	 * shift left the succedding entries in the stbl
4332	 */
4333	si = p->header.nextindex;
4334	if (fi < si - 1)
4335		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4336
4337	p->header.nextindex--;
4338}
4339
4340
4341/*
4342 *	dtTruncateEntry()
4343 *
4344 * function: truncate a (leaf/internal) entry
4345 *
4346 * log freelist header, stbl, and each segment slot of entry
4347 * (even though last/only segment next field is modified,
4348 * physical image logging requires all segment slots of
4349 * the entry logged to avoid applying previous updates
4350 * to the same slots)
4351 */
4352static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4353{
4354	int tsi;		/* truncate entry slot index */
4355	s8 *stbl;
4356	struct dtslot *t;
4357	int si, freecnt;
4358	struct dt_lock *dtlck = *dtlock;
4359	struct lv *lv;
4360	int fsi, xsi, n;
4361
4362	/* get free entry slot index */
4363	stbl = DT_GETSTBL(p);
4364	tsi = stbl[ti];
4365
4366	/* open new linelock */
4367	if (dtlck->index >= dtlck->maxcnt)
4368		dtlck = (struct dt_lock *) txLinelock(dtlck);
4369	lv = & dtlck->lv[dtlck->index];
4370
4371	lv->offset = tsi;
4372
4373	/* get the head/only segment */
4374	t = &p->slot[tsi];
4375	ASSERT(p->header.flag & BT_INTERNAL);
4376	((struct idtentry *) t)->namlen = 0;
4377	si = ((struct idtentry *) t)->next;
4378	((struct idtentry *) t)->next = -1;
4379
4380	n = 1;
4381	freecnt = 0;
4382	fsi = si;
4383	xsi = tsi;
4384
4385	/* find the last/only segment */
4386	while (si >= 0) {
4387		/* is next slot contiguous ? */
4388		if (si != xsi + 1) {
4389			/* close current linelock */
4390			lv->length = n;
4391			dtlck->index++;
4392
4393			/* open new linelock */
4394			if (dtlck->index < dtlck->maxcnt)
4395				lv++;
4396			else {
4397				dtlck = (struct dt_lock *) txLinelock(dtlck);
4398				lv = & dtlck->lv[0];
4399			}
4400
4401			lv->offset = si;
4402			n = 0;
4403		}
4404
4405		n++;
4406		xsi = si;
4407		freecnt++;
4408
4409		t = &p->slot[si];
4410		t->cnt = 1;
4411		si = t->next;
4412	}
4413
4414	/* close current linelock */
4415	lv->length = n;
4416	dtlck->index++;
4417
4418	*dtlock = dtlck;
4419
4420	/* update freelist */
4421	if (freecnt == 0)
4422		return;
4423	t->next = p->header.freelist;
4424	p->header.freelist = fsi;
4425	p->header.freecnt += freecnt;
4426}
4427
4428
4429/*
4430 *	dtLinelockFreelist()
4431 */
4432static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4433			       int m,	/* max slot index */
4434			       struct dt_lock ** dtlock)
4435{
4436	int fsi;		/* free entry slot index */
4437	struct dtslot *t;
4438	int si;
4439	struct dt_lock *dtlck = *dtlock;
4440	struct lv *lv;
4441	int xsi, n;
4442
4443	/* get free entry slot index */
4444	fsi = p->header.freelist;
4445
4446	/* open new linelock */
4447	if (dtlck->index >= dtlck->maxcnt)
4448		dtlck = (struct dt_lock *) txLinelock(dtlck);
4449	lv = & dtlck->lv[dtlck->index];
4450
4451	lv->offset = fsi;
4452
4453	n = 1;
4454	xsi = fsi;
4455
4456	t = &p->slot[fsi];
4457	si = t->next;
4458
4459	/* find the last/only segment */
4460	while (si < m && si >= 0) {
4461		/* is next slot contiguous ? */
4462		if (si != xsi + 1) {
4463			/* close current linelock */
4464			lv->length = n;
4465			dtlck->index++;
4466
4467			/* open new linelock */
4468			if (dtlck->index < dtlck->maxcnt)
4469				lv++;
4470			else {
4471				dtlck = (struct dt_lock *) txLinelock(dtlck);
4472				lv = & dtlck->lv[0];
4473			}
4474
4475			lv->offset = si;
4476			n = 0;
4477		}
4478
4479		n++;
4480		xsi = si;
4481
4482		t = &p->slot[si];
4483		si = t->next;
4484	}
4485
4486	/* close current linelock */
4487	lv->length = n;
4488	dtlck->index++;
4489
4490	*dtlock = dtlck;
4491}
4492
4493
4494/*
4495 * NAME: dtModify
4496 *
4497 * FUNCTION: Modify the inode number part of a directory entry
4498 *
4499 * PARAMETERS:
4500 *	tid	- Transaction id
4501 *	ip	- Inode of parent directory
4502 *	key	- Name of entry to be modified
4503 *	orig_ino	- Original inode number expected in entry
4504 *	new_ino	- New inode number to put into entry
4505 *	flag	- JFS_RENAME
4506 *
4507 * RETURNS:
4508 *	-ESTALE	- If entry found does not match orig_ino passed in
4509 *	-ENOENT	- If no entry can be found to match key
4510 *	0	- If successfully modified entry
4511 */
4512int dtModify(tid_t tid, struct inode *ip,
4513	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4514{
4515	int rc;
4516	s64 bn;
4517	struct metapage *mp;
4518	dtpage_t *p;
4519	int index;
4520	struct btstack btstack;
4521	struct tlock *tlck;
4522	struct dt_lock *dtlck;
4523	struct lv *lv;
4524	s8 *stbl;
4525	int entry_si;		/* entry slot index */
4526	struct ldtentry *entry;
4527
4528	/*
4529	 *	search for the entry to modify:
4530	 *
4531	 * dtSearch() returns (leaf page pinned, index at which to modify).
4532	 */
4533	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4534		return rc;
4535
4536	/* retrieve search result */
4537	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4538
4539	BT_MARK_DIRTY(mp, ip);
4540	/*
4541	 * acquire a transaction lock on the leaf page of named entry
4542	 */
4543	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4544	dtlck = (struct dt_lock *) & tlck->lock;
4545
4546	/* get slot index of the entry */
4547	stbl = DT_GETSTBL(p);
4548	entry_si = stbl[index];
4549
4550	/* linelock entry */
4551	ASSERT(dtlck->index == 0);
4552	lv = & dtlck->lv[0];
4553	lv->offset = entry_si;
4554	lv->length = 1;
4555	dtlck->index++;
4556
4557	/* get the head/only segment */
4558	entry = (struct ldtentry *) & p->slot[entry_si];
4559
4560	/* substitute the inode number of the entry */
4561	entry->inumber = cpu_to_le32(new_ino);
4562
4563	/* unpin the leaf page */
4564	DT_PUTPAGE(mp);
4565
4566	return 0;
4567}