Linux Audio

Check our new training course

Loading...
v4.17
 
   1/*
   2 *   Copyright (C) International Business Machines Corp., 2000-2004
   3 *
   4 *   This program is free software;  you can redistribute it and/or modify
   5 *   it under the terms of the GNU General Public License as published by
   6 *   the Free Software Foundation; either version 2 of the License, or
   7 *   (at your option) any later version.
   8 *
   9 *   This program is distributed in the hope that it will be useful,
  10 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  12 *   the GNU General Public License for more details.
  13 *
  14 *   You should have received a copy of the GNU General Public License
  15 *   along with this program;  if not, write to the Free Software
  16 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  17 */
  18
  19/*
  20 *	jfs_dtree.c: directory B+-tree manager
  21 *
  22 * B+-tree with variable length key directory:
  23 *
  24 * each directory page is structured as an array of 32-byte
  25 * directory entry slots initialized as a freelist
  26 * to avoid search/compaction of free space at insertion.
  27 * when an entry is inserted, a number of slots are allocated
  28 * from the freelist as required to store variable length data
  29 * of the entry; when the entry is deleted, slots of the entry
  30 * are returned to freelist.
  31 *
  32 * leaf entry stores full name as key and file serial number
  33 * (aka inode number) as data.
  34 * internal/router entry stores sufffix compressed name
  35 * as key and simple extent descriptor as data.
  36 *
  37 * each directory page maintains a sorted entry index table
  38 * which stores the start slot index of sorted entries
  39 * to allow binary search on the table.
  40 *
  41 * directory starts as a root/leaf page in on-disk inode
  42 * inline data area.
  43 * when it becomes full, it starts a leaf of a external extent
  44 * of length of 1 block. each time the first leaf becomes full,
  45 * it is extended rather than split (its size is doubled),
  46 * until its length becoms 4 KBytes, from then the extent is split
  47 * with new 4 Kbyte extent when it becomes full
  48 * to reduce external fragmentation of small directories.
  49 *
  50 * blah, blah, blah, for linear scan of directory in pieces by
  51 * readdir().
  52 *
  53 *
  54 *	case-insensitive directory file system
  55 *
  56 * names are stored in case-sensitive way in leaf entry.
  57 * but stored, searched and compared in case-insensitive (uppercase) order
  58 * (i.e., both search key and entry key are folded for search/compare):
  59 * (note that case-sensitive order is BROKEN in storage, e.g.,
  60 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
  61 *
  62 *  entries which folds to the same key makes up a equivalent class
  63 *  whose members are stored as contiguous cluster (may cross page boundary)
  64 *  but whose order is arbitrary and acts as duplicate, e.g.,
  65 *  abc, Abc, aBc, abC)
  66 *
  67 * once match is found at leaf, requires scan forward/backward
  68 * either for, in case-insensitive search, duplicate
  69 * or for, in case-sensitive search, for exact match
  70 *
  71 * router entry must be created/stored in case-insensitive way
  72 * in internal entry:
  73 * (right most key of left page and left most key of right page
  74 * are folded, and its suffix compression is propagated as router
  75 * key in parent)
  76 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
  77 * should be made the router key for the split)
  78 *
  79 * case-insensitive search:
  80 *
  81 *	fold search key;
  82 *
  83 *	case-insensitive search of B-tree:
  84 *	for internal entry, router key is already folded;
  85 *	for leaf entry, fold the entry key before comparison.
  86 *
  87 *	if (leaf entry case-insensitive match found)
  88 *		if (next entry satisfies case-insensitive match)
  89 *			return EDUPLICATE;
  90 *		if (prev entry satisfies case-insensitive match)
  91 *			return EDUPLICATE;
  92 *		return match;
  93 *	else
  94 *		return no match;
  95 *
  96 *	serialization:
  97 * target directory inode lock is being held on entry/exit
  98 * of all main directory service routines.
  99 *
 100 *	log based recovery:
 101 */
 102
 103#include <linux/fs.h>
 104#include <linux/quotaops.h>
 105#include <linux/slab.h>
 106#include "jfs_incore.h"
 107#include "jfs_superblock.h"
 108#include "jfs_filsys.h"
 109#include "jfs_metapage.h"
 110#include "jfs_dmap.h"
 111#include "jfs_unicode.h"
 112#include "jfs_debug.h"
 113
 114/* dtree split parameter */
 115struct dtsplit {
 116	struct metapage *mp;
 117	s16 index;
 118	s16 nslot;
 119	struct component_name *key;
 120	ddata_t *data;
 121	struct pxdlist *pxdlist;
 122};
 123
 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
 125
 126/* get page buffer for specified block address */
 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
 128do {									\
 129	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot);	\
 130	if (!(RC)) {							\
 131		if (((P)->header.nextindex >				\
 132		     (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
 133		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) {	\
 134			BT_PUTPAGE(MP);					\
 135			jfs_error((IP)->i_sb,				\
 136				  "DT_GETPAGE: dtree page corrupt\n");	\
 137			MP = NULL;					\
 138			RC = -EIO;					\
 139		}							\
 140	}								\
 141} while (0)
 142
 143/* for consistency */
 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
 145
 146#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
 147	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
 148
 149/*
 150 * forward references
 151 */
 152static int dtSplitUp(tid_t tid, struct inode *ip,
 153		     struct dtsplit * split, struct btstack * btstack);
 154
 155static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 156		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
 157
 158static int dtExtendPage(tid_t tid, struct inode *ip,
 159			struct dtsplit * split, struct btstack * btstack);
 160
 161static int dtSplitRoot(tid_t tid, struct inode *ip,
 162		       struct dtsplit * split, struct metapage ** rmpp);
 163
 164static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
 165		      dtpage_t * fp, struct btstack * btstack);
 166
 167static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
 168
 169static int dtReadFirst(struct inode *ip, struct btstack * btstack);
 170
 171static int dtReadNext(struct inode *ip,
 172		      loff_t * offset, struct btstack * btstack);
 173
 174static int dtCompare(struct component_name * key, dtpage_t * p, int si);
 175
 176static int ciCompare(struct component_name * key, dtpage_t * p, int si,
 177		     int flag);
 178
 179static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
 180		     int flag);
 181
 182static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 183			      int ri, struct component_name * key, int flag);
 184
 185static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
 186			  ddata_t * data, struct dt_lock **);
 187
 188static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
 189			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
 190			int do_index);
 191
 192static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
 193
 194static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
 195
 196static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
 197
 198#define ciToUpper(c)	UniStrupr((c)->name)
 199
 200/*
 201 *	read_index_page()
 202 *
 203 *	Reads a page of a directory's index table.
 204 *	Having metadata mapped into the directory inode's address space
 205 *	presents a multitude of problems.  We avoid this by mapping to
 206 *	the absolute address space outside of the *_metapage routines
 207 */
 208static struct metapage *read_index_page(struct inode *inode, s64 blkno)
 209{
 210	int rc;
 211	s64 xaddr;
 212	int xflag;
 213	s32 xlen;
 214
 215	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 216	if (rc || (xaddr == 0))
 217		return NULL;
 218
 219	return read_metapage(inode, xaddr, PSIZE, 1);
 220}
 221
 222/*
 223 *	get_index_page()
 224 *
 225 *	Same as get_index_page(), but get's a new page without reading
 226 */
 227static struct metapage *get_index_page(struct inode *inode, s64 blkno)
 228{
 229	int rc;
 230	s64 xaddr;
 231	int xflag;
 232	s32 xlen;
 233
 234	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 235	if (rc || (xaddr == 0))
 236		return NULL;
 237
 238	return get_metapage(inode, xaddr, PSIZE, 1);
 239}
 240
 241/*
 242 *	find_index()
 243 *
 244 *	Returns dtree page containing directory table entry for specified
 245 *	index and pointer to its entry.
 246 *
 247 *	mp must be released by caller.
 248 */
 249static struct dir_table_slot *find_index(struct inode *ip, u32 index,
 250					 struct metapage ** mp, s64 *lblock)
 251{
 252	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 253	s64 blkno;
 254	s64 offset;
 255	int page_offset;
 256	struct dir_table_slot *slot;
 257	static int maxWarnings = 10;
 258
 259	if (index < 2) {
 260		if (maxWarnings) {
 261			jfs_warn("find_entry called with index = %d", index);
 262			maxWarnings--;
 263		}
 264		return NULL;
 265	}
 266
 267	if (index >= jfs_ip->next_index) {
 268		jfs_warn("find_entry called with index >= next_index");
 269		return NULL;
 270	}
 271
 272	if (jfs_dirtable_inline(ip)) {
 273		/*
 274		 * Inline directory table
 275		 */
 276		*mp = NULL;
 277		slot = &jfs_ip->i_dirtable[index - 2];
 278	} else {
 279		offset = (index - 2) * sizeof(struct dir_table_slot);
 280		page_offset = offset & (PSIZE - 1);
 281		blkno = ((offset + 1) >> L2PSIZE) <<
 282		    JFS_SBI(ip->i_sb)->l2nbperpage;
 283
 284		if (*mp && (*lblock != blkno)) {
 285			release_metapage(*mp);
 286			*mp = NULL;
 287		}
 288		if (!(*mp)) {
 289			*lblock = blkno;
 290			*mp = read_index_page(ip, blkno);
 291		}
 292		if (!(*mp)) {
 293			jfs_err("free_index: error reading directory table");
 294			return NULL;
 295		}
 296
 297		slot =
 298		    (struct dir_table_slot *) ((char *) (*mp)->data +
 299					       page_offset);
 300	}
 301	return slot;
 302}
 303
 304static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
 305			      u32 index)
 306{
 307	struct tlock *tlck;
 308	struct linelock *llck;
 309	struct lv *lv;
 310
 311	tlck = txLock(tid, ip, mp, tlckDATA);
 312	llck = (struct linelock *) tlck->lock;
 313
 314	if (llck->index >= llck->maxcnt)
 315		llck = txLinelock(llck);
 316	lv = &llck->lv[llck->index];
 317
 318	/*
 319	 *	Linelock slot size is twice the size of directory table
 320	 *	slot size.  512 entries per page.
 321	 */
 322	lv->offset = ((index - 2) & 511) >> 1;
 323	lv->length = 1;
 324	llck->index++;
 325}
 326
 327/*
 328 *	add_index()
 329 *
 330 *	Adds an entry to the directory index table.  This is used to provide
 331 *	each directory entry with a persistent index in which to resume
 332 *	directory traversals
 333 */
 334static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 335{
 336	struct super_block *sb = ip->i_sb;
 337	struct jfs_sb_info *sbi = JFS_SBI(sb);
 338	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 339	u64 blkno;
 340	struct dir_table_slot *dirtab_slot;
 341	u32 index;
 342	struct linelock *llck;
 343	struct lv *lv;
 344	struct metapage *mp;
 345	s64 offset;
 346	uint page_offset;
 347	struct tlock *tlck;
 348	s64 xaddr;
 349
 350	ASSERT(DO_INDEX(ip));
 351
 352	if (jfs_ip->next_index < 2) {
 353		jfs_warn("add_index: next_index = %d.  Resetting!",
 354			   jfs_ip->next_index);
 355		jfs_ip->next_index = 2;
 356	}
 357
 358	index = jfs_ip->next_index++;
 359
 360	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
 361		/*
 362		 * i_size reflects size of index table, or 8 bytes per entry.
 363		 */
 364		ip->i_size = (loff_t) (index - 1) << 3;
 365
 366		/*
 367		 * dir table fits inline within inode
 368		 */
 369		dirtab_slot = &jfs_ip->i_dirtable[index-2];
 370		dirtab_slot->flag = DIR_INDEX_VALID;
 371		dirtab_slot->slot = slot;
 372		DTSaddress(dirtab_slot, bn);
 373
 374		set_cflag(COMMIT_Dirtable, ip);
 375
 376		return index;
 377	}
 378	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
 379		struct dir_table_slot temp_table[12];
 380
 381		/*
 382		 * It's time to move the inline table to an external
 383		 * page and begin to build the xtree
 384		 */
 385		if (dquot_alloc_block(ip, sbi->nbperpage))
 386			goto clean_up;
 387		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
 388			dquot_free_block(ip, sbi->nbperpage);
 389			goto clean_up;
 390		}
 391
 392		/*
 393		 * Save the table, we're going to overwrite it with the
 394		 * xtree root
 395		 */
 396		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
 397
 398		/*
 399		 * Initialize empty x-tree
 400		 */
 401		xtInitRoot(tid, ip);
 402
 403		/*
 404		 * Add the first block to the xtree
 405		 */
 406		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
 407			/* This really shouldn't fail */
 408			jfs_warn("add_index: xtInsert failed!");
 409			memcpy(&jfs_ip->i_dirtable, temp_table,
 410			       sizeof (temp_table));
 411			dbFree(ip, xaddr, sbi->nbperpage);
 412			dquot_free_block(ip, sbi->nbperpage);
 413			goto clean_up;
 414		}
 415		ip->i_size = PSIZE;
 416
 417		mp = get_index_page(ip, 0);
 418		if (!mp) {
 419			jfs_err("add_index: get_metapage failed!");
 420			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
 421			memcpy(&jfs_ip->i_dirtable, temp_table,
 422			       sizeof (temp_table));
 423			goto clean_up;
 424		}
 425		tlck = txLock(tid, ip, mp, tlckDATA);
 426		llck = (struct linelock *) & tlck->lock;
 427		ASSERT(llck->index == 0);
 428		lv = &llck->lv[0];
 429
 430		lv->offset = 0;
 431		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
 432		llck->index++;
 433
 434		memcpy(mp->data, temp_table, sizeof(temp_table));
 435
 436		mark_metapage_dirty(mp);
 437		release_metapage(mp);
 438
 439		/*
 440		 * Logging is now directed by xtree tlocks
 441		 */
 442		clear_cflag(COMMIT_Dirtable, ip);
 443	}
 444
 445	offset = (index - 2) * sizeof(struct dir_table_slot);
 446	page_offset = offset & (PSIZE - 1);
 447	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
 448	if (page_offset == 0) {
 449		/*
 450		 * This will be the beginning of a new page
 451		 */
 452		xaddr = 0;
 453		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
 454			jfs_warn("add_index: xtInsert failed!");
 455			goto clean_up;
 456		}
 457		ip->i_size += PSIZE;
 458
 459		if ((mp = get_index_page(ip, blkno)))
 460			memset(mp->data, 0, PSIZE);	/* Just looks better */
 461		else
 462			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
 463	} else
 464		mp = read_index_page(ip, blkno);
 465
 466	if (!mp) {
 467		jfs_err("add_index: get/read_metapage failed!");
 468		goto clean_up;
 469	}
 470
 471	lock_index(tid, ip, mp, index);
 472
 473	dirtab_slot =
 474	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
 475	dirtab_slot->flag = DIR_INDEX_VALID;
 476	dirtab_slot->slot = slot;
 477	DTSaddress(dirtab_slot, bn);
 478
 479	mark_metapage_dirty(mp);
 480	release_metapage(mp);
 481
 482	return index;
 483
 484      clean_up:
 485
 486	jfs_ip->next_index--;
 487
 488	return 0;
 489}
 490
 491/*
 492 *	free_index()
 493 *
 494 *	Marks an entry to the directory index table as free.
 495 */
 496static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
 497{
 498	struct dir_table_slot *dirtab_slot;
 499	s64 lblock;
 500	struct metapage *mp = NULL;
 501
 502	dirtab_slot = find_index(ip, index, &mp, &lblock);
 503
 504	if (!dirtab_slot)
 505		return;
 506
 507	dirtab_slot->flag = DIR_INDEX_FREE;
 508	dirtab_slot->slot = dirtab_slot->addr1 = 0;
 509	dirtab_slot->addr2 = cpu_to_le32(next);
 510
 511	if (mp) {
 512		lock_index(tid, ip, mp, index);
 513		mark_metapage_dirty(mp);
 514		release_metapage(mp);
 515	} else
 516		set_cflag(COMMIT_Dirtable, ip);
 517}
 518
 519/*
 520 *	modify_index()
 521 *
 522 *	Changes an entry in the directory index table
 523 */
 524static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
 525			 int slot, struct metapage ** mp, s64 *lblock)
 526{
 527	struct dir_table_slot *dirtab_slot;
 528
 529	dirtab_slot = find_index(ip, index, mp, lblock);
 530
 531	if (!dirtab_slot)
 532		return;
 533
 534	DTSaddress(dirtab_slot, bn);
 535	dirtab_slot->slot = slot;
 536
 537	if (*mp) {
 538		lock_index(tid, ip, *mp, index);
 539		mark_metapage_dirty(*mp);
 540	} else
 541		set_cflag(COMMIT_Dirtable, ip);
 542}
 543
 544/*
 545 *	read_index()
 546 *
 547 *	reads a directory table slot
 548 */
 549static int read_index(struct inode *ip, u32 index,
 550		     struct dir_table_slot * dirtab_slot)
 551{
 552	s64 lblock;
 553	struct metapage *mp = NULL;
 554	struct dir_table_slot *slot;
 555
 556	slot = find_index(ip, index, &mp, &lblock);
 557	if (!slot) {
 558		return -EIO;
 559	}
 560
 561	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
 562
 563	if (mp)
 564		release_metapage(mp);
 565
 566	return 0;
 567}
 568
 569/*
 570 *	dtSearch()
 571 *
 572 * function:
 573 *	Search for the entry with specified key
 574 *
 575 * parameter:
 576 *
 577 * return: 0 - search result on stack, leaf page pinned;
 578 *	   errno - I/O error
 579 */
 580int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 581	     struct btstack * btstack, int flag)
 582{
 583	int rc = 0;
 584	int cmp = 1;		/* init for empty page */
 585	s64 bn;
 586	struct metapage *mp;
 587	dtpage_t *p;
 588	s8 *stbl;
 589	int base, index, lim;
 590	struct btframe *btsp;
 591	pxd_t *pxd;
 592	int psize = 288;	/* initial in-line directory */
 593	ino_t inumber;
 594	struct component_name ciKey;
 595	struct super_block *sb = ip->i_sb;
 596
 597	ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
 
 598	if (!ciKey.name) {
 599		rc = -ENOMEM;
 600		goto dtSearch_Exit2;
 601	}
 602
 603
 604	/* uppercase search key for c-i directory */
 605	UniStrcpy(ciKey.name, key->name);
 606	ciKey.namlen = key->namlen;
 607
 608	/* only uppercase if case-insensitive support is on */
 609	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
 610		ciToUpper(&ciKey);
 611	}
 612	BT_CLR(btstack);	/* reset stack */
 613
 614	/* init level count for max pages to split */
 615	btstack->nsplit = 1;
 616
 617	/*
 618	 *	search down tree from root:
 619	 *
 620	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
 621	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
 622	 *
 623	 * if entry with search key K is not found
 624	 * internal page search find the entry with largest key Ki
 625	 * less than K which point to the child page to search;
 626	 * leaf page search find the entry with smallest key Kj
 627	 * greater than K so that the returned index is the position of
 628	 * the entry to be shifted right for insertion of new entry.
 629	 * for empty tree, search key is greater than any key of the tree.
 630	 *
 631	 * by convention, root bn = 0.
 632	 */
 633	for (bn = 0;;) {
 634		/* get/pin the page to search */
 635		DT_GETPAGE(ip, bn, mp, psize, p, rc);
 636		if (rc)
 637			goto dtSearch_Exit1;
 638
 639		/* get sorted entry table of the page */
 640		stbl = DT_GETSTBL(p);
 641
 642		/*
 643		 * binary search with search key K on the current page.
 644		 */
 645		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
 646			index = base + (lim >> 1);
 647
 
 
 
 
 
 648			if (p->header.flag & BT_LEAF) {
 649				/* uppercase leaf name to compare */
 650				cmp =
 651				    ciCompare(&ciKey, p, stbl[index],
 652					      JFS_SBI(sb)->mntflag);
 653			} else {
 654				/* router key is in uppercase */
 655
 656				cmp = dtCompare(&ciKey, p, stbl[index]);
 657
 658
 659			}
 660			if (cmp == 0) {
 661				/*
 662				 *	search hit
 663				 */
 664				/* search hit - leaf page:
 665				 * return the entry found
 666				 */
 667				if (p->header.flag & BT_LEAF) {
 668					inumber = le32_to_cpu(
 669			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
 670
 671					/*
 672					 * search for JFS_LOOKUP
 673					 */
 674					if (flag == JFS_LOOKUP) {
 675						*data = inumber;
 676						rc = 0;
 677						goto out;
 678					}
 679
 680					/*
 681					 * search for JFS_CREATE
 682					 */
 683					if (flag == JFS_CREATE) {
 684						*data = inumber;
 685						rc = -EEXIST;
 686						goto out;
 687					}
 688
 689					/*
 690					 * search for JFS_REMOVE or JFS_RENAME
 691					 */
 692					if ((flag == JFS_REMOVE ||
 693					     flag == JFS_RENAME) &&
 694					    *data != inumber) {
 695						rc = -ESTALE;
 696						goto out;
 697					}
 698
 699					/*
 700					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
 701					 */
 702					/* save search result */
 703					*data = inumber;
 704					btsp = btstack->top;
 705					btsp->bn = bn;
 706					btsp->index = index;
 707					btsp->mp = mp;
 708
 709					rc = 0;
 710					goto dtSearch_Exit1;
 711				}
 712
 713				/* search hit - internal page:
 714				 * descend/search its child page
 715				 */
 716				goto getChild;
 717			}
 718
 719			if (cmp > 0) {
 720				base = index + 1;
 721				--lim;
 722			}
 723		}
 724
 725		/*
 726		 *	search miss
 727		 *
 728		 * base is the smallest index with key (Kj) greater than
 729		 * search key (K) and may be zero or (maxindex + 1) index.
 730		 */
 731		/*
 732		 * search miss - leaf page
 733		 *
 734		 * return location of entry (base) where new entry with
 735		 * search key K is to be inserted.
 736		 */
 737		if (p->header.flag & BT_LEAF) {
 738			/*
 739			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
 740			 */
 741			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
 742			    flag == JFS_RENAME) {
 743				rc = -ENOENT;
 744				goto out;
 745			}
 746
 747			/*
 748			 * search for JFS_CREATE|JFS_FINDDIR:
 749			 *
 750			 * save search result
 751			 */
 752			*data = 0;
 753			btsp = btstack->top;
 754			btsp->bn = bn;
 755			btsp->index = base;
 756			btsp->mp = mp;
 757
 758			rc = 0;
 759			goto dtSearch_Exit1;
 760		}
 761
 762		/*
 763		 * search miss - internal page
 764		 *
 765		 * if base is non-zero, decrement base by one to get the parent
 766		 * entry of the child page to search.
 767		 */
 768		index = base ? base - 1 : base;
 769
 770		/*
 771		 * go down to child page
 772		 */
 773	      getChild:
 774		/* update max. number of pages to split */
 775		if (BT_STACK_FULL(btstack)) {
 776			/* Something's corrupted, mark filesystem dirty so
 777			 * chkdsk will fix it.
 778			 */
 779			jfs_error(sb, "stack overrun!\n");
 780			BT_STACK_DUMP(btstack);
 781			rc = -EIO;
 782			goto out;
 783		}
 784		btstack->nsplit++;
 785
 786		/* push (bn, index) of the parent page/entry */
 787		BT_PUSH(btstack, bn, index);
 788
 789		/* get the child page block number */
 790		pxd = (pxd_t *) & p->slot[stbl[index]];
 791		bn = addressPXD(pxd);
 792		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
 793
 794		/* unpin the parent page */
 795		DT_PUTPAGE(mp);
 796	}
 797
 798      out:
 799	DT_PUTPAGE(mp);
 800
 801      dtSearch_Exit1:
 802
 803	kfree(ciKey.name);
 804
 805      dtSearch_Exit2:
 806
 807	return rc;
 808}
 809
 810
 811/*
 812 *	dtInsert()
 813 *
 814 * function: insert an entry to directory tree
 815 *
 816 * parameter:
 817 *
 818 * return: 0 - success;
 819 *	   errno - failure;
 820 */
 821int dtInsert(tid_t tid, struct inode *ip,
 822	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
 823{
 824	int rc = 0;
 825	struct metapage *mp;	/* meta-page buffer */
 826	dtpage_t *p;		/* base B+-tree index page */
 827	s64 bn;
 828	int index;
 829	struct dtsplit split;	/* split information */
 830	ddata_t data;
 831	struct dt_lock *dtlck;
 832	int n;
 833	struct tlock *tlck;
 834	struct lv *lv;
 835
 836	/*
 837	 *	retrieve search result
 838	 *
 839	 * dtSearch() returns (leaf page pinned, index at which to insert).
 840	 * n.b. dtSearch() may return index of (maxindex + 1) of
 841	 * the full page.
 842	 */
 843	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
 
 
 844
 845	/*
 846	 *	insert entry for new key
 847	 */
 848	if (DO_INDEX(ip)) {
 849		if (JFS_IP(ip)->next_index == DIREND) {
 850			DT_PUTPAGE(mp);
 851			return -EMLINK;
 852		}
 853		n = NDTLEAF(name->namlen);
 854		data.leaf.tid = tid;
 855		data.leaf.ip = ip;
 856	} else {
 857		n = NDTLEAF_LEGACY(name->namlen);
 858		data.leaf.ip = NULL;	/* signifies legacy directory format */
 859	}
 860	data.leaf.ino = *fsn;
 861
 862	/*
 863	 *	leaf page does not have enough room for new entry:
 864	 *
 865	 *	extend/split the leaf page;
 866	 *
 867	 * dtSplitUp() will insert the entry and unpin the leaf page.
 868	 */
 869	if (n > p->header.freecnt) {
 870		split.mp = mp;
 871		split.index = index;
 872		split.nslot = n;
 873		split.key = name;
 874		split.data = &data;
 875		rc = dtSplitUp(tid, ip, &split, btstack);
 876		return rc;
 877	}
 878
 879	/*
 880	 *	leaf page does have enough room for new entry:
 881	 *
 882	 *	insert the new data entry into the leaf page;
 883	 */
 884	BT_MARK_DIRTY(mp, ip);
 885	/*
 886	 * acquire a transaction lock on the leaf page
 887	 */
 888	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
 889	dtlck = (struct dt_lock *) & tlck->lock;
 890	ASSERT(dtlck->index == 0);
 891	lv = & dtlck->lv[0];
 892
 893	/* linelock header */
 894	lv->offset = 0;
 895	lv->length = 1;
 896	dtlck->index++;
 897
 898	dtInsertEntry(p, index, name, &data, &dtlck);
 899
 900	/* linelock stbl of non-root leaf page */
 901	if (!(p->header.flag & BT_ROOT)) {
 902		if (dtlck->index >= dtlck->maxcnt)
 903			dtlck = (struct dt_lock *) txLinelock(dtlck);
 904		lv = & dtlck->lv[dtlck->index];
 905		n = index >> L2DTSLOTSIZE;
 906		lv->offset = p->header.stblindex + n;
 907		lv->length =
 908		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
 909		dtlck->index++;
 910	}
 911
 912	/* unpin the leaf page */
 913	DT_PUTPAGE(mp);
 914
 915	return 0;
 916}
 917
 918
 919/*
 920 *	dtSplitUp()
 921 *
 922 * function: propagate insertion bottom up;
 923 *
 924 * parameter:
 925 *
 926 * return: 0 - success;
 927 *	   errno - failure;
 928 *	leaf page unpinned;
 929 */
 930static int dtSplitUp(tid_t tid,
 931	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
 932{
 933	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 934	int rc = 0;
 935	struct metapage *smp;
 936	dtpage_t *sp;		/* split page */
 937	struct metapage *rmp;
 938	dtpage_t *rp;		/* new right page split from sp */
 939	pxd_t rpxd;		/* new right page extent descriptor */
 940	struct metapage *lmp;
 941	dtpage_t *lp;		/* left child page */
 942	int skip;		/* index of entry of insertion */
 943	struct btframe *parent;	/* parent page entry on traverse stack */
 944	s64 xaddr, nxaddr;
 945	int xlen, xsize;
 946	struct pxdlist pxdlist;
 947	pxd_t *pxd;
 948	struct component_name key = { 0, NULL };
 949	ddata_t *data = split->data;
 950	int n;
 951	struct dt_lock *dtlck;
 952	struct tlock *tlck;
 953	struct lv *lv;
 954	int quota_allocation = 0;
 955
 956	/* get split page */
 957	smp = split->mp;
 958	sp = DT_PAGE(ip, smp);
 959
 960	key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
 961	if (!key.name) {
 962		DT_PUTPAGE(smp);
 963		rc = -ENOMEM;
 964		goto dtSplitUp_Exit;
 965	}
 966
 967	/*
 968	 *	split leaf page
 969	 *
 970	 * The split routines insert the new entry, and
 971	 * acquire txLock as appropriate.
 972	 */
 973	/*
 974	 *	split root leaf page:
 975	 */
 976	if (sp->header.flag & BT_ROOT) {
 977		/*
 978		 * allocate a single extent child page
 979		 */
 980		xlen = 1;
 981		n = sbi->bsize >> L2DTSLOTSIZE;
 982		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
 983		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
 984		if (n <= split->nslot)
 985			xlen++;
 986		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
 987			DT_PUTPAGE(smp);
 988			goto freeKeyName;
 989		}
 990
 991		pxdlist.maxnpxd = 1;
 992		pxdlist.npxd = 0;
 993		pxd = &pxdlist.pxd[0];
 994		PXDaddress(pxd, xaddr);
 995		PXDlength(pxd, xlen);
 996		split->pxdlist = &pxdlist;
 997		rc = dtSplitRoot(tid, ip, split, &rmp);
 998
 999		if (rc)
1000			dbFree(ip, xaddr, xlen);
1001		else
1002			DT_PUTPAGE(rmp);
1003
1004		DT_PUTPAGE(smp);
1005
1006		if (!DO_INDEX(ip))
1007			ip->i_size = xlen << sbi->l2bsize;
1008
1009		goto freeKeyName;
1010	}
1011
1012	/*
1013	 *	extend first leaf page
1014	 *
1015	 * extend the 1st extent if less than buffer page size
1016	 * (dtExtendPage() reurns leaf page unpinned)
1017	 */
1018	pxd = &sp->header.self;
1019	xlen = lengthPXD(pxd);
1020	xsize = xlen << sbi->l2bsize;
1021	if (xsize < PSIZE) {
1022		xaddr = addressPXD(pxd);
1023		n = xsize >> L2DTSLOTSIZE;
1024		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
1025		if ((n + sp->header.freecnt) <= split->nslot)
1026			n = xlen + (xlen << 1);
1027		else
1028			n = xlen;
1029
1030		/* Allocate blocks to quota. */
1031		rc = dquot_alloc_block(ip, n);
1032		if (rc)
1033			goto extendOut;
1034		quota_allocation += n;
1035
1036		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1037				    (s64) n, &nxaddr)))
1038			goto extendOut;
1039
1040		pxdlist.maxnpxd = 1;
1041		pxdlist.npxd = 0;
1042		pxd = &pxdlist.pxd[0];
1043		PXDaddress(pxd, nxaddr);
1044		PXDlength(pxd, xlen + n);
1045		split->pxdlist = &pxdlist;
1046		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047			nxaddr = addressPXD(pxd);
1048			if (xaddr != nxaddr) {
1049				/* free relocated extent */
1050				xlen = lengthPXD(pxd);
1051				dbFree(ip, nxaddr, (s64) xlen);
1052			} else {
1053				/* free extended delta */
1054				xlen = lengthPXD(pxd) - n;
1055				xaddr = addressPXD(pxd) + xlen;
1056				dbFree(ip, xaddr, (s64) n);
1057			}
1058		} else if (!DO_INDEX(ip))
1059			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
1060
1061
1062	      extendOut:
1063		DT_PUTPAGE(smp);
1064		goto freeKeyName;
1065	}
1066
1067	/*
1068	 *	split leaf page <sp> into <sp> and a new right page <rp>.
1069	 *
1070	 * return <rp> pinned and its extent descriptor <rpxd>
1071	 */
1072	/*
1073	 * allocate new directory page extent and
1074	 * new index page(s) to cover page split(s)
1075	 *
1076	 * allocation hint: ?
1077	 */
1078	n = btstack->nsplit;
1079	pxdlist.maxnpxd = pxdlist.npxd = 0;
1080	xlen = sbi->nbperpage;
1081	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1082		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1083			PXDaddress(pxd, xaddr);
1084			PXDlength(pxd, xlen);
1085			pxdlist.maxnpxd++;
1086			continue;
1087		}
1088
1089		DT_PUTPAGE(smp);
1090
1091		/* undo allocation */
1092		goto splitOut;
1093	}
1094
1095	split->pxdlist = &pxdlist;
1096	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1097		DT_PUTPAGE(smp);
1098
1099		/* undo allocation */
1100		goto splitOut;
1101	}
1102
1103	if (!DO_INDEX(ip))
1104		ip->i_size += PSIZE;
1105
1106	/*
1107	 * propagate up the router entry for the leaf page just split
1108	 *
1109	 * insert a router entry for the new page into the parent page,
1110	 * propagate the insert/split up the tree by walking back the stack
1111	 * of (bn of parent page, index of child page entry in parent page)
1112	 * that were traversed during the search for the page that split.
1113	 *
1114	 * the propagation of insert/split up the tree stops if the root
1115	 * splits or the page inserted into doesn't have to split to hold
1116	 * the new entry.
1117	 *
1118	 * the parent entry for the split page remains the same, and
1119	 * a new entry is inserted at its right with the first key and
1120	 * block number of the new right page.
1121	 *
1122	 * There are a maximum of 4 pages pinned at any time:
1123	 * two children, left parent and right parent (when the parent splits).
1124	 * keep the child pages pinned while working on the parent.
1125	 * make sure that all pins are released at exit.
1126	 */
1127	while ((parent = BT_POP(btstack)) != NULL) {
1128		/* parent page specified by stack frame <parent> */
1129
1130		/* keep current child pages (<lp>, <rp>) pinned */
1131		lmp = smp;
1132		lp = sp;
1133
1134		/*
1135		 * insert router entry in parent for new right child page <rp>
1136		 */
1137		/* get the parent page <sp> */
1138		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1139		if (rc) {
1140			DT_PUTPAGE(lmp);
1141			DT_PUTPAGE(rmp);
1142			goto splitOut;
1143		}
1144
1145		/*
1146		 * The new key entry goes ONE AFTER the index of parent entry,
1147		 * because the split was to the right.
1148		 */
1149		skip = parent->index + 1;
1150
1151		/*
1152		 * compute the key for the router entry
1153		 *
1154		 * key suffix compression:
1155		 * for internal pages that have leaf pages as children,
1156		 * retain only what's needed to distinguish between
1157		 * the new entry and the entry on the page to its left.
1158		 * If the keys compare equal, retain the entire key.
1159		 *
1160		 * note that compression is performed only at computing
1161		 * router key at the lowest internal level.
1162		 * further compression of the key between pairs of higher
1163		 * level internal pages loses too much information and
1164		 * the search may fail.
1165		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1166		 * results in two adjacent parent entries (a)(xx).
1167		 * if split occurs between these two entries, and
1168		 * if compression is applied, the router key of parent entry
1169		 * of right page (x) will divert search for x into right
1170		 * subtree and miss x in the left subtree.)
1171		 *
1172		 * the entire key must be retained for the next-to-leftmost
1173		 * internal key at any level of the tree, or search may fail
1174		 * (e.g., ?)
1175		 */
1176		switch (rp->header.flag & BT_TYPE) {
1177		case BT_LEAF:
1178			/*
1179			 * compute the length of prefix for suffix compression
1180			 * between last entry of left page and first entry
1181			 * of right page
1182			 */
1183			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1184			    sp->header.prev != 0 || skip > 1) {
1185				/* compute uppercase router prefix key */
1186				rc = ciGetLeafPrefixKey(lp,
1187							lp->header.nextindex-1,
1188							rp, 0, &key,
1189							sbi->mntflag);
1190				if (rc) {
1191					DT_PUTPAGE(lmp);
1192					DT_PUTPAGE(rmp);
1193					DT_PUTPAGE(smp);
1194					goto splitOut;
1195				}
1196			} else {
1197				/* next to leftmost entry of
1198				   lowest internal level */
1199
1200				/* compute uppercase router key */
1201				dtGetKey(rp, 0, &key, sbi->mntflag);
1202				key.name[key.namlen] = 0;
1203
1204				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1205					ciToUpper(&key);
1206			}
1207
1208			n = NDTINTERNAL(key.namlen);
1209			break;
1210
1211		case BT_INTERNAL:
1212			dtGetKey(rp, 0, &key, sbi->mntflag);
1213			n = NDTINTERNAL(key.namlen);
1214			break;
1215
1216		default:
1217			jfs_err("dtSplitUp(): UFO!");
1218			break;
1219		}
1220
1221		/* unpin left child page */
1222		DT_PUTPAGE(lmp);
1223
1224		/*
1225		 * compute the data for the router entry
1226		 */
1227		data->xd = rpxd;	/* child page xd */
1228
1229		/*
1230		 * parent page is full - split the parent page
1231		 */
1232		if (n > sp->header.freecnt) {
1233			/* init for parent page split */
1234			split->mp = smp;
1235			split->index = skip;	/* index at insert */
1236			split->nslot = n;
1237			split->key = &key;
1238			/* split->data = data; */
1239
1240			/* unpin right child page */
1241			DT_PUTPAGE(rmp);
1242
1243			/* The split routines insert the new entry,
1244			 * acquire txLock as appropriate.
1245			 * return <rp> pinned and its block number <rbn>.
1246			 */
1247			rc = (sp->header.flag & BT_ROOT) ?
1248			    dtSplitRoot(tid, ip, split, &rmp) :
1249			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1250			if (rc) {
1251				DT_PUTPAGE(smp);
1252				goto splitOut;
1253			}
1254
1255			/* smp and rmp are pinned */
1256		}
1257		/*
1258		 * parent page is not full - insert router entry in parent page
1259		 */
1260		else {
1261			BT_MARK_DIRTY(smp, ip);
1262			/*
1263			 * acquire a transaction lock on the parent page
1264			 */
1265			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1266			dtlck = (struct dt_lock *) & tlck->lock;
1267			ASSERT(dtlck->index == 0);
1268			lv = & dtlck->lv[0];
1269
1270			/* linelock header */
1271			lv->offset = 0;
1272			lv->length = 1;
1273			dtlck->index++;
1274
1275			/* linelock stbl of non-root parent page */
1276			if (!(sp->header.flag & BT_ROOT)) {
1277				lv++;
1278				n = skip >> L2DTSLOTSIZE;
1279				lv->offset = sp->header.stblindex + n;
1280				lv->length =
1281				    ((sp->header.nextindex -
1282				      1) >> L2DTSLOTSIZE) - n + 1;
1283				dtlck->index++;
1284			}
1285
1286			dtInsertEntry(sp, skip, &key, data, &dtlck);
1287
1288			/* exit propagate up */
1289			break;
1290		}
1291	}
1292
1293	/* unpin current split and its right page */
1294	DT_PUTPAGE(smp);
1295	DT_PUTPAGE(rmp);
1296
1297	/*
1298	 * free remaining extents allocated for split
1299	 */
1300      splitOut:
1301	n = pxdlist.npxd;
1302	pxd = &pxdlist.pxd[n];
1303	for (; n < pxdlist.maxnpxd; n++, pxd++)
1304		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1305
1306      freeKeyName:
1307	kfree(key.name);
1308
1309	/* Rollback quota allocation */
1310	if (rc && quota_allocation)
1311		dquot_free_block(ip, quota_allocation);
1312
1313      dtSplitUp_Exit:
1314
1315	return rc;
1316}
1317
1318
1319/*
1320 *	dtSplitPage()
1321 *
1322 * function: Split a non-root page of a btree.
1323 *
1324 * parameter:
1325 *
1326 * return: 0 - success;
1327 *	   errno - failure;
1328 *	return split and new page pinned;
1329 */
1330static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1331	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1332{
1333	int rc = 0;
1334	struct metapage *smp;
1335	dtpage_t *sp;
1336	struct metapage *rmp;
1337	dtpage_t *rp;		/* new right page allocated */
1338	s64 rbn;		/* new right page block number */
1339	struct metapage *mp;
1340	dtpage_t *p;
1341	s64 nextbn;
1342	struct pxdlist *pxdlist;
1343	pxd_t *pxd;
1344	int skip, nextindex, half, left, nxt, off, si;
1345	struct ldtentry *ldtentry;
1346	struct idtentry *idtentry;
1347	u8 *stbl;
1348	struct dtslot *f;
1349	int fsi, stblsize;
1350	int n;
1351	struct dt_lock *sdtlck, *rdtlck;
1352	struct tlock *tlck;
1353	struct dt_lock *dtlck;
1354	struct lv *slv, *rlv, *lv;
1355
1356	/* get split page */
1357	smp = split->mp;
1358	sp = DT_PAGE(ip, smp);
1359
1360	/*
1361	 * allocate the new right page for the split
1362	 */
1363	pxdlist = split->pxdlist;
1364	pxd = &pxdlist->pxd[pxdlist->npxd];
1365	pxdlist->npxd++;
1366	rbn = addressPXD(pxd);
1367	rmp = get_metapage(ip, rbn, PSIZE, 1);
1368	if (rmp == NULL)
1369		return -EIO;
1370
1371	/* Allocate blocks to quota. */
1372	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1373	if (rc) {
1374		release_metapage(rmp);
1375		return rc;
1376	}
1377
1378	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1379
1380	BT_MARK_DIRTY(rmp, ip);
1381	/*
1382	 * acquire a transaction lock on the new right page
1383	 */
1384	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1385	rdtlck = (struct dt_lock *) & tlck->lock;
1386
1387	rp = (dtpage_t *) rmp->data;
1388	*rpp = rp;
1389	rp->header.self = *pxd;
1390
1391	BT_MARK_DIRTY(smp, ip);
1392	/*
1393	 * acquire a transaction lock on the split page
1394	 *
1395	 * action:
1396	 */
1397	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1398	sdtlck = (struct dt_lock *) & tlck->lock;
1399
1400	/* linelock header of split page */
1401	ASSERT(sdtlck->index == 0);
1402	slv = & sdtlck->lv[0];
1403	slv->offset = 0;
1404	slv->length = 1;
1405	sdtlck->index++;
1406
1407	/*
1408	 * initialize/update sibling pointers between sp and rp
1409	 */
1410	nextbn = le64_to_cpu(sp->header.next);
1411	rp->header.next = cpu_to_le64(nextbn);
1412	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1413	sp->header.next = cpu_to_le64(rbn);
1414
1415	/*
1416	 * initialize new right page
1417	 */
1418	rp->header.flag = sp->header.flag;
1419
1420	/* compute sorted entry table at start of extent data area */
1421	rp->header.nextindex = 0;
1422	rp->header.stblindex = 1;
1423
1424	n = PSIZE >> L2DTSLOTSIZE;
1425	rp->header.maxslot = n;
1426	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1427
1428	/* init freelist */
1429	fsi = rp->header.stblindex + stblsize;
1430	rp->header.freelist = fsi;
1431	rp->header.freecnt = rp->header.maxslot - fsi;
1432
1433	/*
1434	 *	sequential append at tail: append without split
1435	 *
1436	 * If splitting the last page on a level because of appending
1437	 * a entry to it (skip is maxentry), it's likely that the access is
1438	 * sequential. Adding an empty page on the side of the level is less
1439	 * work and can push the fill factor much higher than normal.
1440	 * If we're wrong it's no big deal, we'll just do the split the right
1441	 * way next time.
1442	 * (It may look like it's equally easy to do a similar hack for
1443	 * reverse sorted data, that is, split the tree left,
1444	 * but it's not. Be my guest.)
1445	 */
1446	if (nextbn == 0 && split->index == sp->header.nextindex) {
1447		/* linelock header + stbl (first slot) of new page */
1448		rlv = & rdtlck->lv[rdtlck->index];
1449		rlv->offset = 0;
1450		rlv->length = 2;
1451		rdtlck->index++;
1452
1453		/*
1454		 * initialize freelist of new right page
1455		 */
1456		f = &rp->slot[fsi];
1457		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1458			f->next = fsi;
1459		f->next = -1;
1460
1461		/* insert entry at the first entry of the new right page */
1462		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1463
1464		goto out;
1465	}
1466
1467	/*
1468	 *	non-sequential insert (at possibly middle page)
1469	 */
1470
1471	/*
1472	 * update prev pointer of previous right sibling page;
1473	 */
1474	if (nextbn != 0) {
1475		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1476		if (rc) {
1477			discard_metapage(rmp);
1478			return rc;
1479		}
1480
1481		BT_MARK_DIRTY(mp, ip);
1482		/*
1483		 * acquire a transaction lock on the next page
1484		 */
1485		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1486		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1487			tlck, ip, mp);
1488		dtlck = (struct dt_lock *) & tlck->lock;
1489
1490		/* linelock header of previous right sibling page */
1491		lv = & dtlck->lv[dtlck->index];
1492		lv->offset = 0;
1493		lv->length = 1;
1494		dtlck->index++;
1495
1496		p->header.prev = cpu_to_le64(rbn);
1497
1498		DT_PUTPAGE(mp);
1499	}
1500
1501	/*
1502	 * split the data between the split and right pages.
1503	 */
1504	skip = split->index;
1505	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1506	left = 0;
1507
1508	/*
1509	 *	compute fill factor for split pages
1510	 *
1511	 * <nxt> traces the next entry to move to rp
1512	 * <off> traces the next entry to stay in sp
1513	 */
1514	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1515	nextindex = sp->header.nextindex;
1516	for (nxt = off = 0; nxt < nextindex; ++off) {
1517		if (off == skip)
1518			/* check for fill factor with new entry size */
1519			n = split->nslot;
1520		else {
1521			si = stbl[nxt];
1522			switch (sp->header.flag & BT_TYPE) {
1523			case BT_LEAF:
1524				ldtentry = (struct ldtentry *) & sp->slot[si];
1525				if (DO_INDEX(ip))
1526					n = NDTLEAF(ldtentry->namlen);
1527				else
1528					n = NDTLEAF_LEGACY(ldtentry->
1529							   namlen);
1530				break;
1531
1532			case BT_INTERNAL:
1533				idtentry = (struct idtentry *) & sp->slot[si];
1534				n = NDTINTERNAL(idtentry->namlen);
1535				break;
1536
1537			default:
1538				break;
1539			}
1540
1541			++nxt;	/* advance to next entry to move in sp */
1542		}
1543
1544		left += n;
1545		if (left >= half)
1546			break;
1547	}
1548
1549	/* <nxt> poins to the 1st entry to move */
1550
1551	/*
1552	 *	move entries to right page
1553	 *
1554	 * dtMoveEntry() initializes rp and reserves entry for insertion
1555	 *
1556	 * split page moved out entries are linelocked;
1557	 * new/right page moved in entries are linelocked;
1558	 */
1559	/* linelock header + stbl of new right page */
1560	rlv = & rdtlck->lv[rdtlck->index];
1561	rlv->offset = 0;
1562	rlv->length = 5;
1563	rdtlck->index++;
1564
1565	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1566
1567	sp->header.nextindex = nxt;
1568
1569	/*
1570	 * finalize freelist of new right page
1571	 */
1572	fsi = rp->header.freelist;
1573	f = &rp->slot[fsi];
1574	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1575		f->next = fsi;
1576	f->next = -1;
1577
1578	/*
1579	 * Update directory index table for entries now in right page
1580	 */
1581	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1582		s64 lblock;
1583
1584		mp = NULL;
1585		stbl = DT_GETSTBL(rp);
1586		for (n = 0; n < rp->header.nextindex; n++) {
1587			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1588			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1589				     rbn, n, &mp, &lblock);
1590		}
1591		if (mp)
1592			release_metapage(mp);
1593	}
1594
1595	/*
1596	 * the skipped index was on the left page,
1597	 */
1598	if (skip <= off) {
1599		/* insert the new entry in the split page */
1600		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1601
1602		/* linelock stbl of split page */
1603		if (sdtlck->index >= sdtlck->maxcnt)
1604			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1605		slv = & sdtlck->lv[sdtlck->index];
1606		n = skip >> L2DTSLOTSIZE;
1607		slv->offset = sp->header.stblindex + n;
1608		slv->length =
1609		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1610		sdtlck->index++;
1611	}
1612	/*
1613	 * the skipped index was on the right page,
1614	 */
1615	else {
1616		/* adjust the skip index to reflect the new position */
1617		skip -= nxt;
1618
1619		/* insert the new entry in the right page */
1620		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1621	}
1622
1623      out:
1624	*rmpp = rmp;
1625	*rpxdp = *pxd;
1626
1627	return rc;
1628}
1629
1630
1631/*
1632 *	dtExtendPage()
1633 *
1634 * function: extend 1st/only directory leaf page
1635 *
1636 * parameter:
1637 *
1638 * return: 0 - success;
1639 *	   errno - failure;
1640 *	return extended page pinned;
1641 */
1642static int dtExtendPage(tid_t tid,
1643	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1644{
1645	struct super_block *sb = ip->i_sb;
1646	int rc;
1647	struct metapage *smp, *pmp, *mp;
1648	dtpage_t *sp, *pp;
1649	struct pxdlist *pxdlist;
1650	pxd_t *pxd, *tpxd;
1651	int xlen, xsize;
1652	int newstblindex, newstblsize;
1653	int oldstblindex, oldstblsize;
1654	int fsi, last;
1655	struct dtslot *f;
1656	struct btframe *parent;
1657	int n;
1658	struct dt_lock *dtlck;
1659	s64 xaddr, txaddr;
1660	struct tlock *tlck;
1661	struct pxd_lock *pxdlock;
1662	struct lv *lv;
1663	uint type;
1664	struct ldtentry *ldtentry;
1665	u8 *stbl;
1666
1667	/* get page to extend */
1668	smp = split->mp;
1669	sp = DT_PAGE(ip, smp);
1670
1671	/* get parent/root page */
1672	parent = BT_POP(btstack);
1673	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1674	if (rc)
1675		return (rc);
1676
1677	/*
1678	 *	extend the extent
1679	 */
1680	pxdlist = split->pxdlist;
1681	pxd = &pxdlist->pxd[pxdlist->npxd];
1682	pxdlist->npxd++;
1683
1684	xaddr = addressPXD(pxd);
1685	tpxd = &sp->header.self;
1686	txaddr = addressPXD(tpxd);
1687	/* in-place extension */
1688	if (xaddr == txaddr) {
1689		type = tlckEXTEND;
1690	}
1691	/* relocation */
1692	else {
1693		type = tlckNEW;
1694
1695		/* save moved extent descriptor for later free */
1696		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1697		pxdlock = (struct pxd_lock *) & tlck->lock;
1698		pxdlock->flag = mlckFREEPXD;
1699		pxdlock->pxd = sp->header.self;
1700		pxdlock->index = 1;
1701
1702		/*
1703		 * Update directory index table to reflect new page address
1704		 */
1705		if (DO_INDEX(ip)) {
1706			s64 lblock;
1707
1708			mp = NULL;
1709			stbl = DT_GETSTBL(sp);
1710			for (n = 0; n < sp->header.nextindex; n++) {
1711				ldtentry =
1712				    (struct ldtentry *) & sp->slot[stbl[n]];
1713				modify_index(tid, ip,
1714					     le32_to_cpu(ldtentry->index),
1715					     xaddr, n, &mp, &lblock);
1716			}
1717			if (mp)
1718				release_metapage(mp);
1719		}
1720	}
1721
1722	/*
1723	 *	extend the page
1724	 */
1725	sp->header.self = *pxd;
1726
1727	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1728
1729	BT_MARK_DIRTY(smp, ip);
1730	/*
1731	 * acquire a transaction lock on the extended/leaf page
1732	 */
1733	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1734	dtlck = (struct dt_lock *) & tlck->lock;
1735	lv = & dtlck->lv[0];
1736
1737	/* update buffer extent descriptor of extended page */
1738	xlen = lengthPXD(pxd);
1739	xsize = xlen << JFS_SBI(sb)->l2bsize;
1740
1741	/*
1742	 * copy old stbl to new stbl at start of extended area
1743	 */
1744	oldstblindex = sp->header.stblindex;
1745	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1746	newstblindex = sp->header.maxslot;
1747	n = xsize >> L2DTSLOTSIZE;
1748	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1749	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1750	       sp->header.nextindex);
1751
1752	/*
1753	 * in-line extension: linelock old area of extended page
1754	 */
1755	if (type == tlckEXTEND) {
1756		/* linelock header */
1757		lv->offset = 0;
1758		lv->length = 1;
1759		dtlck->index++;
1760		lv++;
1761
1762		/* linelock new stbl of extended page */
1763		lv->offset = newstblindex;
1764		lv->length = newstblsize;
1765	}
1766	/*
1767	 * relocation: linelock whole relocated area
1768	 */
1769	else {
1770		lv->offset = 0;
1771		lv->length = sp->header.maxslot + newstblsize;
1772	}
1773
1774	dtlck->index++;
1775
1776	sp->header.maxslot = n;
1777	sp->header.stblindex = newstblindex;
1778	/* sp->header.nextindex remains the same */
1779
1780	/*
1781	 * add old stbl region at head of freelist
1782	 */
1783	fsi = oldstblindex;
1784	f = &sp->slot[fsi];
1785	last = sp->header.freelist;
1786	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1787		f->next = last;
1788		last = fsi;
1789	}
1790	sp->header.freelist = last;
1791	sp->header.freecnt += oldstblsize;
1792
1793	/*
1794	 * append free region of newly extended area at tail of freelist
1795	 */
1796	/* init free region of newly extended area */
1797	fsi = n = newstblindex + newstblsize;
1798	f = &sp->slot[fsi];
1799	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1800		f->next = fsi;
1801	f->next = -1;
1802
1803	/* append new free region at tail of old freelist */
1804	fsi = sp->header.freelist;
1805	if (fsi == -1)
1806		sp->header.freelist = n;
1807	else {
1808		do {
1809			f = &sp->slot[fsi];
1810			fsi = f->next;
1811		} while (fsi != -1);
1812
1813		f->next = n;
1814	}
1815
1816	sp->header.freecnt += sp->header.maxslot - n;
1817
1818	/*
1819	 * insert the new entry
1820	 */
1821	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1822
1823	BT_MARK_DIRTY(pmp, ip);
1824	/*
1825	 * linelock any freeslots residing in old extent
1826	 */
1827	if (type == tlckEXTEND) {
1828		n = sp->header.maxslot >> 2;
1829		if (sp->header.freelist < n)
1830			dtLinelockFreelist(sp, n, &dtlck);
1831	}
1832
1833	/*
1834	 *	update parent entry on the parent/root page
1835	 */
1836	/*
1837	 * acquire a transaction lock on the parent/root page
1838	 */
1839	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1840	dtlck = (struct dt_lock *) & tlck->lock;
1841	lv = & dtlck->lv[dtlck->index];
1842
1843	/* linelock parent entry - 1st slot */
1844	lv->offset = 1;
1845	lv->length = 1;
1846	dtlck->index++;
1847
1848	/* update the parent pxd for page extension */
1849	tpxd = (pxd_t *) & pp->slot[1];
1850	*tpxd = *pxd;
1851
1852	DT_PUTPAGE(pmp);
1853	return 0;
1854}
1855
1856
1857/*
1858 *	dtSplitRoot()
1859 *
1860 * function:
1861 *	split the full root page into
1862 *	original/root/split page and new right page
1863 *	i.e., root remains fixed in tree anchor (inode) and
1864 *	the root is copied to a single new right child page
1865 *	since root page << non-root page, and
1866 *	the split root page contains a single entry for the
1867 *	new right child page.
1868 *
1869 * parameter:
1870 *
1871 * return: 0 - success;
1872 *	   errno - failure;
1873 *	return new page pinned;
1874 */
1875static int dtSplitRoot(tid_t tid,
1876	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1877{
1878	struct super_block *sb = ip->i_sb;
1879	struct metapage *smp;
1880	dtroot_t *sp;
1881	struct metapage *rmp;
1882	dtpage_t *rp;
1883	s64 rbn;
1884	int xlen;
1885	int xsize;
1886	struct dtslot *f;
1887	s8 *stbl;
1888	int fsi, stblsize, n;
1889	struct idtentry *s;
1890	pxd_t *ppxd;
1891	struct pxdlist *pxdlist;
1892	pxd_t *pxd;
1893	struct dt_lock *dtlck;
1894	struct tlock *tlck;
1895	struct lv *lv;
1896	int rc;
1897
1898	/* get split root page */
1899	smp = split->mp;
1900	sp = &JFS_IP(ip)->i_dtroot;
1901
1902	/*
1903	 *	allocate/initialize a single (right) child page
1904	 *
1905	 * N.B. at first split, a one (or two) block to fit new entry
1906	 * is allocated; at subsequent split, a full page is allocated;
1907	 */
1908	pxdlist = split->pxdlist;
1909	pxd = &pxdlist->pxd[pxdlist->npxd];
1910	pxdlist->npxd++;
1911	rbn = addressPXD(pxd);
1912	xlen = lengthPXD(pxd);
1913	xsize = xlen << JFS_SBI(sb)->l2bsize;
1914	rmp = get_metapage(ip, rbn, xsize, 1);
1915	if (!rmp)
1916		return -EIO;
1917
1918	rp = rmp->data;
1919
1920	/* Allocate blocks to quota. */
1921	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1922	if (rc) {
1923		release_metapage(rmp);
1924		return rc;
1925	}
1926
1927	BT_MARK_DIRTY(rmp, ip);
1928	/*
1929	 * acquire a transaction lock on the new right page
1930	 */
1931	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1932	dtlck = (struct dt_lock *) & tlck->lock;
1933
1934	rp->header.flag =
1935	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1936	rp->header.self = *pxd;
1937
1938	/* initialize sibling pointers */
1939	rp->header.next = 0;
1940	rp->header.prev = 0;
1941
1942	/*
1943	 *	move in-line root page into new right page extent
1944	 */
1945	/* linelock header + copied entries + new stbl (1st slot) in new page */
1946	ASSERT(dtlck->index == 0);
1947	lv = & dtlck->lv[0];
1948	lv->offset = 0;
1949	lv->length = 10;	/* 1 + 8 + 1 */
1950	dtlck->index++;
1951
1952	n = xsize >> L2DTSLOTSIZE;
1953	rp->header.maxslot = n;
1954	stblsize = (n + 31) >> L2DTSLOTSIZE;
1955
1956	/* copy old stbl to new stbl at start of extended area */
1957	rp->header.stblindex = DTROOTMAXSLOT;
1958	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1959	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1960	rp->header.nextindex = sp->header.nextindex;
1961
1962	/* copy old data area to start of new data area */
1963	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1964
1965	/*
1966	 * append free region of newly extended area at tail of freelist
1967	 */
1968	/* init free region of newly extended area */
1969	fsi = n = DTROOTMAXSLOT + stblsize;
1970	f = &rp->slot[fsi];
1971	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1972		f->next = fsi;
1973	f->next = -1;
1974
1975	/* append new free region at tail of old freelist */
1976	fsi = sp->header.freelist;
1977	if (fsi == -1)
1978		rp->header.freelist = n;
1979	else {
1980		rp->header.freelist = fsi;
1981
1982		do {
1983			f = &rp->slot[fsi];
1984			fsi = f->next;
1985		} while (fsi != -1);
1986
1987		f->next = n;
1988	}
1989
1990	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1991
1992	/*
1993	 * Update directory index table for entries now in right page
1994	 */
1995	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1996		s64 lblock;
1997		struct metapage *mp = NULL;
1998		struct ldtentry *ldtentry;
1999
2000		stbl = DT_GETSTBL(rp);
2001		for (n = 0; n < rp->header.nextindex; n++) {
2002			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
2003			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
2004				     rbn, n, &mp, &lblock);
2005		}
2006		if (mp)
2007			release_metapage(mp);
2008	}
2009	/*
2010	 * insert the new entry into the new right/child page
2011	 * (skip index in the new right page will not change)
2012	 */
2013	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2014
2015	/*
2016	 *	reset parent/root page
2017	 *
2018	 * set the 1st entry offset to 0, which force the left-most key
2019	 * at any level of the tree to be less than any search key.
2020	 *
2021	 * The btree comparison code guarantees that the left-most key on any
2022	 * level of the tree is never used, so it doesn't need to be filled in.
2023	 */
2024	BT_MARK_DIRTY(smp, ip);
2025	/*
2026	 * acquire a transaction lock on the root page (in-memory inode)
2027	 */
2028	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2029	dtlck = (struct dt_lock *) & tlck->lock;
2030
2031	/* linelock root */
2032	ASSERT(dtlck->index == 0);
2033	lv = & dtlck->lv[0];
2034	lv->offset = 0;
2035	lv->length = DTROOTMAXSLOT;
2036	dtlck->index++;
2037
2038	/* update page header of root */
2039	if (sp->header.flag & BT_LEAF) {
2040		sp->header.flag &= ~BT_LEAF;
2041		sp->header.flag |= BT_INTERNAL;
2042	}
2043
2044	/* init the first entry */
2045	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2046	ppxd = (pxd_t *) s;
2047	*ppxd = *pxd;
2048	s->next = -1;
2049	s->namlen = 0;
2050
2051	stbl = sp->header.stbl;
2052	stbl[0] = DTENTRYSTART;
2053	sp->header.nextindex = 1;
2054
2055	/* init freelist */
2056	fsi = DTENTRYSTART + 1;
2057	f = &sp->slot[fsi];
2058
2059	/* init free region of remaining area */
2060	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2061		f->next = fsi;
2062	f->next = -1;
2063
2064	sp->header.freelist = DTENTRYSTART + 1;
2065	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2066
2067	*rmpp = rmp;
2068
2069	return 0;
2070}
2071
2072
2073/*
2074 *	dtDelete()
2075 *
2076 * function: delete the entry(s) referenced by a key.
2077 *
2078 * parameter:
2079 *
2080 * return:
2081 */
2082int dtDelete(tid_t tid,
2083	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2084{
2085	int rc = 0;
2086	s64 bn;
2087	struct metapage *mp, *imp;
2088	dtpage_t *p;
2089	int index;
2090	struct btstack btstack;
2091	struct dt_lock *dtlck;
2092	struct tlock *tlck;
2093	struct lv *lv;
2094	int i;
2095	struct ldtentry *ldtentry;
2096	u8 *stbl;
2097	u32 table_index, next_index;
2098	struct metapage *nmp;
2099	dtpage_t *np;
2100
2101	/*
2102	 *	search for the entry to delete:
2103	 *
2104	 * dtSearch() returns (leaf page pinned, index at which to delete).
2105	 */
2106	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2107		return rc;
2108
2109	/* retrieve search result */
2110	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2111
2112	/*
2113	 * We need to find put the index of the next entry into the
2114	 * directory index table in order to resume a readdir from this
2115	 * entry.
2116	 */
2117	if (DO_INDEX(ip)) {
2118		stbl = DT_GETSTBL(p);
2119		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2120		table_index = le32_to_cpu(ldtentry->index);
2121		if (index == (p->header.nextindex - 1)) {
2122			/*
2123			 * Last entry in this leaf page
2124			 */
2125			if ((p->header.flag & BT_ROOT)
2126			    || (p->header.next == 0))
2127				next_index = -1;
2128			else {
2129				/* Read next leaf page */
2130				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2131					   nmp, PSIZE, np, rc);
2132				if (rc)
2133					next_index = -1;
2134				else {
2135					stbl = DT_GETSTBL(np);
2136					ldtentry =
2137					    (struct ldtentry *) & np->
2138					    slot[stbl[0]];
2139					next_index =
2140					    le32_to_cpu(ldtentry->index);
2141					DT_PUTPAGE(nmp);
2142				}
2143			}
2144		} else {
2145			ldtentry =
2146			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2147			next_index = le32_to_cpu(ldtentry->index);
2148		}
2149		free_index(tid, ip, table_index, next_index);
2150	}
2151	/*
2152	 * the leaf page becomes empty, delete the page
2153	 */
2154	if (p->header.nextindex == 1) {
2155		/* delete empty page */
2156		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2157	}
2158	/*
2159	 * the leaf page has other entries remaining:
2160	 *
2161	 * delete the entry from the leaf page.
2162	 */
2163	else {
2164		BT_MARK_DIRTY(mp, ip);
2165		/*
2166		 * acquire a transaction lock on the leaf page
2167		 */
2168		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2169		dtlck = (struct dt_lock *) & tlck->lock;
2170
2171		/*
2172		 * Do not assume that dtlck->index will be zero.  During a
2173		 * rename within a directory, this transaction may have
2174		 * modified this page already when adding the new entry.
2175		 */
2176
2177		/* linelock header */
2178		if (dtlck->index >= dtlck->maxcnt)
2179			dtlck = (struct dt_lock *) txLinelock(dtlck);
2180		lv = & dtlck->lv[dtlck->index];
2181		lv->offset = 0;
2182		lv->length = 1;
2183		dtlck->index++;
2184
2185		/* linelock stbl of non-root leaf page */
2186		if (!(p->header.flag & BT_ROOT)) {
2187			if (dtlck->index >= dtlck->maxcnt)
2188				dtlck = (struct dt_lock *) txLinelock(dtlck);
2189			lv = & dtlck->lv[dtlck->index];
2190			i = index >> L2DTSLOTSIZE;
2191			lv->offset = p->header.stblindex + i;
2192			lv->length =
2193			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2194			    i + 1;
2195			dtlck->index++;
2196		}
2197
2198		/* free the leaf entry */
2199		dtDeleteEntry(p, index, &dtlck);
2200
2201		/*
2202		 * Update directory index table for entries moved in stbl
2203		 */
2204		if (DO_INDEX(ip) && index < p->header.nextindex) {
2205			s64 lblock;
2206
2207			imp = NULL;
2208			stbl = DT_GETSTBL(p);
2209			for (i = index; i < p->header.nextindex; i++) {
2210				ldtentry =
2211				    (struct ldtentry *) & p->slot[stbl[i]];
2212				modify_index(tid, ip,
2213					     le32_to_cpu(ldtentry->index),
2214					     bn, i, &imp, &lblock);
2215			}
2216			if (imp)
2217				release_metapage(imp);
2218		}
2219
2220		DT_PUTPAGE(mp);
2221	}
2222
2223	return rc;
2224}
2225
2226
2227/*
2228 *	dtDeleteUp()
2229 *
2230 * function:
2231 *	free empty pages as propagating deletion up the tree
2232 *
2233 * parameter:
2234 *
2235 * return:
2236 */
2237static int dtDeleteUp(tid_t tid, struct inode *ip,
2238	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2239{
2240	int rc = 0;
2241	struct metapage *mp;
2242	dtpage_t *p;
2243	int index, nextindex;
2244	int xlen;
2245	struct btframe *parent;
2246	struct dt_lock *dtlck;
2247	struct tlock *tlck;
2248	struct lv *lv;
2249	struct pxd_lock *pxdlock;
2250	int i;
2251
2252	/*
2253	 *	keep the root leaf page which has become empty
2254	 */
2255	if (BT_IS_ROOT(fmp)) {
2256		/*
2257		 * reset the root
2258		 *
2259		 * dtInitRoot() acquires txlock on the root
2260		 */
2261		dtInitRoot(tid, ip, PARENT(ip));
2262
2263		DT_PUTPAGE(fmp);
2264
2265		return 0;
2266	}
2267
2268	/*
2269	 *	free the non-root leaf page
2270	 */
2271	/*
2272	 * acquire a transaction lock on the page
2273	 *
2274	 * write FREEXTENT|NOREDOPAGE log record
2275	 * N.B. linelock is overlaid as freed extent descriptor, and
2276	 * the buffer page is freed;
2277	 */
2278	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2279	pxdlock = (struct pxd_lock *) & tlck->lock;
2280	pxdlock->flag = mlckFREEPXD;
2281	pxdlock->pxd = fp->header.self;
2282	pxdlock->index = 1;
2283
2284	/* update sibling pointers */
2285	if ((rc = dtRelink(tid, ip, fp))) {
2286		BT_PUTPAGE(fmp);
2287		return rc;
2288	}
2289
2290	xlen = lengthPXD(&fp->header.self);
2291
2292	/* Free quota allocation. */
2293	dquot_free_block(ip, xlen);
2294
2295	/* free/invalidate its buffer page */
2296	discard_metapage(fmp);
2297
2298	/*
2299	 *	propagate page deletion up the directory tree
2300	 *
2301	 * If the delete from the parent page makes it empty,
2302	 * continue all the way up the tree.
2303	 * stop if the root page is reached (which is never deleted) or
2304	 * if the entry deletion does not empty the page.
2305	 */
2306	while ((parent = BT_POP(btstack)) != NULL) {
2307		/* pin the parent page <sp> */
2308		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2309		if (rc)
2310			return rc;
2311
2312		/*
2313		 * free the extent of the child page deleted
2314		 */
2315		index = parent->index;
2316
2317		/*
2318		 * delete the entry for the child page from parent
2319		 */
2320		nextindex = p->header.nextindex;
2321
2322		/*
2323		 * the parent has the single entry being deleted:
2324		 *
2325		 * free the parent page which has become empty.
2326		 */
2327		if (nextindex == 1) {
2328			/*
2329			 * keep the root internal page which has become empty
2330			 */
2331			if (p->header.flag & BT_ROOT) {
2332				/*
2333				 * reset the root
2334				 *
2335				 * dtInitRoot() acquires txlock on the root
2336				 */
2337				dtInitRoot(tid, ip, PARENT(ip));
2338
2339				DT_PUTPAGE(mp);
2340
2341				return 0;
2342			}
2343			/*
2344			 * free the parent page
2345			 */
2346			else {
2347				/*
2348				 * acquire a transaction lock on the page
2349				 *
2350				 * write FREEXTENT|NOREDOPAGE log record
2351				 */
2352				tlck =
2353				    txMaplock(tid, ip,
2354					      tlckDTREE | tlckFREE);
2355				pxdlock = (struct pxd_lock *) & tlck->lock;
2356				pxdlock->flag = mlckFREEPXD;
2357				pxdlock->pxd = p->header.self;
2358				pxdlock->index = 1;
2359
2360				/* update sibling pointers */
2361				if ((rc = dtRelink(tid, ip, p))) {
2362					DT_PUTPAGE(mp);
2363					return rc;
2364				}
2365
2366				xlen = lengthPXD(&p->header.self);
2367
2368				/* Free quota allocation */
2369				dquot_free_block(ip, xlen);
2370
2371				/* free/invalidate its buffer page */
2372				discard_metapage(mp);
2373
2374				/* propagate up */
2375				continue;
2376			}
2377		}
2378
2379		/*
2380		 * the parent has other entries remaining:
2381		 *
2382		 * delete the router entry from the parent page.
2383		 */
2384		BT_MARK_DIRTY(mp, ip);
2385		/*
2386		 * acquire a transaction lock on the page
2387		 *
2388		 * action: router entry deletion
2389		 */
2390		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2391		dtlck = (struct dt_lock *) & tlck->lock;
2392
2393		/* linelock header */
2394		if (dtlck->index >= dtlck->maxcnt)
2395			dtlck = (struct dt_lock *) txLinelock(dtlck);
2396		lv = & dtlck->lv[dtlck->index];
2397		lv->offset = 0;
2398		lv->length = 1;
2399		dtlck->index++;
2400
2401		/* linelock stbl of non-root leaf page */
2402		if (!(p->header.flag & BT_ROOT)) {
2403			if (dtlck->index < dtlck->maxcnt)
2404				lv++;
2405			else {
2406				dtlck = (struct dt_lock *) txLinelock(dtlck);
2407				lv = & dtlck->lv[0];
2408			}
2409			i = index >> L2DTSLOTSIZE;
2410			lv->offset = p->header.stblindex + i;
2411			lv->length =
2412			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2413			    i + 1;
2414			dtlck->index++;
2415		}
2416
2417		/* free the router entry */
2418		dtDeleteEntry(p, index, &dtlck);
2419
2420		/* reset key of new leftmost entry of level (for consistency) */
2421		if (index == 0 &&
2422		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2423			dtTruncateEntry(p, 0, &dtlck);
2424
2425		/* unpin the parent page */
2426		DT_PUTPAGE(mp);
2427
2428		/* exit propagation up */
2429		break;
2430	}
2431
2432	if (!DO_INDEX(ip))
2433		ip->i_size -= PSIZE;
2434
2435	return 0;
2436}
2437
2438#ifdef _NOTYET
2439/*
2440 * NAME:	dtRelocate()
2441 *
2442 * FUNCTION:	relocate dtpage (internal or leaf) of directory;
2443 *		This function is mainly used by defragfs utility.
2444 */
2445int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2446	       s64 nxaddr)
2447{
2448	int rc = 0;
2449	struct metapage *mp, *pmp, *lmp, *rmp;
2450	dtpage_t *p, *pp, *rp = 0, *lp= 0;
2451	s64 bn;
2452	int index;
2453	struct btstack btstack;
2454	pxd_t *pxd;
2455	s64 oxaddr, nextbn, prevbn;
2456	int xlen, xsize;
2457	struct tlock *tlck;
2458	struct dt_lock *dtlck;
2459	struct pxd_lock *pxdlock;
2460	s8 *stbl;
2461	struct lv *lv;
2462
2463	oxaddr = addressPXD(opxd);
2464	xlen = lengthPXD(opxd);
2465
2466	jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
2467		   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
2468		   xlen);
2469
2470	/*
2471	 *	1. get the internal parent dtpage covering
2472	 *	router entry for the tartget page to be relocated;
2473	 */
2474	rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2475	if (rc)
2476		return rc;
2477
2478	/* retrieve search result */
2479	DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2480	jfs_info("dtRelocate: parent router entry validated.");
2481
2482	/*
2483	 *	2. relocate the target dtpage
2484	 */
2485	/* read in the target page from src extent */
2486	DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2487	if (rc) {
2488		/* release the pinned parent page */
2489		DT_PUTPAGE(pmp);
2490		return rc;
2491	}
2492
2493	/*
2494	 * read in sibling pages if any to update sibling pointers;
2495	 */
2496	rmp = NULL;
2497	if (p->header.next) {
2498		nextbn = le64_to_cpu(p->header.next);
2499		DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2500		if (rc) {
2501			DT_PUTPAGE(mp);
2502			DT_PUTPAGE(pmp);
2503			return (rc);
2504		}
2505	}
2506
2507	lmp = NULL;
2508	if (p->header.prev) {
2509		prevbn = le64_to_cpu(p->header.prev);
2510		DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2511		if (rc) {
2512			DT_PUTPAGE(mp);
2513			DT_PUTPAGE(pmp);
2514			if (rmp)
2515				DT_PUTPAGE(rmp);
2516			return (rc);
2517		}
2518	}
2519
2520	/* at this point, all xtpages to be updated are in memory */
2521
2522	/*
2523	 * update sibling pointers of sibling dtpages if any;
2524	 */
2525	if (lmp) {
2526		tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
2527		dtlck = (struct dt_lock *) & tlck->lock;
2528		/* linelock header */
2529		ASSERT(dtlck->index == 0);
2530		lv = & dtlck->lv[0];
2531		lv->offset = 0;
2532		lv->length = 1;
2533		dtlck->index++;
2534
2535		lp->header.next = cpu_to_le64(nxaddr);
2536		DT_PUTPAGE(lmp);
2537	}
2538
2539	if (rmp) {
2540		tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
2541		dtlck = (struct dt_lock *) & tlck->lock;
2542		/* linelock header */
2543		ASSERT(dtlck->index == 0);
2544		lv = & dtlck->lv[0];
2545		lv->offset = 0;
2546		lv->length = 1;
2547		dtlck->index++;
2548
2549		rp->header.prev = cpu_to_le64(nxaddr);
2550		DT_PUTPAGE(rmp);
2551	}
2552
2553	/*
2554	 * update the target dtpage to be relocated
2555	 *
2556	 * write LOG_REDOPAGE of LOG_NEW type for dst page
2557	 * for the whole target page (logredo() will apply
2558	 * after image and update bmap for allocation of the
2559	 * dst extent), and update bmap for allocation of
2560	 * the dst extent;
2561	 */
2562	tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
2563	dtlck = (struct dt_lock *) & tlck->lock;
2564	/* linelock header */
2565	ASSERT(dtlck->index == 0);
2566	lv = & dtlck->lv[0];
2567
2568	/* update the self address in the dtpage header */
2569	pxd = &p->header.self;
2570	PXDaddress(pxd, nxaddr);
2571
2572	/* the dst page is the same as the src page, i.e.,
2573	 * linelock for afterimage of the whole page;
2574	 */
2575	lv->offset = 0;
2576	lv->length = p->header.maxslot;
2577	dtlck->index++;
2578
2579	/* update the buffer extent descriptor of the dtpage */
2580	xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2581
2582	/* unpin the relocated page */
2583	DT_PUTPAGE(mp);
2584	jfs_info("dtRelocate: target dtpage relocated.");
2585
2586	/* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
2587	 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
2588	 * will also force a bmap update ).
2589	 */
2590
2591	/*
2592	 *	3. acquire maplock for the source extent to be freed;
2593	 */
2594	/* for dtpage relocation, write a LOG_NOREDOPAGE record
2595	 * for the source dtpage (logredo() will init NoRedoPage
2596	 * filter and will also update bmap for free of the source
2597	 * dtpage), and upadte bmap for free of the source dtpage;
2598	 */
2599	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2600	pxdlock = (struct pxd_lock *) & tlck->lock;
2601	pxdlock->flag = mlckFREEPXD;
2602	PXDaddress(&pxdlock->pxd, oxaddr);
2603	PXDlength(&pxdlock->pxd, xlen);
2604	pxdlock->index = 1;
2605
2606	/*
2607	 *	4. update the parent router entry for relocation;
2608	 *
2609	 * acquire tlck for the parent entry covering the target dtpage;
2610	 * write LOG_REDOPAGE to apply after image only;
2611	 */
2612	jfs_info("dtRelocate: update parent router entry.");
2613	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
2614	dtlck = (struct dt_lock *) & tlck->lock;
2615	lv = & dtlck->lv[dtlck->index];
2616
2617	/* update the PXD with the new address */
2618	stbl = DT_GETSTBL(pp);
2619	pxd = (pxd_t *) & pp->slot[stbl[index]];
2620	PXDaddress(pxd, nxaddr);
2621	lv->offset = stbl[index];
2622	lv->length = 1;
2623	dtlck->index++;
2624
2625	/* unpin the parent dtpage */
2626	DT_PUTPAGE(pmp);
2627
2628	return rc;
2629}
2630
2631/*
2632 * NAME:	dtSearchNode()
2633 *
2634 * FUNCTION:	Search for an dtpage containing a specified address
2635 *		This function is mainly used by defragfs utility.
2636 *
2637 * NOTE:	Search result on stack, the found page is pinned at exit.
2638 *		The result page must be an internal dtpage.
2639 *		lmxaddr give the address of the left most page of the
2640 *		dtree level, in which the required dtpage resides.
2641 */
2642static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2643			struct btstack * btstack)
2644{
2645	int rc = 0;
2646	s64 bn;
2647	struct metapage *mp;
2648	dtpage_t *p;
2649	int psize = 288;	/* initial in-line directory */
2650	s8 *stbl;
2651	int i;
2652	pxd_t *pxd;
2653	struct btframe *btsp;
2654
2655	BT_CLR(btstack);	/* reset stack */
2656
2657	/*
2658	 *	descend tree to the level with specified leftmost page
2659	 *
2660	 *  by convention, root bn = 0.
2661	 */
2662	for (bn = 0;;) {
2663		/* get/pin the page to search */
2664		DT_GETPAGE(ip, bn, mp, psize, p, rc);
2665		if (rc)
2666			return rc;
2667
2668		/* does the xaddr of leftmost page of the levevl
2669		 * matches levevl search key ?
2670		 */
2671		if (p->header.flag & BT_ROOT) {
2672			if (lmxaddr == 0)
2673				break;
2674		} else if (addressPXD(&p->header.self) == lmxaddr)
2675			break;
2676
2677		/*
2678		 * descend down to leftmost child page
2679		 */
2680		if (p->header.flag & BT_LEAF) {
2681			DT_PUTPAGE(mp);
2682			return -ESTALE;
2683		}
2684
2685		/* get the leftmost entry */
2686		stbl = DT_GETSTBL(p);
2687		pxd = (pxd_t *) & p->slot[stbl[0]];
2688
2689		/* get the child page block address */
2690		bn = addressPXD(pxd);
2691		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
2692		/* unpin the parent page */
2693		DT_PUTPAGE(mp);
2694	}
2695
2696	/*
2697	 *	search each page at the current levevl
2698	 */
2699      loop:
2700	stbl = DT_GETSTBL(p);
2701	for (i = 0; i < p->header.nextindex; i++) {
2702		pxd = (pxd_t *) & p->slot[stbl[i]];
2703
2704		/* found the specified router entry */
2705		if (addressPXD(pxd) == addressPXD(kpxd) &&
2706		    lengthPXD(pxd) == lengthPXD(kpxd)) {
2707			btsp = btstack->top;
2708			btsp->bn = bn;
2709			btsp->index = i;
2710			btsp->mp = mp;
2711
2712			return 0;
2713		}
2714	}
2715
2716	/* get the right sibling page if any */
2717	if (p->header.next)
2718		bn = le64_to_cpu(p->header.next);
2719	else {
2720		DT_PUTPAGE(mp);
2721		return -ESTALE;
2722	}
2723
2724	/* unpin current page */
2725	DT_PUTPAGE(mp);
2726
2727	/* get the right sibling page */
2728	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2729	if (rc)
2730		return rc;
2731
2732	goto loop;
2733}
2734#endif /* _NOTYET */
2735
2736/*
2737 *	dtRelink()
2738 *
2739 * function:
2740 *	link around a freed page.
2741 *
2742 * parameter:
2743 *	fp:	page to be freed
2744 *
2745 * return:
2746 */
2747static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2748{
2749	int rc;
2750	struct metapage *mp;
2751	s64 nextbn, prevbn;
2752	struct tlock *tlck;
2753	struct dt_lock *dtlck;
2754	struct lv *lv;
2755
2756	nextbn = le64_to_cpu(p->header.next);
2757	prevbn = le64_to_cpu(p->header.prev);
2758
2759	/* update prev pointer of the next page */
2760	if (nextbn != 0) {
2761		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2762		if (rc)
2763			return rc;
2764
2765		BT_MARK_DIRTY(mp, ip);
2766		/*
2767		 * acquire a transaction lock on the next page
2768		 *
2769		 * action: update prev pointer;
2770		 */
2771		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2772		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2773			tlck, ip, mp);
2774		dtlck = (struct dt_lock *) & tlck->lock;
2775
2776		/* linelock header */
2777		if (dtlck->index >= dtlck->maxcnt)
2778			dtlck = (struct dt_lock *) txLinelock(dtlck);
2779		lv = & dtlck->lv[dtlck->index];
2780		lv->offset = 0;
2781		lv->length = 1;
2782		dtlck->index++;
2783
2784		p->header.prev = cpu_to_le64(prevbn);
2785		DT_PUTPAGE(mp);
2786	}
2787
2788	/* update next pointer of the previous page */
2789	if (prevbn != 0) {
2790		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2791		if (rc)
2792			return rc;
2793
2794		BT_MARK_DIRTY(mp, ip);
2795		/*
2796		 * acquire a transaction lock on the prev page
2797		 *
2798		 * action: update next pointer;
2799		 */
2800		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2801		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2802			tlck, ip, mp);
2803		dtlck = (struct dt_lock *) & tlck->lock;
2804
2805		/* linelock header */
2806		if (dtlck->index >= dtlck->maxcnt)
2807			dtlck = (struct dt_lock *) txLinelock(dtlck);
2808		lv = & dtlck->lv[dtlck->index];
2809		lv->offset = 0;
2810		lv->length = 1;
2811		dtlck->index++;
2812
2813		p->header.next = cpu_to_le64(nextbn);
2814		DT_PUTPAGE(mp);
2815	}
2816
2817	return 0;
2818}
2819
2820
2821/*
2822 *	dtInitRoot()
2823 *
2824 * initialize directory root (inline in inode)
2825 */
2826void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2827{
2828	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2829	dtroot_t *p;
2830	int fsi;
2831	struct dtslot *f;
2832	struct tlock *tlck;
2833	struct dt_lock *dtlck;
2834	struct lv *lv;
2835	u16 xflag_save;
2836
2837	/*
2838	 * If this was previously an non-empty directory, we need to remove
2839	 * the old directory table.
2840	 */
2841	if (DO_INDEX(ip)) {
2842		if (!jfs_dirtable_inline(ip)) {
2843			struct tblock *tblk = tid_to_tblock(tid);
2844			/*
2845			 * We're playing games with the tid's xflag.  If
2846			 * we're removing a regular file, the file's xtree
2847			 * is committed with COMMIT_PMAP, but we always
2848			 * commit the directories xtree with COMMIT_PWMAP.
2849			 */
2850			xflag_save = tblk->xflag;
2851			tblk->xflag = 0;
2852			/*
2853			 * xtTruncate isn't guaranteed to fully truncate
2854			 * the xtree.  The caller needs to check i_size
2855			 * after committing the transaction to see if
2856			 * additional truncation is needed.  The
2857			 * COMMIT_Stale flag tells caller that we
2858			 * initiated the truncation.
2859			 */
2860			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2861			set_cflag(COMMIT_Stale, ip);
2862
2863			tblk->xflag = xflag_save;
2864		} else
2865			ip->i_size = 1;
2866
2867		jfs_ip->next_index = 2;
2868	} else
2869		ip->i_size = IDATASIZE;
2870
2871	/*
2872	 * acquire a transaction lock on the root
2873	 *
2874	 * action: directory initialization;
2875	 */
2876	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2877		      tlckDTREE | tlckENTRY | tlckBTROOT);
2878	dtlck = (struct dt_lock *) & tlck->lock;
2879
2880	/* linelock root */
2881	ASSERT(dtlck->index == 0);
2882	lv = & dtlck->lv[0];
2883	lv->offset = 0;
2884	lv->length = DTROOTMAXSLOT;
2885	dtlck->index++;
2886
2887	p = &jfs_ip->i_dtroot;
2888
2889	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2890
2891	p->header.nextindex = 0;
2892
2893	/* init freelist */
2894	fsi = 1;
2895	f = &p->slot[fsi];
2896
2897	/* init data area of root */
2898	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2899		f->next = fsi;
2900	f->next = -1;
2901
2902	p->header.freelist = 1;
2903	p->header.freecnt = 8;
2904
2905	/* init '..' entry */
2906	p->header.idotdot = cpu_to_le32(idotdot);
2907
2908	return;
2909}
2910
2911/*
2912 *	add_missing_indices()
2913 *
2914 * function: Fix dtree page in which one or more entries has an invalid index.
2915 *	     fsck.jfs should really fix this, but it currently does not.
2916 *	     Called from jfs_readdir when bad index is detected.
2917 */
2918static void add_missing_indices(struct inode *inode, s64 bn)
2919{
2920	struct ldtentry *d;
2921	struct dt_lock *dtlck;
2922	int i;
2923	uint index;
2924	struct lv *lv;
2925	struct metapage *mp;
2926	dtpage_t *p;
2927	int rc;
2928	s8 *stbl;
2929	tid_t tid;
2930	struct tlock *tlck;
2931
2932	tid = txBegin(inode->i_sb, 0);
2933
2934	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2935
2936	if (rc) {
2937		printk(KERN_ERR "DT_GETPAGE failed!\n");
2938		goto end;
2939	}
2940	BT_MARK_DIRTY(mp, inode);
2941
2942	ASSERT(p->header.flag & BT_LEAF);
2943
2944	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2945	if (BT_IS_ROOT(mp))
2946		tlck->type |= tlckBTROOT;
2947
2948	dtlck = (struct dt_lock *) &tlck->lock;
2949
2950	stbl = DT_GETSTBL(p);
2951	for (i = 0; i < p->header.nextindex; i++) {
2952		d = (struct ldtentry *) &p->slot[stbl[i]];
2953		index = le32_to_cpu(d->index);
2954		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2955			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2956			if (dtlck->index >= dtlck->maxcnt)
2957				dtlck = (struct dt_lock *) txLinelock(dtlck);
2958			lv = &dtlck->lv[dtlck->index];
2959			lv->offset = stbl[i];
2960			lv->length = 1;
2961			dtlck->index++;
2962		}
2963	}
2964
2965	DT_PUTPAGE(mp);
2966	(void) txCommit(tid, 1, &inode, 0);
2967end:
2968	txEnd(tid);
2969}
2970
2971/*
2972 * Buffer to hold directory entry info while traversing a dtree page
2973 * before being fed to the filldir function
2974 */
2975struct jfs_dirent {
2976	loff_t position;
2977	int ino;
2978	u16 name_len;
2979	char name[0];
2980};
2981
2982/*
2983 * function to determine next variable-sized jfs_dirent in buffer
2984 */
2985static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2986{
2987	return (struct jfs_dirent *)
2988		((char *)dirent +
2989		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2990		   sizeof (loff_t) - 1) &
2991		  ~(sizeof (loff_t) - 1)));
2992}
2993
2994/*
2995 *	jfs_readdir()
2996 *
2997 * function: read directory entries sequentially
2998 *	from the specified entry offset
2999 *
3000 * parameter:
3001 *
3002 * return: offset = (pn, index) of start entry
3003 *	of next jfs_readdir()/dtRead()
3004 */
3005int jfs_readdir(struct file *file, struct dir_context *ctx)
3006{
3007	struct inode *ip = file_inode(file);
3008	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
3009	int rc = 0;
3010	loff_t dtpos;	/* legacy OS/2 style position */
3011	struct dtoffset {
3012		s16 pn;
3013		s16 index;
3014		s32 unused;
3015	} *dtoffset = (struct dtoffset *) &dtpos;
3016	s64 bn;
3017	struct metapage *mp;
3018	dtpage_t *p;
3019	int index;
3020	s8 *stbl;
3021	struct btstack btstack;
3022	int i, next;
3023	struct ldtentry *d;
3024	struct dtslot *t;
3025	int d_namleft, len, outlen;
3026	unsigned long dirent_buf;
3027	char *name_ptr;
3028	u32 dir_index;
3029	int do_index = 0;
3030	uint loop_count = 0;
3031	struct jfs_dirent *jfs_dirent;
3032	int jfs_dirents;
3033	int overflow, fix_page, page_fixed = 0;
3034	static int unique_pos = 2;	/* If we can't fix broken index */
3035
3036	if (ctx->pos == DIREND)
3037		return 0;
3038
3039	if (DO_INDEX(ip)) {
3040		/*
3041		 * persistent index is stored in directory entries.
3042		 * Special cases:	 0 = .
3043		 *			 1 = ..
3044		 *			-1 = End of directory
3045		 */
3046		do_index = 1;
3047
3048		dir_index = (u32) ctx->pos;
3049
3050		/*
3051		 * NFSv4 reserves cookies 1 and 2 for . and .. so the value
3052		 * we return to the vfs is one greater than the one we use
3053		 * internally.
3054		 */
3055		if (dir_index)
3056			dir_index--;
3057
3058		if (dir_index > 1) {
3059			struct dir_table_slot dirtab_slot;
3060
3061			if (dtEmpty(ip) ||
3062			    (dir_index >= JFS_IP(ip)->next_index)) {
3063				/* Stale position.  Directory has shrunk */
3064				ctx->pos = DIREND;
3065				return 0;
3066			}
3067		      repeat:
3068			rc = read_index(ip, dir_index, &dirtab_slot);
3069			if (rc) {
3070				ctx->pos = DIREND;
3071				return rc;
3072			}
3073			if (dirtab_slot.flag == DIR_INDEX_FREE) {
3074				if (loop_count++ > JFS_IP(ip)->next_index) {
3075					jfs_err("jfs_readdir detected infinite loop!");
3076					ctx->pos = DIREND;
3077					return 0;
3078				}
3079				dir_index = le32_to_cpu(dirtab_slot.addr2);
3080				if (dir_index == -1) {
3081					ctx->pos = DIREND;
3082					return 0;
3083				}
3084				goto repeat;
3085			}
3086			bn = addressDTS(&dirtab_slot);
3087			index = dirtab_slot.slot;
3088			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3089			if (rc) {
3090				ctx->pos = DIREND;
3091				return 0;
3092			}
3093			if (p->header.flag & BT_INTERNAL) {
3094				jfs_err("jfs_readdir: bad index table");
3095				DT_PUTPAGE(mp);
3096				ctx->pos = DIREND;
3097				return 0;
3098			}
3099		} else {
3100			if (dir_index == 0) {
3101				/*
3102				 * self "."
3103				 */
3104				ctx->pos = 1;
3105				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3106					return 0;
3107			}
3108			/*
3109			 * parent ".."
3110			 */
3111			ctx->pos = 2;
3112			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3113				return 0;
3114
3115			/*
3116			 * Find first entry of left-most leaf
3117			 */
3118			if (dtEmpty(ip)) {
3119				ctx->pos = DIREND;
3120				return 0;
3121			}
3122
3123			if ((rc = dtReadFirst(ip, &btstack)))
3124				return rc;
3125
3126			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3127		}
3128	} else {
3129		/*
3130		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3131		 *
3132		 * pn = 0; index = 1:	First entry "."
3133		 * pn = 0; index = 2:	Second entry ".."
3134		 * pn > 0:		Real entries, pn=1 -> leftmost page
3135		 * pn = index = -1:	No more entries
3136		 */
3137		dtpos = ctx->pos;
3138		if (dtpos < 2) {
3139			/* build "." entry */
3140			ctx->pos = 1;
3141			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3142				return 0;
3143			dtoffset->index = 2;
3144			ctx->pos = dtpos;
3145		}
3146
3147		if (dtoffset->pn == 0) {
3148			if (dtoffset->index == 2) {
3149				/* build ".." entry */
3150				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3151					return 0;
3152			} else {
3153				jfs_err("jfs_readdir called with invalid offset!");
3154			}
3155			dtoffset->pn = 1;
3156			dtoffset->index = 0;
3157			ctx->pos = dtpos;
3158		}
3159
3160		if (dtEmpty(ip)) {
3161			ctx->pos = DIREND;
3162			return 0;
3163		}
3164
3165		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
3166			jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext",
3167				rc);
3168			ctx->pos = DIREND;
3169			return 0;
3170		}
3171		/* get start leaf page and index */
3172		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3173
3174		/* offset beyond directory eof ? */
3175		if (bn < 0) {
3176			ctx->pos = DIREND;
3177			return 0;
3178		}
3179	}
3180
3181	dirent_buf = __get_free_page(GFP_KERNEL);
3182	if (dirent_buf == 0) {
3183		DT_PUTPAGE(mp);
3184		jfs_warn("jfs_readdir: __get_free_page failed!");
3185		ctx->pos = DIREND;
3186		return -ENOMEM;
3187	}
3188
3189	while (1) {
3190		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3191		jfs_dirents = 0;
3192		overflow = fix_page = 0;
3193
3194		stbl = DT_GETSTBL(p);
3195
3196		for (i = index; i < p->header.nextindex; i++) {
 
 
 
 
 
 
 
 
3197			d = (struct ldtentry *) & p->slot[stbl[i]];
3198
3199			if (((long) jfs_dirent + d->namlen + 1) >
3200			    (dirent_buf + PAGE_SIZE)) {
3201				/* DBCS codepages could overrun dirent_buf */
3202				index = i;
3203				overflow = 1;
3204				break;
3205			}
3206
3207			d_namleft = d->namlen;
3208			name_ptr = jfs_dirent->name;
3209			jfs_dirent->ino = le32_to_cpu(d->inumber);
3210
3211			if (do_index) {
3212				len = min(d_namleft, DTLHDRDATALEN);
3213				jfs_dirent->position = le32_to_cpu(d->index);
3214				/*
3215				 * d->index should always be valid, but it
3216				 * isn't.  fsck.jfs doesn't create the
3217				 * directory index for the lost+found
3218				 * directory.  Rather than let it go,
3219				 * we can try to fix it.
3220				 */
3221				if ((jfs_dirent->position < 2) ||
3222				    (jfs_dirent->position >=
3223				     JFS_IP(ip)->next_index)) {
3224					if (!page_fixed && !isReadOnly(ip)) {
3225						fix_page = 1;
3226						/*
3227						 * setting overflow and setting
3228						 * index to i will cause the
3229						 * same page to be processed
3230						 * again starting here
3231						 */
3232						overflow = 1;
3233						index = i;
3234						break;
3235					}
3236					jfs_dirent->position = unique_pos++;
3237				}
3238				/*
3239				 * We add 1 to the index because we may
3240				 * use a value of 2 internally, and NFSv4
3241				 * doesn't like that.
3242				 */
3243				jfs_dirent->position++;
3244			} else {
3245				jfs_dirent->position = dtpos;
3246				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
3247			}
3248
3249			/* copy the name of head/only segment */
3250			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
3251						   codepage);
3252			jfs_dirent->name_len = outlen;
3253
3254			/* copy name in the additional segment(s) */
3255			next = d->next;
3256			while (next >= 0) {
3257				t = (struct dtslot *) & p->slot[next];
3258				name_ptr += outlen;
3259				d_namleft -= len;
3260				/* Sanity Check */
3261				if (d_namleft == 0) {
3262					jfs_error(ip->i_sb,
3263						  "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
3264						  (long)ip->i_ino,
3265						  (long long)bn,
3266						  i);
3267					goto skip_one;
3268				}
3269				len = min(d_namleft, DTSLOTDATALEN);
3270				outlen = jfs_strfromUCS_le(name_ptr, t->name,
3271							   len, codepage);
3272				jfs_dirent->name_len += outlen;
3273
3274				next = t->next;
3275			}
3276
3277			jfs_dirents++;
3278			jfs_dirent = next_jfs_dirent(jfs_dirent);
3279skip_one:
3280			if (!do_index)
3281				dtoffset->index++;
3282		}
3283
3284		if (!overflow) {
3285			/* Point to next leaf page */
3286			if (p->header.flag & BT_ROOT)
3287				bn = 0;
3288			else {
3289				bn = le64_to_cpu(p->header.next);
3290				index = 0;
3291				/* update offset (pn:index) for new page */
3292				if (!do_index) {
3293					dtoffset->pn++;
3294					dtoffset->index = 0;
3295				}
3296			}
3297			page_fixed = 0;
3298		}
3299
3300		/* unpin previous leaf page */
3301		DT_PUTPAGE(mp);
3302
3303		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3304		while (jfs_dirents--) {
3305			ctx->pos = jfs_dirent->position;
3306			if (!dir_emit(ctx, jfs_dirent->name,
3307				    jfs_dirent->name_len,
3308				    jfs_dirent->ino, DT_UNKNOWN))
3309				goto out;
3310			jfs_dirent = next_jfs_dirent(jfs_dirent);
3311		}
3312
3313		if (fix_page) {
3314			add_missing_indices(ip, bn);
3315			page_fixed = 1;
3316		}
3317
3318		if (!overflow && (bn == 0)) {
3319			ctx->pos = DIREND;
3320			break;
3321		}
3322
3323		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3324		if (rc) {
3325			free_page(dirent_buf);
3326			return rc;
3327		}
3328	}
3329
3330      out:
3331	free_page(dirent_buf);
3332
3333	return rc;
3334}
3335
3336
3337/*
3338 *	dtReadFirst()
3339 *
3340 * function: get the leftmost page of the directory
3341 */
3342static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3343{
3344	int rc = 0;
3345	s64 bn;
3346	int psize = 288;	/* initial in-line directory */
3347	struct metapage *mp;
3348	dtpage_t *p;
3349	s8 *stbl;
3350	struct btframe *btsp;
3351	pxd_t *xd;
3352
3353	BT_CLR(btstack);	/* reset stack */
3354
3355	/*
3356	 *	descend leftmost path of the tree
3357	 *
3358	 * by convention, root bn = 0.
3359	 */
3360	for (bn = 0;;) {
3361		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3362		if (rc)
3363			return rc;
3364
3365		/*
3366		 * leftmost leaf page
3367		 */
3368		if (p->header.flag & BT_LEAF) {
3369			/* return leftmost entry */
3370			btsp = btstack->top;
3371			btsp->bn = bn;
3372			btsp->index = 0;
3373			btsp->mp = mp;
3374
3375			return 0;
3376		}
3377
3378		/*
3379		 * descend down to leftmost child page
3380		 */
3381		if (BT_STACK_FULL(btstack)) {
3382			DT_PUTPAGE(mp);
3383			jfs_error(ip->i_sb, "btstack overrun\n");
3384			BT_STACK_DUMP(btstack);
3385			return -EIO;
3386		}
3387		/* push (bn, index) of the parent page/entry */
3388		BT_PUSH(btstack, bn, 0);
3389
3390		/* get the leftmost entry */
3391		stbl = DT_GETSTBL(p);
 
 
 
 
 
 
 
3392		xd = (pxd_t *) & p->slot[stbl[0]];
3393
3394		/* get the child page block address */
3395		bn = addressPXD(xd);
3396		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3397
3398		/* unpin the parent page */
3399		DT_PUTPAGE(mp);
3400	}
3401}
3402
3403
3404/*
3405 *	dtReadNext()
3406 *
3407 * function: get the page of the specified offset (pn:index)
3408 *
3409 * return: if (offset > eof), bn = -1;
3410 *
3411 * note: if index > nextindex of the target leaf page,
3412 * start with 1st entry of next leaf page;
3413 */
3414static int dtReadNext(struct inode *ip, loff_t * offset,
3415		      struct btstack * btstack)
3416{
3417	int rc = 0;
3418	struct dtoffset {
3419		s16 pn;
3420		s16 index;
3421		s32 unused;
3422	} *dtoffset = (struct dtoffset *) offset;
3423	s64 bn;
3424	struct metapage *mp;
3425	dtpage_t *p;
3426	int index;
3427	int pn;
3428	s8 *stbl;
3429	struct btframe *btsp, *parent;
3430	pxd_t *xd;
3431
3432	/*
3433	 * get leftmost leaf page pinned
3434	 */
3435	if ((rc = dtReadFirst(ip, btstack)))
3436		return rc;
3437
3438	/* get leaf page */
3439	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3440
3441	/* get the start offset (pn:index) */
3442	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3443	index = dtoffset->index;
3444
3445	/* start at leftmost page ? */
3446	if (pn == 0) {
3447		/* offset beyond eof ? */
3448		if (index < p->header.nextindex)
3449			goto out;
3450
3451		if (p->header.flag & BT_ROOT) {
3452			bn = -1;
3453			goto out;
3454		}
3455
3456		/* start with 1st entry of next leaf page */
3457		dtoffset->pn++;
3458		dtoffset->index = index = 0;
3459		goto a;
3460	}
3461
3462	/* start at non-leftmost page: scan parent pages for large pn */
3463	if (p->header.flag & BT_ROOT) {
3464		bn = -1;
3465		goto out;
3466	}
3467
3468	/* start after next leaf page ? */
3469	if (pn > 1)
3470		goto b;
3471
3472	/* get leaf page pn = 1 */
3473      a:
3474	bn = le64_to_cpu(p->header.next);
3475
3476	/* unpin leaf page */
3477	DT_PUTPAGE(mp);
3478
3479	/* offset beyond eof ? */
3480	if (bn == 0) {
3481		bn = -1;
3482		goto out;
3483	}
3484
3485	goto c;
3486
3487	/*
3488	 * scan last internal page level to get target leaf page
3489	 */
3490      b:
3491	/* unpin leftmost leaf page */
3492	DT_PUTPAGE(mp);
3493
3494	/* get left most parent page */
3495	btsp = btstack->top;
3496	parent = btsp - 1;
3497	bn = parent->bn;
3498	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3499	if (rc)
3500		return rc;
3501
3502	/* scan parent pages at last internal page level */
3503	while (pn >= p->header.nextindex) {
3504		pn -= p->header.nextindex;
3505
3506		/* get next parent page address */
3507		bn = le64_to_cpu(p->header.next);
3508
3509		/* unpin current parent page */
3510		DT_PUTPAGE(mp);
3511
3512		/* offset beyond eof ? */
3513		if (bn == 0) {
3514			bn = -1;
3515			goto out;
3516		}
3517
3518		/* get next parent page */
3519		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3520		if (rc)
3521			return rc;
3522
3523		/* update parent page stack frame */
3524		parent->bn = bn;
3525	}
3526
3527	/* get leaf page address */
3528	stbl = DT_GETSTBL(p);
3529	xd = (pxd_t *) & p->slot[stbl[pn]];
3530	bn = addressPXD(xd);
3531
3532	/* unpin parent page */
3533	DT_PUTPAGE(mp);
3534
3535	/*
3536	 * get target leaf page
3537	 */
3538      c:
3539	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3540	if (rc)
3541		return rc;
3542
3543	/*
3544	 * leaf page has been completed:
3545	 * start with 1st entry of next leaf page
3546	 */
3547	if (index >= p->header.nextindex) {
3548		bn = le64_to_cpu(p->header.next);
3549
3550		/* unpin leaf page */
3551		DT_PUTPAGE(mp);
3552
3553		/* offset beyond eof ? */
3554		if (bn == 0) {
3555			bn = -1;
3556			goto out;
3557		}
3558
3559		/* get next leaf page */
3560		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3561		if (rc)
3562			return rc;
3563
3564		/* start with 1st entry of next leaf page */
3565		dtoffset->pn++;
3566		dtoffset->index = 0;
3567	}
3568
3569      out:
3570	/* return target leaf page pinned */
3571	btsp = btstack->top;
3572	btsp->bn = bn;
3573	btsp->index = dtoffset->index;
3574	btsp->mp = mp;
3575
3576	return 0;
3577}
3578
3579
3580/*
3581 *	dtCompare()
3582 *
3583 * function: compare search key with an internal entry
3584 *
3585 * return:
3586 *	< 0 if k is < record
3587 *	= 0 if k is = record
3588 *	> 0 if k is > record
3589 */
3590static int dtCompare(struct component_name * key,	/* search key */
3591		     dtpage_t * p,	/* directory page */
3592		     int si)
3593{				/* entry slot index */
3594	wchar_t *kname;
3595	__le16 *name;
3596	int klen, namlen, len, rc;
3597	struct idtentry *ih;
3598	struct dtslot *t;
3599
3600	/*
3601	 * force the left-most key on internal pages, at any level of
3602	 * the tree, to be less than any search key.
3603	 * this obviates having to update the leftmost key on an internal
3604	 * page when the user inserts a new key in the tree smaller than
3605	 * anything that has been stored.
3606	 *
3607	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3608	 * at any internal page at any level of the tree,
3609	 * it descends to child of the entry anyway -
3610	 * ? make the entry as min size dummy entry)
3611	 *
3612	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3613	 * return (1);
3614	 */
3615
3616	kname = key->name;
3617	klen = key->namlen;
3618
3619	ih = (struct idtentry *) & p->slot[si];
3620	si = ih->next;
3621	name = ih->name;
3622	namlen = ih->namlen;
3623	len = min(namlen, DTIHDRDATALEN);
3624
3625	/* compare with head/only segment */
3626	len = min(klen, len);
3627	if ((rc = UniStrncmp_le(kname, name, len)))
3628		return rc;
3629
3630	klen -= len;
3631	namlen -= len;
3632
3633	/* compare with additional segment(s) */
3634	kname += len;
3635	while (klen > 0 && namlen > 0) {
3636		/* compare with next name segment */
3637		t = (struct dtslot *) & p->slot[si];
3638		len = min(namlen, DTSLOTDATALEN);
3639		len = min(klen, len);
3640		name = t->name;
3641		if ((rc = UniStrncmp_le(kname, name, len)))
3642			return rc;
3643
3644		klen -= len;
3645		namlen -= len;
3646		kname += len;
3647		si = t->next;
3648	}
3649
3650	return (klen - namlen);
3651}
3652
3653
3654
3655
3656/*
3657 *	ciCompare()
3658 *
3659 * function: compare search key with an (leaf/internal) entry
3660 *
3661 * return:
3662 *	< 0 if k is < record
3663 *	= 0 if k is = record
3664 *	> 0 if k is > record
3665 */
3666static int ciCompare(struct component_name * key,	/* search key */
3667		     dtpage_t * p,	/* directory page */
3668		     int si,	/* entry slot index */
3669		     int flag)
3670{
3671	wchar_t *kname, x;
3672	__le16 *name;
3673	int klen, namlen, len, rc;
3674	struct ldtentry *lh;
3675	struct idtentry *ih;
3676	struct dtslot *t;
3677	int i;
3678
3679	/*
3680	 * force the left-most key on internal pages, at any level of
3681	 * the tree, to be less than any search key.
3682	 * this obviates having to update the leftmost key on an internal
3683	 * page when the user inserts a new key in the tree smaller than
3684	 * anything that has been stored.
3685	 *
3686	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3687	 * at any internal page at any level of the tree,
3688	 * it descends to child of the entry anyway -
3689	 * ? make the entry as min size dummy entry)
3690	 *
3691	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3692	 * return (1);
3693	 */
3694
3695	kname = key->name;
3696	klen = key->namlen;
3697
3698	/*
3699	 * leaf page entry
3700	 */
3701	if (p->header.flag & BT_LEAF) {
3702		lh = (struct ldtentry *) & p->slot[si];
3703		si = lh->next;
3704		name = lh->name;
3705		namlen = lh->namlen;
3706		if (flag & JFS_DIR_INDEX)
3707			len = min(namlen, DTLHDRDATALEN);
3708		else
3709			len = min(namlen, DTLHDRDATALEN_LEGACY);
3710	}
3711	/*
3712	 * internal page entry
3713	 */
3714	else {
3715		ih = (struct idtentry *) & p->slot[si];
3716		si = ih->next;
3717		name = ih->name;
3718		namlen = ih->namlen;
3719		len = min(namlen, DTIHDRDATALEN);
3720	}
3721
3722	/* compare with head/only segment */
3723	len = min(klen, len);
3724	for (i = 0; i < len; i++, kname++, name++) {
3725		/* only uppercase if case-insensitive support is on */
3726		if ((flag & JFS_OS2) == JFS_OS2)
3727			x = UniToupper(le16_to_cpu(*name));
3728		else
3729			x = le16_to_cpu(*name);
3730		if ((rc = *kname - x))
3731			return rc;
3732	}
3733
3734	klen -= len;
3735	namlen -= len;
3736
3737	/* compare with additional segment(s) */
3738	while (klen > 0 && namlen > 0) {
3739		/* compare with next name segment */
3740		t = (struct dtslot *) & p->slot[si];
3741		len = min(namlen, DTSLOTDATALEN);
3742		len = min(klen, len);
3743		name = t->name;
3744		for (i = 0; i < len; i++, kname++, name++) {
3745			/* only uppercase if case-insensitive support is on */
3746			if ((flag & JFS_OS2) == JFS_OS2)
3747				x = UniToupper(le16_to_cpu(*name));
3748			else
3749				x = le16_to_cpu(*name);
3750
3751			if ((rc = *kname - x))
3752				return rc;
3753		}
3754
3755		klen -= len;
3756		namlen -= len;
3757		si = t->next;
3758	}
3759
3760	return (klen - namlen);
3761}
3762
3763
3764/*
3765 *	ciGetLeafPrefixKey()
3766 *
3767 * function: compute prefix of suffix compression
3768 *	     from two adjacent leaf entries
3769 *	     across page boundary
3770 *
3771 * return: non-zero on error
3772 *
3773 */
3774static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3775			       int ri, struct component_name * key, int flag)
3776{
3777	int klen, namlen;
3778	wchar_t *pl, *pr, *kname;
3779	struct component_name lkey;
3780	struct component_name rkey;
3781
3782	lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3783					GFP_KERNEL);
3784	if (lkey.name == NULL)
3785		return -ENOMEM;
3786
3787	rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3788					GFP_KERNEL);
3789	if (rkey.name == NULL) {
3790		kfree(lkey.name);
3791		return -ENOMEM;
3792	}
3793
3794	/* get left and right key */
3795	dtGetKey(lp, li, &lkey, flag);
3796	lkey.name[lkey.namlen] = 0;
3797
3798	if ((flag & JFS_OS2) == JFS_OS2)
3799		ciToUpper(&lkey);
3800
3801	dtGetKey(rp, ri, &rkey, flag);
3802	rkey.name[rkey.namlen] = 0;
3803
3804
3805	if ((flag & JFS_OS2) == JFS_OS2)
3806		ciToUpper(&rkey);
3807
3808	/* compute prefix */
3809	klen = 0;
3810	kname = key->name;
3811	namlen = min(lkey.namlen, rkey.namlen);
3812	for (pl = lkey.name, pr = rkey.name;
3813	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3814		*kname = *pr;
3815		if (*pl != *pr) {
3816			key->namlen = klen + 1;
3817			goto free_names;
3818		}
3819	}
3820
3821	/* l->namlen <= r->namlen since l <= r */
3822	if (lkey.namlen < rkey.namlen) {
3823		*kname = *pr;
3824		key->namlen = klen + 1;
3825	} else			/* l->namelen == r->namelen */
3826		key->namlen = klen;
3827
3828free_names:
3829	kfree(lkey.name);
3830	kfree(rkey.name);
3831	return 0;
3832}
3833
3834
3835
3836/*
3837 *	dtGetKey()
3838 *
3839 * function: get key of the entry
3840 */
3841static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3842		     struct component_name * key, int flag)
3843{
3844	int si;
3845	s8 *stbl;
3846	struct ldtentry *lh;
3847	struct idtentry *ih;
3848	struct dtslot *t;
3849	int namlen, len;
3850	wchar_t *kname;
3851	__le16 *name;
3852
3853	/* get entry */
3854	stbl = DT_GETSTBL(p);
3855	si = stbl[i];
3856	if (p->header.flag & BT_LEAF) {
3857		lh = (struct ldtentry *) & p->slot[si];
3858		si = lh->next;
3859		namlen = lh->namlen;
3860		name = lh->name;
3861		if (flag & JFS_DIR_INDEX)
3862			len = min(namlen, DTLHDRDATALEN);
3863		else
3864			len = min(namlen, DTLHDRDATALEN_LEGACY);
3865	} else {
3866		ih = (struct idtentry *) & p->slot[si];
3867		si = ih->next;
3868		namlen = ih->namlen;
3869		name = ih->name;
3870		len = min(namlen, DTIHDRDATALEN);
3871	}
3872
3873	key->namlen = namlen;
3874	kname = key->name;
3875
3876	/*
3877	 * move head/only segment
3878	 */
3879	UniStrncpy_from_le(kname, name, len);
3880
3881	/*
3882	 * move additional segment(s)
3883	 */
3884	while (si >= 0) {
3885		/* get next segment */
3886		t = &p->slot[si];
3887		kname += len;
3888		namlen -= len;
3889		len = min(namlen, DTSLOTDATALEN);
3890		UniStrncpy_from_le(kname, t->name, len);
3891
3892		si = t->next;
3893	}
3894}
3895
3896
3897/*
3898 *	dtInsertEntry()
3899 *
3900 * function: allocate free slot(s) and
3901 *	     write a leaf/internal entry
3902 *
3903 * return: entry slot index
3904 */
3905static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3906			  ddata_t * data, struct dt_lock ** dtlock)
3907{
3908	struct dtslot *h, *t;
3909	struct ldtentry *lh = NULL;
3910	struct idtentry *ih = NULL;
3911	int hsi, fsi, klen, len, nextindex;
3912	wchar_t *kname;
3913	__le16 *name;
3914	s8 *stbl;
3915	pxd_t *xd;
3916	struct dt_lock *dtlck = *dtlock;
3917	struct lv *lv;
3918	int xsi, n;
3919	s64 bn = 0;
3920	struct metapage *mp = NULL;
3921
3922	klen = key->namlen;
3923	kname = key->name;
3924
3925	/* allocate a free slot */
3926	hsi = fsi = p->header.freelist;
3927	h = &p->slot[fsi];
3928	p->header.freelist = h->next;
3929	--p->header.freecnt;
3930
3931	/* open new linelock */
3932	if (dtlck->index >= dtlck->maxcnt)
3933		dtlck = (struct dt_lock *) txLinelock(dtlck);
3934
3935	lv = & dtlck->lv[dtlck->index];
3936	lv->offset = hsi;
3937
3938	/* write head/only segment */
3939	if (p->header.flag & BT_LEAF) {
3940		lh = (struct ldtentry *) h;
3941		lh->next = h->next;
3942		lh->inumber = cpu_to_le32(data->leaf.ino);
3943		lh->namlen = klen;
3944		name = lh->name;
3945		if (data->leaf.ip) {
3946			len = min(klen, DTLHDRDATALEN);
3947			if (!(p->header.flag & BT_ROOT))
3948				bn = addressPXD(&p->header.self);
3949			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3950							  data->leaf.ip,
3951							  bn, index));
3952		} else
3953			len = min(klen, DTLHDRDATALEN_LEGACY);
3954	} else {
3955		ih = (struct idtentry *) h;
3956		ih->next = h->next;
3957		xd = (pxd_t *) ih;
3958		*xd = data->xd;
3959		ih->namlen = klen;
3960		name = ih->name;
3961		len = min(klen, DTIHDRDATALEN);
3962	}
3963
3964	UniStrncpy_to_le(name, kname, len);
3965
3966	n = 1;
3967	xsi = hsi;
3968
3969	/* write additional segment(s) */
3970	t = h;
3971	klen -= len;
3972	while (klen) {
3973		/* get free slot */
3974		fsi = p->header.freelist;
3975		t = &p->slot[fsi];
3976		p->header.freelist = t->next;
3977		--p->header.freecnt;
3978
3979		/* is next slot contiguous ? */
3980		if (fsi != xsi + 1) {
3981			/* close current linelock */
3982			lv->length = n;
3983			dtlck->index++;
3984
3985			/* open new linelock */
3986			if (dtlck->index < dtlck->maxcnt)
3987				lv++;
3988			else {
3989				dtlck = (struct dt_lock *) txLinelock(dtlck);
3990				lv = & dtlck->lv[0];
3991			}
3992
3993			lv->offset = fsi;
3994			n = 0;
3995		}
3996
3997		kname += len;
3998		len = min(klen, DTSLOTDATALEN);
3999		UniStrncpy_to_le(t->name, kname, len);
4000
4001		n++;
4002		xsi = fsi;
4003		klen -= len;
4004	}
4005
4006	/* close current linelock */
4007	lv->length = n;
4008	dtlck->index++;
4009
4010	*dtlock = dtlck;
4011
4012	/* terminate last/only segment */
4013	if (h == t) {
4014		/* single segment entry */
4015		if (p->header.flag & BT_LEAF)
4016			lh->next = -1;
4017		else
4018			ih->next = -1;
4019	} else
4020		/* multi-segment entry */
4021		t->next = -1;
4022
4023	/* if insert into middle, shift right succeeding entries in stbl */
4024	stbl = DT_GETSTBL(p);
4025	nextindex = p->header.nextindex;
4026	if (index < nextindex) {
4027		memmove(stbl + index + 1, stbl + index, nextindex - index);
4028
4029		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
4030			s64 lblock;
4031
4032			/*
4033			 * Need to update slot number for entries that moved
4034			 * in the stbl
4035			 */
4036			mp = NULL;
4037			for (n = index + 1; n <= nextindex; n++) {
4038				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
4039				modify_index(data->leaf.tid, data->leaf.ip,
4040					     le32_to_cpu(lh->index), bn, n,
4041					     &mp, &lblock);
4042			}
4043			if (mp)
4044				release_metapage(mp);
4045		}
4046	}
4047
4048	stbl[index] = hsi;
4049
4050	/* advance next available entry index of stbl */
4051	++p->header.nextindex;
4052}
4053
4054
4055/*
4056 *	dtMoveEntry()
4057 *
4058 * function: move entries from split/left page to new/right page
4059 *
4060 *	nextindex of dst page and freelist/freecnt of both pages
4061 *	are updated.
4062 */
4063static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
4064			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
4065			int do_index)
4066{
4067	int ssi, next;		/* src slot index */
4068	int di;			/* dst entry index */
4069	int dsi;		/* dst slot index */
4070	s8 *sstbl, *dstbl;	/* sorted entry table */
4071	int snamlen, len;
4072	struct ldtentry *slh, *dlh = NULL;
4073	struct idtentry *sih, *dih = NULL;
4074	struct dtslot *h, *s, *d;
4075	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
4076	struct lv *slv, *dlv;
4077	int xssi, ns, nd;
4078	int sfsi;
4079
4080	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
4081	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
4082
4083	dsi = dp->header.freelist;	/* first (whole page) free slot */
4084	sfsi = sp->header.freelist;
4085
4086	/* linelock destination entry slot */
4087	dlv = & ddtlck->lv[ddtlck->index];
4088	dlv->offset = dsi;
4089
4090	/* linelock source entry slot */
4091	slv = & sdtlck->lv[sdtlck->index];
4092	slv->offset = sstbl[si];
4093	xssi = slv->offset - 1;
4094
4095	/*
4096	 * move entries
4097	 */
4098	ns = nd = 0;
4099	for (di = 0; si < sp->header.nextindex; si++, di++) {
4100		ssi = sstbl[si];
4101		dstbl[di] = dsi;
4102
4103		/* is next slot contiguous ? */
4104		if (ssi != xssi + 1) {
4105			/* close current linelock */
4106			slv->length = ns;
4107			sdtlck->index++;
4108
4109			/* open new linelock */
4110			if (sdtlck->index < sdtlck->maxcnt)
4111				slv++;
4112			else {
4113				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
4114				slv = & sdtlck->lv[0];
4115			}
4116
4117			slv->offset = ssi;
4118			ns = 0;
4119		}
4120
4121		/*
4122		 * move head/only segment of an entry
4123		 */
4124		/* get dst slot */
4125		h = d = &dp->slot[dsi];
4126
4127		/* get src slot and move */
4128		s = &sp->slot[ssi];
4129		if (sp->header.flag & BT_LEAF) {
4130			/* get source entry */
4131			slh = (struct ldtentry *) s;
4132			dlh = (struct ldtentry *) h;
4133			snamlen = slh->namlen;
4134
4135			if (do_index) {
4136				len = min(snamlen, DTLHDRDATALEN);
4137				dlh->index = slh->index; /* little-endian */
4138			} else
4139				len = min(snamlen, DTLHDRDATALEN_LEGACY);
4140
4141			memcpy(dlh, slh, 6 + len * 2);
4142
4143			next = slh->next;
4144
4145			/* update dst head/only segment next field */
4146			dsi++;
4147			dlh->next = dsi;
4148		} else {
4149			sih = (struct idtentry *) s;
4150			snamlen = sih->namlen;
4151
4152			len = min(snamlen, DTIHDRDATALEN);
4153			dih = (struct idtentry *) h;
4154			memcpy(dih, sih, 10 + len * 2);
4155			next = sih->next;
4156
4157			dsi++;
4158			dih->next = dsi;
4159		}
4160
4161		/* free src head/only segment */
4162		s->next = sfsi;
4163		s->cnt = 1;
4164		sfsi = ssi;
4165
4166		ns++;
4167		nd++;
4168		xssi = ssi;
4169
4170		/*
4171		 * move additional segment(s) of the entry
4172		 */
4173		snamlen -= len;
4174		while ((ssi = next) >= 0) {
4175			/* is next slot contiguous ? */
4176			if (ssi != xssi + 1) {
4177				/* close current linelock */
4178				slv->length = ns;
4179				sdtlck->index++;
4180
4181				/* open new linelock */
4182				if (sdtlck->index < sdtlck->maxcnt)
4183					slv++;
4184				else {
4185					sdtlck =
4186					    (struct dt_lock *)
4187					    txLinelock(sdtlck);
4188					slv = & sdtlck->lv[0];
4189				}
4190
4191				slv->offset = ssi;
4192				ns = 0;
4193			}
4194
4195			/* get next source segment */
4196			s = &sp->slot[ssi];
4197
4198			/* get next destination free slot */
4199			d++;
4200
4201			len = min(snamlen, DTSLOTDATALEN);
4202			UniStrncpy_le(d->name, s->name, len);
4203
4204			ns++;
4205			nd++;
4206			xssi = ssi;
4207
4208			dsi++;
4209			d->next = dsi;
4210
4211			/* free source segment */
4212			next = s->next;
4213			s->next = sfsi;
4214			s->cnt = 1;
4215			sfsi = ssi;
4216
4217			snamlen -= len;
4218		}		/* end while */
4219
4220		/* terminate dst last/only segment */
4221		if (h == d) {
4222			/* single segment entry */
4223			if (dp->header.flag & BT_LEAF)
4224				dlh->next = -1;
4225			else
4226				dih->next = -1;
4227		} else
4228			/* multi-segment entry */
4229			d->next = -1;
4230	}			/* end for */
4231
4232	/* close current linelock */
4233	slv->length = ns;
4234	sdtlck->index++;
4235	*sdtlock = sdtlck;
4236
4237	dlv->length = nd;
4238	ddtlck->index++;
4239	*ddtlock = ddtlck;
4240
4241	/* update source header */
4242	sp->header.freelist = sfsi;
4243	sp->header.freecnt += nd;
4244
4245	/* update destination header */
4246	dp->header.nextindex = di;
4247
4248	dp->header.freelist = dsi;
4249	dp->header.freecnt -= nd;
4250}
4251
4252
4253/*
4254 *	dtDeleteEntry()
4255 *
4256 * function: free a (leaf/internal) entry
4257 *
4258 * log freelist header, stbl, and each segment slot of entry
4259 * (even though last/only segment next field is modified,
4260 * physical image logging requires all segment slots of
4261 * the entry logged to avoid applying previous updates
4262 * to the same slots)
4263 */
4264static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
4265{
4266	int fsi;		/* free entry slot index */
4267	s8 *stbl;
4268	struct dtslot *t;
4269	int si, freecnt;
4270	struct dt_lock *dtlck = *dtlock;
4271	struct lv *lv;
4272	int xsi, n;
4273
4274	/* get free entry slot index */
4275	stbl = DT_GETSTBL(p);
4276	fsi = stbl[fi];
4277
4278	/* open new linelock */
4279	if (dtlck->index >= dtlck->maxcnt)
4280		dtlck = (struct dt_lock *) txLinelock(dtlck);
4281	lv = & dtlck->lv[dtlck->index];
4282
4283	lv->offset = fsi;
4284
4285	/* get the head/only segment */
4286	t = &p->slot[fsi];
4287	if (p->header.flag & BT_LEAF)
4288		si = ((struct ldtentry *) t)->next;
4289	else
4290		si = ((struct idtentry *) t)->next;
4291	t->next = si;
4292	t->cnt = 1;
4293
4294	n = freecnt = 1;
4295	xsi = fsi;
4296
4297	/* find the last/only segment */
4298	while (si >= 0) {
4299		/* is next slot contiguous ? */
4300		if (si != xsi + 1) {
4301			/* close current linelock */
4302			lv->length = n;
4303			dtlck->index++;
4304
4305			/* open new linelock */
4306			if (dtlck->index < dtlck->maxcnt)
4307				lv++;
4308			else {
4309				dtlck = (struct dt_lock *) txLinelock(dtlck);
4310				lv = & dtlck->lv[0];
4311			}
4312
4313			lv->offset = si;
4314			n = 0;
4315		}
4316
4317		n++;
4318		xsi = si;
4319		freecnt++;
4320
4321		t = &p->slot[si];
4322		t->cnt = 1;
4323		si = t->next;
4324	}
4325
4326	/* close current linelock */
4327	lv->length = n;
4328	dtlck->index++;
4329
4330	*dtlock = dtlck;
4331
4332	/* update freelist */
4333	t->next = p->header.freelist;
4334	p->header.freelist = fsi;
4335	p->header.freecnt += freecnt;
4336
4337	/* if delete from middle,
4338	 * shift left the succedding entries in the stbl
4339	 */
4340	si = p->header.nextindex;
4341	if (fi < si - 1)
4342		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4343
4344	p->header.nextindex--;
4345}
4346
4347
4348/*
4349 *	dtTruncateEntry()
4350 *
4351 * function: truncate a (leaf/internal) entry
4352 *
4353 * log freelist header, stbl, and each segment slot of entry
4354 * (even though last/only segment next field is modified,
4355 * physical image logging requires all segment slots of
4356 * the entry logged to avoid applying previous updates
4357 * to the same slots)
4358 */
4359static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4360{
4361	int tsi;		/* truncate entry slot index */
4362	s8 *stbl;
4363	struct dtslot *t;
4364	int si, freecnt;
4365	struct dt_lock *dtlck = *dtlock;
4366	struct lv *lv;
4367	int fsi, xsi, n;
4368
4369	/* get free entry slot index */
4370	stbl = DT_GETSTBL(p);
4371	tsi = stbl[ti];
4372
4373	/* open new linelock */
4374	if (dtlck->index >= dtlck->maxcnt)
4375		dtlck = (struct dt_lock *) txLinelock(dtlck);
4376	lv = & dtlck->lv[dtlck->index];
4377
4378	lv->offset = tsi;
4379
4380	/* get the head/only segment */
4381	t = &p->slot[tsi];
4382	ASSERT(p->header.flag & BT_INTERNAL);
4383	((struct idtentry *) t)->namlen = 0;
4384	si = ((struct idtentry *) t)->next;
4385	((struct idtentry *) t)->next = -1;
4386
4387	n = 1;
4388	freecnt = 0;
4389	fsi = si;
4390	xsi = tsi;
4391
4392	/* find the last/only segment */
4393	while (si >= 0) {
4394		/* is next slot contiguous ? */
4395		if (si != xsi + 1) {
4396			/* close current linelock */
4397			lv->length = n;
4398			dtlck->index++;
4399
4400			/* open new linelock */
4401			if (dtlck->index < dtlck->maxcnt)
4402				lv++;
4403			else {
4404				dtlck = (struct dt_lock *) txLinelock(dtlck);
4405				lv = & dtlck->lv[0];
4406			}
4407
4408			lv->offset = si;
4409			n = 0;
4410		}
4411
4412		n++;
4413		xsi = si;
4414		freecnt++;
4415
4416		t = &p->slot[si];
4417		t->cnt = 1;
4418		si = t->next;
4419	}
4420
4421	/* close current linelock */
4422	lv->length = n;
4423	dtlck->index++;
4424
4425	*dtlock = dtlck;
4426
4427	/* update freelist */
4428	if (freecnt == 0)
4429		return;
4430	t->next = p->header.freelist;
4431	p->header.freelist = fsi;
4432	p->header.freecnt += freecnt;
4433}
4434
4435
4436/*
4437 *	dtLinelockFreelist()
4438 */
4439static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4440			       int m,	/* max slot index */
4441			       struct dt_lock ** dtlock)
4442{
4443	int fsi;		/* free entry slot index */
4444	struct dtslot *t;
4445	int si;
4446	struct dt_lock *dtlck = *dtlock;
4447	struct lv *lv;
4448	int xsi, n;
4449
4450	/* get free entry slot index */
4451	fsi = p->header.freelist;
4452
4453	/* open new linelock */
4454	if (dtlck->index >= dtlck->maxcnt)
4455		dtlck = (struct dt_lock *) txLinelock(dtlck);
4456	lv = & dtlck->lv[dtlck->index];
4457
4458	lv->offset = fsi;
4459
4460	n = 1;
4461	xsi = fsi;
4462
4463	t = &p->slot[fsi];
4464	si = t->next;
4465
4466	/* find the last/only segment */
4467	while (si < m && si >= 0) {
4468		/* is next slot contiguous ? */
4469		if (si != xsi + 1) {
4470			/* close current linelock */
4471			lv->length = n;
4472			dtlck->index++;
4473
4474			/* open new linelock */
4475			if (dtlck->index < dtlck->maxcnt)
4476				lv++;
4477			else {
4478				dtlck = (struct dt_lock *) txLinelock(dtlck);
4479				lv = & dtlck->lv[0];
4480			}
4481
4482			lv->offset = si;
4483			n = 0;
4484		}
4485
4486		n++;
4487		xsi = si;
4488
4489		t = &p->slot[si];
4490		si = t->next;
4491	}
4492
4493	/* close current linelock */
4494	lv->length = n;
4495	dtlck->index++;
4496
4497	*dtlock = dtlck;
4498}
4499
4500
4501/*
4502 * NAME: dtModify
4503 *
4504 * FUNCTION: Modify the inode number part of a directory entry
4505 *
4506 * PARAMETERS:
4507 *	tid	- Transaction id
4508 *	ip	- Inode of parent directory
4509 *	key	- Name of entry to be modified
4510 *	orig_ino	- Original inode number expected in entry
4511 *	new_ino	- New inode number to put into entry
4512 *	flag	- JFS_RENAME
4513 *
4514 * RETURNS:
4515 *	-ESTALE	- If entry found does not match orig_ino passed in
4516 *	-ENOENT	- If no entry can be found to match key
4517 *	0	- If successfully modified entry
4518 */
4519int dtModify(tid_t tid, struct inode *ip,
4520	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4521{
4522	int rc;
4523	s64 bn;
4524	struct metapage *mp;
4525	dtpage_t *p;
4526	int index;
4527	struct btstack btstack;
4528	struct tlock *tlck;
4529	struct dt_lock *dtlck;
4530	struct lv *lv;
4531	s8 *stbl;
4532	int entry_si;		/* entry slot index */
4533	struct ldtentry *entry;
4534
4535	/*
4536	 *	search for the entry to modify:
4537	 *
4538	 * dtSearch() returns (leaf page pinned, index at which to modify).
4539	 */
4540	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4541		return rc;
4542
4543	/* retrieve search result */
4544	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4545
4546	BT_MARK_DIRTY(mp, ip);
4547	/*
4548	 * acquire a transaction lock on the leaf page of named entry
4549	 */
4550	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4551	dtlck = (struct dt_lock *) & tlck->lock;
4552
4553	/* get slot index of the entry */
4554	stbl = DT_GETSTBL(p);
4555	entry_si = stbl[index];
4556
4557	/* linelock entry */
4558	ASSERT(dtlck->index == 0);
4559	lv = & dtlck->lv[0];
4560	lv->offset = entry_si;
4561	lv->length = 1;
4562	dtlck->index++;
4563
4564	/* get the head/only segment */
4565	entry = (struct ldtentry *) & p->slot[entry_si];
4566
4567	/* substitute the inode number of the entry */
4568	entry->inumber = cpu_to_le32(new_ino);
4569
4570	/* unpin the leaf page */
4571	DT_PUTPAGE(mp);
4572
4573	return 0;
4574}
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *   Copyright (C) International Business Machines Corp., 2000-2004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
   4 */
   5
   6/*
   7 *	jfs_dtree.c: directory B+-tree manager
   8 *
   9 * B+-tree with variable length key directory:
  10 *
  11 * each directory page is structured as an array of 32-byte
  12 * directory entry slots initialized as a freelist
  13 * to avoid search/compaction of free space at insertion.
  14 * when an entry is inserted, a number of slots are allocated
  15 * from the freelist as required to store variable length data
  16 * of the entry; when the entry is deleted, slots of the entry
  17 * are returned to freelist.
  18 *
  19 * leaf entry stores full name as key and file serial number
  20 * (aka inode number) as data.
  21 * internal/router entry stores sufffix compressed name
  22 * as key and simple extent descriptor as data.
  23 *
  24 * each directory page maintains a sorted entry index table
  25 * which stores the start slot index of sorted entries
  26 * to allow binary search on the table.
  27 *
  28 * directory starts as a root/leaf page in on-disk inode
  29 * inline data area.
  30 * when it becomes full, it starts a leaf of a external extent
  31 * of length of 1 block. each time the first leaf becomes full,
  32 * it is extended rather than split (its size is doubled),
  33 * until its length becoms 4 KBytes, from then the extent is split
  34 * with new 4 Kbyte extent when it becomes full
  35 * to reduce external fragmentation of small directories.
  36 *
  37 * blah, blah, blah, for linear scan of directory in pieces by
  38 * readdir().
  39 *
  40 *
  41 *	case-insensitive directory file system
  42 *
  43 * names are stored in case-sensitive way in leaf entry.
  44 * but stored, searched and compared in case-insensitive (uppercase) order
  45 * (i.e., both search key and entry key are folded for search/compare):
  46 * (note that case-sensitive order is BROKEN in storage, e.g.,
  47 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
  48 *
  49 *  entries which folds to the same key makes up a equivalent class
  50 *  whose members are stored as contiguous cluster (may cross page boundary)
  51 *  but whose order is arbitrary and acts as duplicate, e.g.,
  52 *  abc, Abc, aBc, abC)
  53 *
  54 * once match is found at leaf, requires scan forward/backward
  55 * either for, in case-insensitive search, duplicate
  56 * or for, in case-sensitive search, for exact match
  57 *
  58 * router entry must be created/stored in case-insensitive way
  59 * in internal entry:
  60 * (right most key of left page and left most key of right page
  61 * are folded, and its suffix compression is propagated as router
  62 * key in parent)
  63 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
  64 * should be made the router key for the split)
  65 *
  66 * case-insensitive search:
  67 *
  68 *	fold search key;
  69 *
  70 *	case-insensitive search of B-tree:
  71 *	for internal entry, router key is already folded;
  72 *	for leaf entry, fold the entry key before comparison.
  73 *
  74 *	if (leaf entry case-insensitive match found)
  75 *		if (next entry satisfies case-insensitive match)
  76 *			return EDUPLICATE;
  77 *		if (prev entry satisfies case-insensitive match)
  78 *			return EDUPLICATE;
  79 *		return match;
  80 *	else
  81 *		return no match;
  82 *
  83 *	serialization:
  84 * target directory inode lock is being held on entry/exit
  85 * of all main directory service routines.
  86 *
  87 *	log based recovery:
  88 */
  89
  90#include <linux/fs.h>
  91#include <linux/quotaops.h>
  92#include <linux/slab.h>
  93#include "jfs_incore.h"
  94#include "jfs_superblock.h"
  95#include "jfs_filsys.h"
  96#include "jfs_metapage.h"
  97#include "jfs_dmap.h"
  98#include "jfs_unicode.h"
  99#include "jfs_debug.h"
 100
 101/* dtree split parameter */
 102struct dtsplit {
 103	struct metapage *mp;
 104	s16 index;
 105	s16 nslot;
 106	struct component_name *key;
 107	ddata_t *data;
 108	struct pxdlist *pxdlist;
 109};
 110
 111#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
 112
 113/* get page buffer for specified block address */
 114#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
 115do {									\
 116	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot);	\
 117	if (!(RC)) {							\
 118		if (((P)->header.nextindex >				\
 119		     (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
 120		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) {	\
 121			BT_PUTPAGE(MP);					\
 122			jfs_error((IP)->i_sb,				\
 123				  "DT_GETPAGE: dtree page corrupt\n");	\
 124			MP = NULL;					\
 125			RC = -EIO;					\
 126		}							\
 127	}								\
 128} while (0)
 129
 130/* for consistency */
 131#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
 132
 133#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
 134	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
 135
 136/*
 137 * forward references
 138 */
 139static int dtSplitUp(tid_t tid, struct inode *ip,
 140		     struct dtsplit * split, struct btstack * btstack);
 141
 142static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 143		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
 144
 145static int dtExtendPage(tid_t tid, struct inode *ip,
 146			struct dtsplit * split, struct btstack * btstack);
 147
 148static int dtSplitRoot(tid_t tid, struct inode *ip,
 149		       struct dtsplit * split, struct metapage ** rmpp);
 150
 151static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
 152		      dtpage_t * fp, struct btstack * btstack);
 153
 154static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
 155
 156static int dtReadFirst(struct inode *ip, struct btstack * btstack);
 157
 158static int dtReadNext(struct inode *ip,
 159		      loff_t * offset, struct btstack * btstack);
 160
 161static int dtCompare(struct component_name * key, dtpage_t * p, int si);
 162
 163static int ciCompare(struct component_name * key, dtpage_t * p, int si,
 164		     int flag);
 165
 166static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
 167		     int flag);
 168
 169static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 170			      int ri, struct component_name * key, int flag);
 171
 172static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
 173			  ddata_t * data, struct dt_lock **);
 174
 175static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
 176			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
 177			int do_index);
 178
 179static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
 180
 181static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
 182
 183static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
 184
 185#define ciToUpper(c)	UniStrupr((c)->name)
 186
 187/*
 188 *	read_index_page()
 189 *
 190 *	Reads a page of a directory's index table.
 191 *	Having metadata mapped into the directory inode's address space
 192 *	presents a multitude of problems.  We avoid this by mapping to
 193 *	the absolute address space outside of the *_metapage routines
 194 */
 195static struct metapage *read_index_page(struct inode *inode, s64 blkno)
 196{
 197	int rc;
 198	s64 xaddr;
 199	int xflag;
 200	s32 xlen;
 201
 202	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 203	if (rc || (xaddr == 0))
 204		return NULL;
 205
 206	return read_metapage(inode, xaddr, PSIZE, 1);
 207}
 208
 209/*
 210 *	get_index_page()
 211 *
 212 *	Same as get_index_page(), but get's a new page without reading
 213 */
 214static struct metapage *get_index_page(struct inode *inode, s64 blkno)
 215{
 216	int rc;
 217	s64 xaddr;
 218	int xflag;
 219	s32 xlen;
 220
 221	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
 222	if (rc || (xaddr == 0))
 223		return NULL;
 224
 225	return get_metapage(inode, xaddr, PSIZE, 1);
 226}
 227
 228/*
 229 *	find_index()
 230 *
 231 *	Returns dtree page containing directory table entry for specified
 232 *	index and pointer to its entry.
 233 *
 234 *	mp must be released by caller.
 235 */
 236static struct dir_table_slot *find_index(struct inode *ip, u32 index,
 237					 struct metapage ** mp, s64 *lblock)
 238{
 239	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 240	s64 blkno;
 241	s64 offset;
 242	int page_offset;
 243	struct dir_table_slot *slot;
 244	static int maxWarnings = 10;
 245
 246	if (index < 2) {
 247		if (maxWarnings) {
 248			jfs_warn("find_entry called with index = %d", index);
 249			maxWarnings--;
 250		}
 251		return NULL;
 252	}
 253
 254	if (index >= jfs_ip->next_index) {
 255		jfs_warn("find_entry called with index >= next_index");
 256		return NULL;
 257	}
 258
 259	if (jfs_dirtable_inline(ip)) {
 260		/*
 261		 * Inline directory table
 262		 */
 263		*mp = NULL;
 264		slot = &jfs_ip->i_dirtable[index - 2];
 265	} else {
 266		offset = (index - 2) * sizeof(struct dir_table_slot);
 267		page_offset = offset & (PSIZE - 1);
 268		blkno = ((offset + 1) >> L2PSIZE) <<
 269		    JFS_SBI(ip->i_sb)->l2nbperpage;
 270
 271		if (*mp && (*lblock != blkno)) {
 272			release_metapage(*mp);
 273			*mp = NULL;
 274		}
 275		if (!(*mp)) {
 276			*lblock = blkno;
 277			*mp = read_index_page(ip, blkno);
 278		}
 279		if (!(*mp)) {
 280			jfs_err("free_index: error reading directory table");
 281			return NULL;
 282		}
 283
 284		slot =
 285		    (struct dir_table_slot *) ((char *) (*mp)->data +
 286					       page_offset);
 287	}
 288	return slot;
 289}
 290
 291static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
 292			      u32 index)
 293{
 294	struct tlock *tlck;
 295	struct linelock *llck;
 296	struct lv *lv;
 297
 298	tlck = txLock(tid, ip, mp, tlckDATA);
 299	llck = (struct linelock *) tlck->lock;
 300
 301	if (llck->index >= llck->maxcnt)
 302		llck = txLinelock(llck);
 303	lv = &llck->lv[llck->index];
 304
 305	/*
 306	 *	Linelock slot size is twice the size of directory table
 307	 *	slot size.  512 entries per page.
 308	 */
 309	lv->offset = ((index - 2) & 511) >> 1;
 310	lv->length = 1;
 311	llck->index++;
 312}
 313
 314/*
 315 *	add_index()
 316 *
 317 *	Adds an entry to the directory index table.  This is used to provide
 318 *	each directory entry with a persistent index in which to resume
 319 *	directory traversals
 320 */
 321static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 322{
 323	struct super_block *sb = ip->i_sb;
 324	struct jfs_sb_info *sbi = JFS_SBI(sb);
 325	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
 326	u64 blkno;
 327	struct dir_table_slot *dirtab_slot;
 328	u32 index;
 329	struct linelock *llck;
 330	struct lv *lv;
 331	struct metapage *mp;
 332	s64 offset;
 333	uint page_offset;
 334	struct tlock *tlck;
 335	s64 xaddr;
 336
 337	ASSERT(DO_INDEX(ip));
 338
 339	if (jfs_ip->next_index < 2) {
 340		jfs_warn("add_index: next_index = %d.  Resetting!",
 341			   jfs_ip->next_index);
 342		jfs_ip->next_index = 2;
 343	}
 344
 345	index = jfs_ip->next_index++;
 346
 347	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
 348		/*
 349		 * i_size reflects size of index table, or 8 bytes per entry.
 350		 */
 351		ip->i_size = (loff_t) (index - 1) << 3;
 352
 353		/*
 354		 * dir table fits inline within inode
 355		 */
 356		dirtab_slot = &jfs_ip->i_dirtable[index-2];
 357		dirtab_slot->flag = DIR_INDEX_VALID;
 358		dirtab_slot->slot = slot;
 359		DTSaddress(dirtab_slot, bn);
 360
 361		set_cflag(COMMIT_Dirtable, ip);
 362
 363		return index;
 364	}
 365	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
 366		struct dir_table_slot temp_table[12];
 367
 368		/*
 369		 * It's time to move the inline table to an external
 370		 * page and begin to build the xtree
 371		 */
 372		if (dquot_alloc_block(ip, sbi->nbperpage))
 373			goto clean_up;
 374		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
 375			dquot_free_block(ip, sbi->nbperpage);
 376			goto clean_up;
 377		}
 378
 379		/*
 380		 * Save the table, we're going to overwrite it with the
 381		 * xtree root
 382		 */
 383		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
 384
 385		/*
 386		 * Initialize empty x-tree
 387		 */
 388		xtInitRoot(tid, ip);
 389
 390		/*
 391		 * Add the first block to the xtree
 392		 */
 393		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
 394			/* This really shouldn't fail */
 395			jfs_warn("add_index: xtInsert failed!");
 396			memcpy(&jfs_ip->i_dirtable, temp_table,
 397			       sizeof (temp_table));
 398			dbFree(ip, xaddr, sbi->nbperpage);
 399			dquot_free_block(ip, sbi->nbperpage);
 400			goto clean_up;
 401		}
 402		ip->i_size = PSIZE;
 403
 404		mp = get_index_page(ip, 0);
 405		if (!mp) {
 406			jfs_err("add_index: get_metapage failed!");
 407			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
 408			memcpy(&jfs_ip->i_dirtable, temp_table,
 409			       sizeof (temp_table));
 410			goto clean_up;
 411		}
 412		tlck = txLock(tid, ip, mp, tlckDATA);
 413		llck = (struct linelock *) & tlck->lock;
 414		ASSERT(llck->index == 0);
 415		lv = &llck->lv[0];
 416
 417		lv->offset = 0;
 418		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
 419		llck->index++;
 420
 421		memcpy(mp->data, temp_table, sizeof(temp_table));
 422
 423		mark_metapage_dirty(mp);
 424		release_metapage(mp);
 425
 426		/*
 427		 * Logging is now directed by xtree tlocks
 428		 */
 429		clear_cflag(COMMIT_Dirtable, ip);
 430	}
 431
 432	offset = (index - 2) * sizeof(struct dir_table_slot);
 433	page_offset = offset & (PSIZE - 1);
 434	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
 435	if (page_offset == 0) {
 436		/*
 437		 * This will be the beginning of a new page
 438		 */
 439		xaddr = 0;
 440		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
 441			jfs_warn("add_index: xtInsert failed!");
 442			goto clean_up;
 443		}
 444		ip->i_size += PSIZE;
 445
 446		if ((mp = get_index_page(ip, blkno)))
 447			memset(mp->data, 0, PSIZE);	/* Just looks better */
 448		else
 449			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
 450	} else
 451		mp = read_index_page(ip, blkno);
 452
 453	if (!mp) {
 454		jfs_err("add_index: get/read_metapage failed!");
 455		goto clean_up;
 456	}
 457
 458	lock_index(tid, ip, mp, index);
 459
 460	dirtab_slot =
 461	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
 462	dirtab_slot->flag = DIR_INDEX_VALID;
 463	dirtab_slot->slot = slot;
 464	DTSaddress(dirtab_slot, bn);
 465
 466	mark_metapage_dirty(mp);
 467	release_metapage(mp);
 468
 469	return index;
 470
 471      clean_up:
 472
 473	jfs_ip->next_index--;
 474
 475	return 0;
 476}
 477
 478/*
 479 *	free_index()
 480 *
 481 *	Marks an entry to the directory index table as free.
 482 */
 483static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
 484{
 485	struct dir_table_slot *dirtab_slot;
 486	s64 lblock;
 487	struct metapage *mp = NULL;
 488
 489	dirtab_slot = find_index(ip, index, &mp, &lblock);
 490
 491	if (!dirtab_slot)
 492		return;
 493
 494	dirtab_slot->flag = DIR_INDEX_FREE;
 495	dirtab_slot->slot = dirtab_slot->addr1 = 0;
 496	dirtab_slot->addr2 = cpu_to_le32(next);
 497
 498	if (mp) {
 499		lock_index(tid, ip, mp, index);
 500		mark_metapage_dirty(mp);
 501		release_metapage(mp);
 502	} else
 503		set_cflag(COMMIT_Dirtable, ip);
 504}
 505
 506/*
 507 *	modify_index()
 508 *
 509 *	Changes an entry in the directory index table
 510 */
 511static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
 512			 int slot, struct metapage ** mp, s64 *lblock)
 513{
 514	struct dir_table_slot *dirtab_slot;
 515
 516	dirtab_slot = find_index(ip, index, mp, lblock);
 517
 518	if (!dirtab_slot)
 519		return;
 520
 521	DTSaddress(dirtab_slot, bn);
 522	dirtab_slot->slot = slot;
 523
 524	if (*mp) {
 525		lock_index(tid, ip, *mp, index);
 526		mark_metapage_dirty(*mp);
 527	} else
 528		set_cflag(COMMIT_Dirtable, ip);
 529}
 530
 531/*
 532 *	read_index()
 533 *
 534 *	reads a directory table slot
 535 */
 536static int read_index(struct inode *ip, u32 index,
 537		     struct dir_table_slot * dirtab_slot)
 538{
 539	s64 lblock;
 540	struct metapage *mp = NULL;
 541	struct dir_table_slot *slot;
 542
 543	slot = find_index(ip, index, &mp, &lblock);
 544	if (!slot) {
 545		return -EIO;
 546	}
 547
 548	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
 549
 550	if (mp)
 551		release_metapage(mp);
 552
 553	return 0;
 554}
 555
 556/*
 557 *	dtSearch()
 558 *
 559 * function:
 560 *	Search for the entry with specified key
 561 *
 562 * parameter:
 563 *
 564 * return: 0 - search result on stack, leaf page pinned;
 565 *	   errno - I/O error
 566 */
 567int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 568	     struct btstack * btstack, int flag)
 569{
 570	int rc = 0;
 571	int cmp = 1;		/* init for empty page */
 572	s64 bn;
 573	struct metapage *mp;
 574	dtpage_t *p;
 575	s8 *stbl;
 576	int base, index, lim;
 577	struct btframe *btsp;
 578	pxd_t *pxd;
 579	int psize = 288;	/* initial in-line directory */
 580	ino_t inumber;
 581	struct component_name ciKey;
 582	struct super_block *sb = ip->i_sb;
 583
 584	ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
 585				   GFP_NOFS);
 586	if (!ciKey.name) {
 587		rc = -ENOMEM;
 588		goto dtSearch_Exit2;
 589	}
 590
 591
 592	/* uppercase search key for c-i directory */
 593	UniStrcpy(ciKey.name, key->name);
 594	ciKey.namlen = key->namlen;
 595
 596	/* only uppercase if case-insensitive support is on */
 597	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
 598		ciToUpper(&ciKey);
 599	}
 600	BT_CLR(btstack);	/* reset stack */
 601
 602	/* init level count for max pages to split */
 603	btstack->nsplit = 1;
 604
 605	/*
 606	 *	search down tree from root:
 607	 *
 608	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
 609	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
 610	 *
 611	 * if entry with search key K is not found
 612	 * internal page search find the entry with largest key Ki
 613	 * less than K which point to the child page to search;
 614	 * leaf page search find the entry with smallest key Kj
 615	 * greater than K so that the returned index is the position of
 616	 * the entry to be shifted right for insertion of new entry.
 617	 * for empty tree, search key is greater than any key of the tree.
 618	 *
 619	 * by convention, root bn = 0.
 620	 */
 621	for (bn = 0;;) {
 622		/* get/pin the page to search */
 623		DT_GETPAGE(ip, bn, mp, psize, p, rc);
 624		if (rc)
 625			goto dtSearch_Exit1;
 626
 627		/* get sorted entry table of the page */
 628		stbl = DT_GETSTBL(p);
 629
 630		/*
 631		 * binary search with search key K on the current page.
 632		 */
 633		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
 634			index = base + (lim >> 1);
 635
 636			if (stbl[index] < 0) {
 637				rc = -EIO;
 638				goto out;
 639			}
 640
 641			if (p->header.flag & BT_LEAF) {
 642				/* uppercase leaf name to compare */
 643				cmp =
 644				    ciCompare(&ciKey, p, stbl[index],
 645					      JFS_SBI(sb)->mntflag);
 646			} else {
 647				/* router key is in uppercase */
 648
 649				cmp = dtCompare(&ciKey, p, stbl[index]);
 650
 651
 652			}
 653			if (cmp == 0) {
 654				/*
 655				 *	search hit
 656				 */
 657				/* search hit - leaf page:
 658				 * return the entry found
 659				 */
 660				if (p->header.flag & BT_LEAF) {
 661					inumber = le32_to_cpu(
 662			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
 663
 664					/*
 665					 * search for JFS_LOOKUP
 666					 */
 667					if (flag == JFS_LOOKUP) {
 668						*data = inumber;
 669						rc = 0;
 670						goto out;
 671					}
 672
 673					/*
 674					 * search for JFS_CREATE
 675					 */
 676					if (flag == JFS_CREATE) {
 677						*data = inumber;
 678						rc = -EEXIST;
 679						goto out;
 680					}
 681
 682					/*
 683					 * search for JFS_REMOVE or JFS_RENAME
 684					 */
 685					if ((flag == JFS_REMOVE ||
 686					     flag == JFS_RENAME) &&
 687					    *data != inumber) {
 688						rc = -ESTALE;
 689						goto out;
 690					}
 691
 692					/*
 693					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
 694					 */
 695					/* save search result */
 696					*data = inumber;
 697					btsp = btstack->top;
 698					btsp->bn = bn;
 699					btsp->index = index;
 700					btsp->mp = mp;
 701
 702					rc = 0;
 703					goto dtSearch_Exit1;
 704				}
 705
 706				/* search hit - internal page:
 707				 * descend/search its child page
 708				 */
 709				goto getChild;
 710			}
 711
 712			if (cmp > 0) {
 713				base = index + 1;
 714				--lim;
 715			}
 716		}
 717
 718		/*
 719		 *	search miss
 720		 *
 721		 * base is the smallest index with key (Kj) greater than
 722		 * search key (K) and may be zero or (maxindex + 1) index.
 723		 */
 724		/*
 725		 * search miss - leaf page
 726		 *
 727		 * return location of entry (base) where new entry with
 728		 * search key K is to be inserted.
 729		 */
 730		if (p->header.flag & BT_LEAF) {
 731			/*
 732			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
 733			 */
 734			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
 735			    flag == JFS_RENAME) {
 736				rc = -ENOENT;
 737				goto out;
 738			}
 739
 740			/*
 741			 * search for JFS_CREATE|JFS_FINDDIR:
 742			 *
 743			 * save search result
 744			 */
 745			*data = 0;
 746			btsp = btstack->top;
 747			btsp->bn = bn;
 748			btsp->index = base;
 749			btsp->mp = mp;
 750
 751			rc = 0;
 752			goto dtSearch_Exit1;
 753		}
 754
 755		/*
 756		 * search miss - internal page
 757		 *
 758		 * if base is non-zero, decrement base by one to get the parent
 759		 * entry of the child page to search.
 760		 */
 761		index = base ? base - 1 : base;
 762
 763		/*
 764		 * go down to child page
 765		 */
 766	      getChild:
 767		/* update max. number of pages to split */
 768		if (BT_STACK_FULL(btstack)) {
 769			/* Something's corrupted, mark filesystem dirty so
 770			 * chkdsk will fix it.
 771			 */
 772			jfs_error(sb, "stack overrun!\n");
 773			BT_STACK_DUMP(btstack);
 774			rc = -EIO;
 775			goto out;
 776		}
 777		btstack->nsplit++;
 778
 779		/* push (bn, index) of the parent page/entry */
 780		BT_PUSH(btstack, bn, index);
 781
 782		/* get the child page block number */
 783		pxd = (pxd_t *) & p->slot[stbl[index]];
 784		bn = addressPXD(pxd);
 785		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
 786
 787		/* unpin the parent page */
 788		DT_PUTPAGE(mp);
 789	}
 790
 791      out:
 792	DT_PUTPAGE(mp);
 793
 794      dtSearch_Exit1:
 795
 796	kfree(ciKey.name);
 797
 798      dtSearch_Exit2:
 799
 800	return rc;
 801}
 802
 803
 804/*
 805 *	dtInsert()
 806 *
 807 * function: insert an entry to directory tree
 808 *
 809 * parameter:
 810 *
 811 * return: 0 - success;
 812 *	   errno - failure;
 813 */
 814int dtInsert(tid_t tid, struct inode *ip,
 815	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
 816{
 817	int rc = 0;
 818	struct metapage *mp;	/* meta-page buffer */
 819	dtpage_t *p;		/* base B+-tree index page */
 820	s64 bn;
 821	int index;
 822	struct dtsplit split;	/* split information */
 823	ddata_t data;
 824	struct dt_lock *dtlck;
 825	int n;
 826	struct tlock *tlck;
 827	struct lv *lv;
 828
 829	/*
 830	 *	retrieve search result
 831	 *
 832	 * dtSearch() returns (leaf page pinned, index at which to insert).
 833	 * n.b. dtSearch() may return index of (maxindex + 1) of
 834	 * the full page.
 835	 */
 836	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
 837	if (p->header.freelist == 0)
 838		return -EINVAL;
 839
 840	/*
 841	 *	insert entry for new key
 842	 */
 843	if (DO_INDEX(ip)) {
 844		if (JFS_IP(ip)->next_index == DIREND) {
 845			DT_PUTPAGE(mp);
 846			return -EMLINK;
 847		}
 848		n = NDTLEAF(name->namlen);
 849		data.leaf.tid = tid;
 850		data.leaf.ip = ip;
 851	} else {
 852		n = NDTLEAF_LEGACY(name->namlen);
 853		data.leaf.ip = NULL;	/* signifies legacy directory format */
 854	}
 855	data.leaf.ino = *fsn;
 856
 857	/*
 858	 *	leaf page does not have enough room for new entry:
 859	 *
 860	 *	extend/split the leaf page;
 861	 *
 862	 * dtSplitUp() will insert the entry and unpin the leaf page.
 863	 */
 864	if (n > p->header.freecnt) {
 865		split.mp = mp;
 866		split.index = index;
 867		split.nslot = n;
 868		split.key = name;
 869		split.data = &data;
 870		rc = dtSplitUp(tid, ip, &split, btstack);
 871		return rc;
 872	}
 873
 874	/*
 875	 *	leaf page does have enough room for new entry:
 876	 *
 877	 *	insert the new data entry into the leaf page;
 878	 */
 879	BT_MARK_DIRTY(mp, ip);
 880	/*
 881	 * acquire a transaction lock on the leaf page
 882	 */
 883	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
 884	dtlck = (struct dt_lock *) & tlck->lock;
 885	ASSERT(dtlck->index == 0);
 886	lv = & dtlck->lv[0];
 887
 888	/* linelock header */
 889	lv->offset = 0;
 890	lv->length = 1;
 891	dtlck->index++;
 892
 893	dtInsertEntry(p, index, name, &data, &dtlck);
 894
 895	/* linelock stbl of non-root leaf page */
 896	if (!(p->header.flag & BT_ROOT)) {
 897		if (dtlck->index >= dtlck->maxcnt)
 898			dtlck = (struct dt_lock *) txLinelock(dtlck);
 899		lv = & dtlck->lv[dtlck->index];
 900		n = index >> L2DTSLOTSIZE;
 901		lv->offset = p->header.stblindex + n;
 902		lv->length =
 903		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
 904		dtlck->index++;
 905	}
 906
 907	/* unpin the leaf page */
 908	DT_PUTPAGE(mp);
 909
 910	return 0;
 911}
 912
 913
 914/*
 915 *	dtSplitUp()
 916 *
 917 * function: propagate insertion bottom up;
 918 *
 919 * parameter:
 920 *
 921 * return: 0 - success;
 922 *	   errno - failure;
 923 *	leaf page unpinned;
 924 */
 925static int dtSplitUp(tid_t tid,
 926	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
 927{
 928	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 929	int rc = 0;
 930	struct metapage *smp;
 931	dtpage_t *sp;		/* split page */
 932	struct metapage *rmp;
 933	dtpage_t *rp;		/* new right page split from sp */
 934	pxd_t rpxd;		/* new right page extent descriptor */
 935	struct metapage *lmp;
 936	dtpage_t *lp;		/* left child page */
 937	int skip;		/* index of entry of insertion */
 938	struct btframe *parent;	/* parent page entry on traverse stack */
 939	s64 xaddr, nxaddr;
 940	int xlen, xsize;
 941	struct pxdlist pxdlist;
 942	pxd_t *pxd;
 943	struct component_name key = { 0, NULL };
 944	ddata_t *data = split->data;
 945	int n;
 946	struct dt_lock *dtlck;
 947	struct tlock *tlck;
 948	struct lv *lv;
 949	int quota_allocation = 0;
 950
 951	/* get split page */
 952	smp = split->mp;
 953	sp = DT_PAGE(ip, smp);
 954
 955	key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS);
 956	if (!key.name) {
 957		DT_PUTPAGE(smp);
 958		rc = -ENOMEM;
 959		goto dtSplitUp_Exit;
 960	}
 961
 962	/*
 963	 *	split leaf page
 964	 *
 965	 * The split routines insert the new entry, and
 966	 * acquire txLock as appropriate.
 967	 */
 968	/*
 969	 *	split root leaf page:
 970	 */
 971	if (sp->header.flag & BT_ROOT) {
 972		/*
 973		 * allocate a single extent child page
 974		 */
 975		xlen = 1;
 976		n = sbi->bsize >> L2DTSLOTSIZE;
 977		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
 978		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
 979		if (n <= split->nslot)
 980			xlen++;
 981		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
 982			DT_PUTPAGE(smp);
 983			goto freeKeyName;
 984		}
 985
 986		pxdlist.maxnpxd = 1;
 987		pxdlist.npxd = 0;
 988		pxd = &pxdlist.pxd[0];
 989		PXDaddress(pxd, xaddr);
 990		PXDlength(pxd, xlen);
 991		split->pxdlist = &pxdlist;
 992		rc = dtSplitRoot(tid, ip, split, &rmp);
 993
 994		if (rc)
 995			dbFree(ip, xaddr, xlen);
 996		else
 997			DT_PUTPAGE(rmp);
 998
 999		DT_PUTPAGE(smp);
1000
1001		if (!DO_INDEX(ip))
1002			ip->i_size = xlen << sbi->l2bsize;
1003
1004		goto freeKeyName;
1005	}
1006
1007	/*
1008	 *	extend first leaf page
1009	 *
1010	 * extend the 1st extent if less than buffer page size
1011	 * (dtExtendPage() reurns leaf page unpinned)
1012	 */
1013	pxd = &sp->header.self;
1014	xlen = lengthPXD(pxd);
1015	xsize = xlen << sbi->l2bsize;
1016	if (xsize < PSIZE) {
1017		xaddr = addressPXD(pxd);
1018		n = xsize >> L2DTSLOTSIZE;
1019		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
1020		if ((n + sp->header.freecnt) <= split->nslot)
1021			n = xlen + (xlen << 1);
1022		else
1023			n = xlen;
1024
1025		/* Allocate blocks to quota. */
1026		rc = dquot_alloc_block(ip, n);
1027		if (rc)
1028			goto extendOut;
1029		quota_allocation += n;
1030
1031		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1032				    (s64) n, &nxaddr)))
1033			goto extendOut;
1034
1035		pxdlist.maxnpxd = 1;
1036		pxdlist.npxd = 0;
1037		pxd = &pxdlist.pxd[0];
1038		PXDaddress(pxd, nxaddr);
1039		PXDlength(pxd, xlen + n);
1040		split->pxdlist = &pxdlist;
1041		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1042			nxaddr = addressPXD(pxd);
1043			if (xaddr != nxaddr) {
1044				/* free relocated extent */
1045				xlen = lengthPXD(pxd);
1046				dbFree(ip, nxaddr, (s64) xlen);
1047			} else {
1048				/* free extended delta */
1049				xlen = lengthPXD(pxd) - n;
1050				xaddr = addressPXD(pxd) + xlen;
1051				dbFree(ip, xaddr, (s64) n);
1052			}
1053		} else if (!DO_INDEX(ip))
1054			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
1055
1056
1057	      extendOut:
1058		DT_PUTPAGE(smp);
1059		goto freeKeyName;
1060	}
1061
1062	/*
1063	 *	split leaf page <sp> into <sp> and a new right page <rp>.
1064	 *
1065	 * return <rp> pinned and its extent descriptor <rpxd>
1066	 */
1067	/*
1068	 * allocate new directory page extent and
1069	 * new index page(s) to cover page split(s)
1070	 *
1071	 * allocation hint: ?
1072	 */
1073	n = btstack->nsplit;
1074	pxdlist.maxnpxd = pxdlist.npxd = 0;
1075	xlen = sbi->nbperpage;
1076	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1077		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1078			PXDaddress(pxd, xaddr);
1079			PXDlength(pxd, xlen);
1080			pxdlist.maxnpxd++;
1081			continue;
1082		}
1083
1084		DT_PUTPAGE(smp);
1085
1086		/* undo allocation */
1087		goto splitOut;
1088	}
1089
1090	split->pxdlist = &pxdlist;
1091	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1092		DT_PUTPAGE(smp);
1093
1094		/* undo allocation */
1095		goto splitOut;
1096	}
1097
1098	if (!DO_INDEX(ip))
1099		ip->i_size += PSIZE;
1100
1101	/*
1102	 * propagate up the router entry for the leaf page just split
1103	 *
1104	 * insert a router entry for the new page into the parent page,
1105	 * propagate the insert/split up the tree by walking back the stack
1106	 * of (bn of parent page, index of child page entry in parent page)
1107	 * that were traversed during the search for the page that split.
1108	 *
1109	 * the propagation of insert/split up the tree stops if the root
1110	 * splits or the page inserted into doesn't have to split to hold
1111	 * the new entry.
1112	 *
1113	 * the parent entry for the split page remains the same, and
1114	 * a new entry is inserted at its right with the first key and
1115	 * block number of the new right page.
1116	 *
1117	 * There are a maximum of 4 pages pinned at any time:
1118	 * two children, left parent and right parent (when the parent splits).
1119	 * keep the child pages pinned while working on the parent.
1120	 * make sure that all pins are released at exit.
1121	 */
1122	while ((parent = BT_POP(btstack)) != NULL) {
1123		/* parent page specified by stack frame <parent> */
1124
1125		/* keep current child pages (<lp>, <rp>) pinned */
1126		lmp = smp;
1127		lp = sp;
1128
1129		/*
1130		 * insert router entry in parent for new right child page <rp>
1131		 */
1132		/* get the parent page <sp> */
1133		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1134		if (rc) {
1135			DT_PUTPAGE(lmp);
1136			DT_PUTPAGE(rmp);
1137			goto splitOut;
1138		}
1139
1140		/*
1141		 * The new key entry goes ONE AFTER the index of parent entry,
1142		 * because the split was to the right.
1143		 */
1144		skip = parent->index + 1;
1145
1146		/*
1147		 * compute the key for the router entry
1148		 *
1149		 * key suffix compression:
1150		 * for internal pages that have leaf pages as children,
1151		 * retain only what's needed to distinguish between
1152		 * the new entry and the entry on the page to its left.
1153		 * If the keys compare equal, retain the entire key.
1154		 *
1155		 * note that compression is performed only at computing
1156		 * router key at the lowest internal level.
1157		 * further compression of the key between pairs of higher
1158		 * level internal pages loses too much information and
1159		 * the search may fail.
1160		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1161		 * results in two adjacent parent entries (a)(xx).
1162		 * if split occurs between these two entries, and
1163		 * if compression is applied, the router key of parent entry
1164		 * of right page (x) will divert search for x into right
1165		 * subtree and miss x in the left subtree.)
1166		 *
1167		 * the entire key must be retained for the next-to-leftmost
1168		 * internal key at any level of the tree, or search may fail
1169		 * (e.g., ?)
1170		 */
1171		switch (rp->header.flag & BT_TYPE) {
1172		case BT_LEAF:
1173			/*
1174			 * compute the length of prefix for suffix compression
1175			 * between last entry of left page and first entry
1176			 * of right page
1177			 */
1178			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1179			    sp->header.prev != 0 || skip > 1) {
1180				/* compute uppercase router prefix key */
1181				rc = ciGetLeafPrefixKey(lp,
1182							lp->header.nextindex-1,
1183							rp, 0, &key,
1184							sbi->mntflag);
1185				if (rc) {
1186					DT_PUTPAGE(lmp);
1187					DT_PUTPAGE(rmp);
1188					DT_PUTPAGE(smp);
1189					goto splitOut;
1190				}
1191			} else {
1192				/* next to leftmost entry of
1193				   lowest internal level */
1194
1195				/* compute uppercase router key */
1196				dtGetKey(rp, 0, &key, sbi->mntflag);
1197				key.name[key.namlen] = 0;
1198
1199				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1200					ciToUpper(&key);
1201			}
1202
1203			n = NDTINTERNAL(key.namlen);
1204			break;
1205
1206		case BT_INTERNAL:
1207			dtGetKey(rp, 0, &key, sbi->mntflag);
1208			n = NDTINTERNAL(key.namlen);
1209			break;
1210
1211		default:
1212			jfs_err("dtSplitUp(): UFO!");
1213			break;
1214		}
1215
1216		/* unpin left child page */
1217		DT_PUTPAGE(lmp);
1218
1219		/*
1220		 * compute the data for the router entry
1221		 */
1222		data->xd = rpxd;	/* child page xd */
1223
1224		/*
1225		 * parent page is full - split the parent page
1226		 */
1227		if (n > sp->header.freecnt) {
1228			/* init for parent page split */
1229			split->mp = smp;
1230			split->index = skip;	/* index at insert */
1231			split->nslot = n;
1232			split->key = &key;
1233			/* split->data = data; */
1234
1235			/* unpin right child page */
1236			DT_PUTPAGE(rmp);
1237
1238			/* The split routines insert the new entry,
1239			 * acquire txLock as appropriate.
1240			 * return <rp> pinned and its block number <rbn>.
1241			 */
1242			rc = (sp->header.flag & BT_ROOT) ?
1243			    dtSplitRoot(tid, ip, split, &rmp) :
1244			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1245			if (rc) {
1246				DT_PUTPAGE(smp);
1247				goto splitOut;
1248			}
1249
1250			/* smp and rmp are pinned */
1251		}
1252		/*
1253		 * parent page is not full - insert router entry in parent page
1254		 */
1255		else {
1256			BT_MARK_DIRTY(smp, ip);
1257			/*
1258			 * acquire a transaction lock on the parent page
1259			 */
1260			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1261			dtlck = (struct dt_lock *) & tlck->lock;
1262			ASSERT(dtlck->index == 0);
1263			lv = & dtlck->lv[0];
1264
1265			/* linelock header */
1266			lv->offset = 0;
1267			lv->length = 1;
1268			dtlck->index++;
1269
1270			/* linelock stbl of non-root parent page */
1271			if (!(sp->header.flag & BT_ROOT)) {
1272				lv++;
1273				n = skip >> L2DTSLOTSIZE;
1274				lv->offset = sp->header.stblindex + n;
1275				lv->length =
1276				    ((sp->header.nextindex -
1277				      1) >> L2DTSLOTSIZE) - n + 1;
1278				dtlck->index++;
1279			}
1280
1281			dtInsertEntry(sp, skip, &key, data, &dtlck);
1282
1283			/* exit propagate up */
1284			break;
1285		}
1286	}
1287
1288	/* unpin current split and its right page */
1289	DT_PUTPAGE(smp);
1290	DT_PUTPAGE(rmp);
1291
1292	/*
1293	 * free remaining extents allocated for split
1294	 */
1295      splitOut:
1296	n = pxdlist.npxd;
1297	pxd = &pxdlist.pxd[n];
1298	for (; n < pxdlist.maxnpxd; n++, pxd++)
1299		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1300
1301      freeKeyName:
1302	kfree(key.name);
1303
1304	/* Rollback quota allocation */
1305	if (rc && quota_allocation)
1306		dquot_free_block(ip, quota_allocation);
1307
1308      dtSplitUp_Exit:
1309
1310	return rc;
1311}
1312
1313
1314/*
1315 *	dtSplitPage()
1316 *
1317 * function: Split a non-root page of a btree.
1318 *
1319 * parameter:
1320 *
1321 * return: 0 - success;
1322 *	   errno - failure;
1323 *	return split and new page pinned;
1324 */
1325static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1326	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1327{
1328	int rc = 0;
1329	struct metapage *smp;
1330	dtpage_t *sp;
1331	struct metapage *rmp;
1332	dtpage_t *rp;		/* new right page allocated */
1333	s64 rbn;		/* new right page block number */
1334	struct metapage *mp;
1335	dtpage_t *p;
1336	s64 nextbn;
1337	struct pxdlist *pxdlist;
1338	pxd_t *pxd;
1339	int skip, nextindex, half, left, nxt, off, si;
1340	struct ldtentry *ldtentry;
1341	struct idtentry *idtentry;
1342	u8 *stbl;
1343	struct dtslot *f;
1344	int fsi, stblsize;
1345	int n;
1346	struct dt_lock *sdtlck, *rdtlck;
1347	struct tlock *tlck;
1348	struct dt_lock *dtlck;
1349	struct lv *slv, *rlv, *lv;
1350
1351	/* get split page */
1352	smp = split->mp;
1353	sp = DT_PAGE(ip, smp);
1354
1355	/*
1356	 * allocate the new right page for the split
1357	 */
1358	pxdlist = split->pxdlist;
1359	pxd = &pxdlist->pxd[pxdlist->npxd];
1360	pxdlist->npxd++;
1361	rbn = addressPXD(pxd);
1362	rmp = get_metapage(ip, rbn, PSIZE, 1);
1363	if (rmp == NULL)
1364		return -EIO;
1365
1366	/* Allocate blocks to quota. */
1367	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1368	if (rc) {
1369		release_metapage(rmp);
1370		return rc;
1371	}
1372
1373	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1374
1375	BT_MARK_DIRTY(rmp, ip);
1376	/*
1377	 * acquire a transaction lock on the new right page
1378	 */
1379	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1380	rdtlck = (struct dt_lock *) & tlck->lock;
1381
1382	rp = (dtpage_t *) rmp->data;
1383	*rpp = rp;
1384	rp->header.self = *pxd;
1385
1386	BT_MARK_DIRTY(smp, ip);
1387	/*
1388	 * acquire a transaction lock on the split page
1389	 *
1390	 * action:
1391	 */
1392	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1393	sdtlck = (struct dt_lock *) & tlck->lock;
1394
1395	/* linelock header of split page */
1396	ASSERT(sdtlck->index == 0);
1397	slv = & sdtlck->lv[0];
1398	slv->offset = 0;
1399	slv->length = 1;
1400	sdtlck->index++;
1401
1402	/*
1403	 * initialize/update sibling pointers between sp and rp
1404	 */
1405	nextbn = le64_to_cpu(sp->header.next);
1406	rp->header.next = cpu_to_le64(nextbn);
1407	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1408	sp->header.next = cpu_to_le64(rbn);
1409
1410	/*
1411	 * initialize new right page
1412	 */
1413	rp->header.flag = sp->header.flag;
1414
1415	/* compute sorted entry table at start of extent data area */
1416	rp->header.nextindex = 0;
1417	rp->header.stblindex = 1;
1418
1419	n = PSIZE >> L2DTSLOTSIZE;
1420	rp->header.maxslot = n;
1421	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1422
1423	/* init freelist */
1424	fsi = rp->header.stblindex + stblsize;
1425	rp->header.freelist = fsi;
1426	rp->header.freecnt = rp->header.maxslot - fsi;
1427
1428	/*
1429	 *	sequential append at tail: append without split
1430	 *
1431	 * If splitting the last page on a level because of appending
1432	 * a entry to it (skip is maxentry), it's likely that the access is
1433	 * sequential. Adding an empty page on the side of the level is less
1434	 * work and can push the fill factor much higher than normal.
1435	 * If we're wrong it's no big deal, we'll just do the split the right
1436	 * way next time.
1437	 * (It may look like it's equally easy to do a similar hack for
1438	 * reverse sorted data, that is, split the tree left,
1439	 * but it's not. Be my guest.)
1440	 */
1441	if (nextbn == 0 && split->index == sp->header.nextindex) {
1442		/* linelock header + stbl (first slot) of new page */
1443		rlv = & rdtlck->lv[rdtlck->index];
1444		rlv->offset = 0;
1445		rlv->length = 2;
1446		rdtlck->index++;
1447
1448		/*
1449		 * initialize freelist of new right page
1450		 */
1451		f = &rp->slot[fsi];
1452		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1453			f->next = fsi;
1454		f->next = -1;
1455
1456		/* insert entry at the first entry of the new right page */
1457		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1458
1459		goto out;
1460	}
1461
1462	/*
1463	 *	non-sequential insert (at possibly middle page)
1464	 */
1465
1466	/*
1467	 * update prev pointer of previous right sibling page;
1468	 */
1469	if (nextbn != 0) {
1470		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1471		if (rc) {
1472			discard_metapage(rmp);
1473			return rc;
1474		}
1475
1476		BT_MARK_DIRTY(mp, ip);
1477		/*
1478		 * acquire a transaction lock on the next page
1479		 */
1480		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1481		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1482			tlck, ip, mp);
1483		dtlck = (struct dt_lock *) & tlck->lock;
1484
1485		/* linelock header of previous right sibling page */
1486		lv = & dtlck->lv[dtlck->index];
1487		lv->offset = 0;
1488		lv->length = 1;
1489		dtlck->index++;
1490
1491		p->header.prev = cpu_to_le64(rbn);
1492
1493		DT_PUTPAGE(mp);
1494	}
1495
1496	/*
1497	 * split the data between the split and right pages.
1498	 */
1499	skip = split->index;
1500	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1501	left = 0;
1502
1503	/*
1504	 *	compute fill factor for split pages
1505	 *
1506	 * <nxt> traces the next entry to move to rp
1507	 * <off> traces the next entry to stay in sp
1508	 */
1509	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1510	nextindex = sp->header.nextindex;
1511	for (nxt = off = 0; nxt < nextindex; ++off) {
1512		if (off == skip)
1513			/* check for fill factor with new entry size */
1514			n = split->nslot;
1515		else {
1516			si = stbl[nxt];
1517			switch (sp->header.flag & BT_TYPE) {
1518			case BT_LEAF:
1519				ldtentry = (struct ldtentry *) & sp->slot[si];
1520				if (DO_INDEX(ip))
1521					n = NDTLEAF(ldtentry->namlen);
1522				else
1523					n = NDTLEAF_LEGACY(ldtentry->
1524							   namlen);
1525				break;
1526
1527			case BT_INTERNAL:
1528				idtentry = (struct idtentry *) & sp->slot[si];
1529				n = NDTINTERNAL(idtentry->namlen);
1530				break;
1531
1532			default:
1533				break;
1534			}
1535
1536			++nxt;	/* advance to next entry to move in sp */
1537		}
1538
1539		left += n;
1540		if (left >= half)
1541			break;
1542	}
1543
1544	/* <nxt> poins to the 1st entry to move */
1545
1546	/*
1547	 *	move entries to right page
1548	 *
1549	 * dtMoveEntry() initializes rp and reserves entry for insertion
1550	 *
1551	 * split page moved out entries are linelocked;
1552	 * new/right page moved in entries are linelocked;
1553	 */
1554	/* linelock header + stbl of new right page */
1555	rlv = & rdtlck->lv[rdtlck->index];
1556	rlv->offset = 0;
1557	rlv->length = 5;
1558	rdtlck->index++;
1559
1560	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1561
1562	sp->header.nextindex = nxt;
1563
1564	/*
1565	 * finalize freelist of new right page
1566	 */
1567	fsi = rp->header.freelist;
1568	f = &rp->slot[fsi];
1569	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1570		f->next = fsi;
1571	f->next = -1;
1572
1573	/*
1574	 * Update directory index table for entries now in right page
1575	 */
1576	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1577		s64 lblock;
1578
1579		mp = NULL;
1580		stbl = DT_GETSTBL(rp);
1581		for (n = 0; n < rp->header.nextindex; n++) {
1582			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1583			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1584				     rbn, n, &mp, &lblock);
1585		}
1586		if (mp)
1587			release_metapage(mp);
1588	}
1589
1590	/*
1591	 * the skipped index was on the left page,
1592	 */
1593	if (skip <= off) {
1594		/* insert the new entry in the split page */
1595		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1596
1597		/* linelock stbl of split page */
1598		if (sdtlck->index >= sdtlck->maxcnt)
1599			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1600		slv = & sdtlck->lv[sdtlck->index];
1601		n = skip >> L2DTSLOTSIZE;
1602		slv->offset = sp->header.stblindex + n;
1603		slv->length =
1604		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1605		sdtlck->index++;
1606	}
1607	/*
1608	 * the skipped index was on the right page,
1609	 */
1610	else {
1611		/* adjust the skip index to reflect the new position */
1612		skip -= nxt;
1613
1614		/* insert the new entry in the right page */
1615		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1616	}
1617
1618      out:
1619	*rmpp = rmp;
1620	*rpxdp = *pxd;
1621
1622	return rc;
1623}
1624
1625
1626/*
1627 *	dtExtendPage()
1628 *
1629 * function: extend 1st/only directory leaf page
1630 *
1631 * parameter:
1632 *
1633 * return: 0 - success;
1634 *	   errno - failure;
1635 *	return extended page pinned;
1636 */
1637static int dtExtendPage(tid_t tid,
1638	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1639{
1640	struct super_block *sb = ip->i_sb;
1641	int rc;
1642	struct metapage *smp, *pmp, *mp;
1643	dtpage_t *sp, *pp;
1644	struct pxdlist *pxdlist;
1645	pxd_t *pxd, *tpxd;
1646	int xlen, xsize;
1647	int newstblindex, newstblsize;
1648	int oldstblindex, oldstblsize;
1649	int fsi, last;
1650	struct dtslot *f;
1651	struct btframe *parent;
1652	int n;
1653	struct dt_lock *dtlck;
1654	s64 xaddr, txaddr;
1655	struct tlock *tlck;
1656	struct pxd_lock *pxdlock;
1657	struct lv *lv;
1658	uint type;
1659	struct ldtentry *ldtentry;
1660	u8 *stbl;
1661
1662	/* get page to extend */
1663	smp = split->mp;
1664	sp = DT_PAGE(ip, smp);
1665
1666	/* get parent/root page */
1667	parent = BT_POP(btstack);
1668	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1669	if (rc)
1670		return (rc);
1671
1672	/*
1673	 *	extend the extent
1674	 */
1675	pxdlist = split->pxdlist;
1676	pxd = &pxdlist->pxd[pxdlist->npxd];
1677	pxdlist->npxd++;
1678
1679	xaddr = addressPXD(pxd);
1680	tpxd = &sp->header.self;
1681	txaddr = addressPXD(tpxd);
1682	/* in-place extension */
1683	if (xaddr == txaddr) {
1684		type = tlckEXTEND;
1685	}
1686	/* relocation */
1687	else {
1688		type = tlckNEW;
1689
1690		/* save moved extent descriptor for later free */
1691		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1692		pxdlock = (struct pxd_lock *) & tlck->lock;
1693		pxdlock->flag = mlckFREEPXD;
1694		pxdlock->pxd = sp->header.self;
1695		pxdlock->index = 1;
1696
1697		/*
1698		 * Update directory index table to reflect new page address
1699		 */
1700		if (DO_INDEX(ip)) {
1701			s64 lblock;
1702
1703			mp = NULL;
1704			stbl = DT_GETSTBL(sp);
1705			for (n = 0; n < sp->header.nextindex; n++) {
1706				ldtentry =
1707				    (struct ldtentry *) & sp->slot[stbl[n]];
1708				modify_index(tid, ip,
1709					     le32_to_cpu(ldtentry->index),
1710					     xaddr, n, &mp, &lblock);
1711			}
1712			if (mp)
1713				release_metapage(mp);
1714		}
1715	}
1716
1717	/*
1718	 *	extend the page
1719	 */
1720	sp->header.self = *pxd;
1721
1722	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1723
1724	BT_MARK_DIRTY(smp, ip);
1725	/*
1726	 * acquire a transaction lock on the extended/leaf page
1727	 */
1728	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1729	dtlck = (struct dt_lock *) & tlck->lock;
1730	lv = & dtlck->lv[0];
1731
1732	/* update buffer extent descriptor of extended page */
1733	xlen = lengthPXD(pxd);
1734	xsize = xlen << JFS_SBI(sb)->l2bsize;
1735
1736	/*
1737	 * copy old stbl to new stbl at start of extended area
1738	 */
1739	oldstblindex = sp->header.stblindex;
1740	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1741	newstblindex = sp->header.maxslot;
1742	n = xsize >> L2DTSLOTSIZE;
1743	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1744	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1745	       sp->header.nextindex);
1746
1747	/*
1748	 * in-line extension: linelock old area of extended page
1749	 */
1750	if (type == tlckEXTEND) {
1751		/* linelock header */
1752		lv->offset = 0;
1753		lv->length = 1;
1754		dtlck->index++;
1755		lv++;
1756
1757		/* linelock new stbl of extended page */
1758		lv->offset = newstblindex;
1759		lv->length = newstblsize;
1760	}
1761	/*
1762	 * relocation: linelock whole relocated area
1763	 */
1764	else {
1765		lv->offset = 0;
1766		lv->length = sp->header.maxslot + newstblsize;
1767	}
1768
1769	dtlck->index++;
1770
1771	sp->header.maxslot = n;
1772	sp->header.stblindex = newstblindex;
1773	/* sp->header.nextindex remains the same */
1774
1775	/*
1776	 * add old stbl region at head of freelist
1777	 */
1778	fsi = oldstblindex;
1779	f = &sp->slot[fsi];
1780	last = sp->header.freelist;
1781	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1782		f->next = last;
1783		last = fsi;
1784	}
1785	sp->header.freelist = last;
1786	sp->header.freecnt += oldstblsize;
1787
1788	/*
1789	 * append free region of newly extended area at tail of freelist
1790	 */
1791	/* init free region of newly extended area */
1792	fsi = n = newstblindex + newstblsize;
1793	f = &sp->slot[fsi];
1794	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1795		f->next = fsi;
1796	f->next = -1;
1797
1798	/* append new free region at tail of old freelist */
1799	fsi = sp->header.freelist;
1800	if (fsi == -1)
1801		sp->header.freelist = n;
1802	else {
1803		do {
1804			f = &sp->slot[fsi];
1805			fsi = f->next;
1806		} while (fsi != -1);
1807
1808		f->next = n;
1809	}
1810
1811	sp->header.freecnt += sp->header.maxslot - n;
1812
1813	/*
1814	 * insert the new entry
1815	 */
1816	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1817
1818	BT_MARK_DIRTY(pmp, ip);
1819	/*
1820	 * linelock any freeslots residing in old extent
1821	 */
1822	if (type == tlckEXTEND) {
1823		n = sp->header.maxslot >> 2;
1824		if (sp->header.freelist < n)
1825			dtLinelockFreelist(sp, n, &dtlck);
1826	}
1827
1828	/*
1829	 *	update parent entry on the parent/root page
1830	 */
1831	/*
1832	 * acquire a transaction lock on the parent/root page
1833	 */
1834	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1835	dtlck = (struct dt_lock *) & tlck->lock;
1836	lv = & dtlck->lv[dtlck->index];
1837
1838	/* linelock parent entry - 1st slot */
1839	lv->offset = 1;
1840	lv->length = 1;
1841	dtlck->index++;
1842
1843	/* update the parent pxd for page extension */
1844	tpxd = (pxd_t *) & pp->slot[1];
1845	*tpxd = *pxd;
1846
1847	DT_PUTPAGE(pmp);
1848	return 0;
1849}
1850
1851
1852/*
1853 *	dtSplitRoot()
1854 *
1855 * function:
1856 *	split the full root page into
1857 *	original/root/split page and new right page
1858 *	i.e., root remains fixed in tree anchor (inode) and
1859 *	the root is copied to a single new right child page
1860 *	since root page << non-root page, and
1861 *	the split root page contains a single entry for the
1862 *	new right child page.
1863 *
1864 * parameter:
1865 *
1866 * return: 0 - success;
1867 *	   errno - failure;
1868 *	return new page pinned;
1869 */
1870static int dtSplitRoot(tid_t tid,
1871	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1872{
1873	struct super_block *sb = ip->i_sb;
1874	struct metapage *smp;
1875	dtroot_t *sp;
1876	struct metapage *rmp;
1877	dtpage_t *rp;
1878	s64 rbn;
1879	int xlen;
1880	int xsize;
1881	struct dtslot *f;
1882	s8 *stbl;
1883	int fsi, stblsize, n;
1884	struct idtentry *s;
1885	pxd_t *ppxd;
1886	struct pxdlist *pxdlist;
1887	pxd_t *pxd;
1888	struct dt_lock *dtlck;
1889	struct tlock *tlck;
1890	struct lv *lv;
1891	int rc;
1892
1893	/* get split root page */
1894	smp = split->mp;
1895	sp = &JFS_IP(ip)->i_dtroot;
1896
1897	/*
1898	 *	allocate/initialize a single (right) child page
1899	 *
1900	 * N.B. at first split, a one (or two) block to fit new entry
1901	 * is allocated; at subsequent split, a full page is allocated;
1902	 */
1903	pxdlist = split->pxdlist;
1904	pxd = &pxdlist->pxd[pxdlist->npxd];
1905	pxdlist->npxd++;
1906	rbn = addressPXD(pxd);
1907	xlen = lengthPXD(pxd);
1908	xsize = xlen << JFS_SBI(sb)->l2bsize;
1909	rmp = get_metapage(ip, rbn, xsize, 1);
1910	if (!rmp)
1911		return -EIO;
1912
1913	rp = rmp->data;
1914
1915	/* Allocate blocks to quota. */
1916	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1917	if (rc) {
1918		release_metapage(rmp);
1919		return rc;
1920	}
1921
1922	BT_MARK_DIRTY(rmp, ip);
1923	/*
1924	 * acquire a transaction lock on the new right page
1925	 */
1926	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1927	dtlck = (struct dt_lock *) & tlck->lock;
1928
1929	rp->header.flag =
1930	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1931	rp->header.self = *pxd;
1932
1933	/* initialize sibling pointers */
1934	rp->header.next = 0;
1935	rp->header.prev = 0;
1936
1937	/*
1938	 *	move in-line root page into new right page extent
1939	 */
1940	/* linelock header + copied entries + new stbl (1st slot) in new page */
1941	ASSERT(dtlck->index == 0);
1942	lv = & dtlck->lv[0];
1943	lv->offset = 0;
1944	lv->length = 10;	/* 1 + 8 + 1 */
1945	dtlck->index++;
1946
1947	n = xsize >> L2DTSLOTSIZE;
1948	rp->header.maxslot = n;
1949	stblsize = (n + 31) >> L2DTSLOTSIZE;
1950
1951	/* copy old stbl to new stbl at start of extended area */
1952	rp->header.stblindex = DTROOTMAXSLOT;
1953	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1954	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1955	rp->header.nextindex = sp->header.nextindex;
1956
1957	/* copy old data area to start of new data area */
1958	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1959
1960	/*
1961	 * append free region of newly extended area at tail of freelist
1962	 */
1963	/* init free region of newly extended area */
1964	fsi = n = DTROOTMAXSLOT + stblsize;
1965	f = &rp->slot[fsi];
1966	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1967		f->next = fsi;
1968	f->next = -1;
1969
1970	/* append new free region at tail of old freelist */
1971	fsi = sp->header.freelist;
1972	if (fsi == -1)
1973		rp->header.freelist = n;
1974	else {
1975		rp->header.freelist = fsi;
1976
1977		do {
1978			f = &rp->slot[fsi];
1979			fsi = f->next;
1980		} while (fsi >= 0);
1981
1982		f->next = n;
1983	}
1984
1985	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1986
1987	/*
1988	 * Update directory index table for entries now in right page
1989	 */
1990	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1991		s64 lblock;
1992		struct metapage *mp = NULL;
1993		struct ldtentry *ldtentry;
1994
1995		stbl = DT_GETSTBL(rp);
1996		for (n = 0; n < rp->header.nextindex; n++) {
1997			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1998			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1999				     rbn, n, &mp, &lblock);
2000		}
2001		if (mp)
2002			release_metapage(mp);
2003	}
2004	/*
2005	 * insert the new entry into the new right/child page
2006	 * (skip index in the new right page will not change)
2007	 */
2008	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2009
2010	/*
2011	 *	reset parent/root page
2012	 *
2013	 * set the 1st entry offset to 0, which force the left-most key
2014	 * at any level of the tree to be less than any search key.
2015	 *
2016	 * The btree comparison code guarantees that the left-most key on any
2017	 * level of the tree is never used, so it doesn't need to be filled in.
2018	 */
2019	BT_MARK_DIRTY(smp, ip);
2020	/*
2021	 * acquire a transaction lock on the root page (in-memory inode)
2022	 */
2023	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2024	dtlck = (struct dt_lock *) & tlck->lock;
2025
2026	/* linelock root */
2027	ASSERT(dtlck->index == 0);
2028	lv = & dtlck->lv[0];
2029	lv->offset = 0;
2030	lv->length = DTROOTMAXSLOT;
2031	dtlck->index++;
2032
2033	/* update page header of root */
2034	if (sp->header.flag & BT_LEAF) {
2035		sp->header.flag &= ~BT_LEAF;
2036		sp->header.flag |= BT_INTERNAL;
2037	}
2038
2039	/* init the first entry */
2040	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2041	ppxd = (pxd_t *) s;
2042	*ppxd = *pxd;
2043	s->next = -1;
2044	s->namlen = 0;
2045
2046	stbl = sp->header.stbl;
2047	stbl[0] = DTENTRYSTART;
2048	sp->header.nextindex = 1;
2049
2050	/* init freelist */
2051	fsi = DTENTRYSTART + 1;
2052	f = &sp->slot[fsi];
2053
2054	/* init free region of remaining area */
2055	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2056		f->next = fsi;
2057	f->next = -1;
2058
2059	sp->header.freelist = DTENTRYSTART + 1;
2060	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2061
2062	*rmpp = rmp;
2063
2064	return 0;
2065}
2066
2067
2068/*
2069 *	dtDelete()
2070 *
2071 * function: delete the entry(s) referenced by a key.
2072 *
2073 * parameter:
2074 *
2075 * return:
2076 */
2077int dtDelete(tid_t tid,
2078	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2079{
2080	int rc = 0;
2081	s64 bn;
2082	struct metapage *mp, *imp;
2083	dtpage_t *p;
2084	int index;
2085	struct btstack btstack;
2086	struct dt_lock *dtlck;
2087	struct tlock *tlck;
2088	struct lv *lv;
2089	int i;
2090	struct ldtentry *ldtentry;
2091	u8 *stbl;
2092	u32 table_index, next_index;
2093	struct metapage *nmp;
2094	dtpage_t *np;
2095
2096	/*
2097	 *	search for the entry to delete:
2098	 *
2099	 * dtSearch() returns (leaf page pinned, index at which to delete).
2100	 */
2101	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2102		return rc;
2103
2104	/* retrieve search result */
2105	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2106
2107	/*
2108	 * We need to find put the index of the next entry into the
2109	 * directory index table in order to resume a readdir from this
2110	 * entry.
2111	 */
2112	if (DO_INDEX(ip)) {
2113		stbl = DT_GETSTBL(p);
2114		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2115		table_index = le32_to_cpu(ldtentry->index);
2116		if (index == (p->header.nextindex - 1)) {
2117			/*
2118			 * Last entry in this leaf page
2119			 */
2120			if ((p->header.flag & BT_ROOT)
2121			    || (p->header.next == 0))
2122				next_index = -1;
2123			else {
2124				/* Read next leaf page */
2125				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2126					   nmp, PSIZE, np, rc);
2127				if (rc)
2128					next_index = -1;
2129				else {
2130					stbl = DT_GETSTBL(np);
2131					ldtentry =
2132					    (struct ldtentry *) & np->
2133					    slot[stbl[0]];
2134					next_index =
2135					    le32_to_cpu(ldtentry->index);
2136					DT_PUTPAGE(nmp);
2137				}
2138			}
2139		} else {
2140			ldtentry =
2141			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2142			next_index = le32_to_cpu(ldtentry->index);
2143		}
2144		free_index(tid, ip, table_index, next_index);
2145	}
2146	/*
2147	 * the leaf page becomes empty, delete the page
2148	 */
2149	if (p->header.nextindex == 1) {
2150		/* delete empty page */
2151		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2152	}
2153	/*
2154	 * the leaf page has other entries remaining:
2155	 *
2156	 * delete the entry from the leaf page.
2157	 */
2158	else {
2159		BT_MARK_DIRTY(mp, ip);
2160		/*
2161		 * acquire a transaction lock on the leaf page
2162		 */
2163		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2164		dtlck = (struct dt_lock *) & tlck->lock;
2165
2166		/*
2167		 * Do not assume that dtlck->index will be zero.  During a
2168		 * rename within a directory, this transaction may have
2169		 * modified this page already when adding the new entry.
2170		 */
2171
2172		/* linelock header */
2173		if (dtlck->index >= dtlck->maxcnt)
2174			dtlck = (struct dt_lock *) txLinelock(dtlck);
2175		lv = & dtlck->lv[dtlck->index];
2176		lv->offset = 0;
2177		lv->length = 1;
2178		dtlck->index++;
2179
2180		/* linelock stbl of non-root leaf page */
2181		if (!(p->header.flag & BT_ROOT)) {
2182			if (dtlck->index >= dtlck->maxcnt)
2183				dtlck = (struct dt_lock *) txLinelock(dtlck);
2184			lv = & dtlck->lv[dtlck->index];
2185			i = index >> L2DTSLOTSIZE;
2186			lv->offset = p->header.stblindex + i;
2187			lv->length =
2188			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2189			    i + 1;
2190			dtlck->index++;
2191		}
2192
2193		/* free the leaf entry */
2194		dtDeleteEntry(p, index, &dtlck);
2195
2196		/*
2197		 * Update directory index table for entries moved in stbl
2198		 */
2199		if (DO_INDEX(ip) && index < p->header.nextindex) {
2200			s64 lblock;
2201
2202			imp = NULL;
2203			stbl = DT_GETSTBL(p);
2204			for (i = index; i < p->header.nextindex; i++) {
2205				ldtentry =
2206				    (struct ldtentry *) & p->slot[stbl[i]];
2207				modify_index(tid, ip,
2208					     le32_to_cpu(ldtentry->index),
2209					     bn, i, &imp, &lblock);
2210			}
2211			if (imp)
2212				release_metapage(imp);
2213		}
2214
2215		DT_PUTPAGE(mp);
2216	}
2217
2218	return rc;
2219}
2220
2221
2222/*
2223 *	dtDeleteUp()
2224 *
2225 * function:
2226 *	free empty pages as propagating deletion up the tree
2227 *
2228 * parameter:
2229 *
2230 * return:
2231 */
2232static int dtDeleteUp(tid_t tid, struct inode *ip,
2233	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2234{
2235	int rc = 0;
2236	struct metapage *mp;
2237	dtpage_t *p;
2238	int index, nextindex;
2239	int xlen;
2240	struct btframe *parent;
2241	struct dt_lock *dtlck;
2242	struct tlock *tlck;
2243	struct lv *lv;
2244	struct pxd_lock *pxdlock;
2245	int i;
2246
2247	/*
2248	 *	keep the root leaf page which has become empty
2249	 */
2250	if (BT_IS_ROOT(fmp)) {
2251		/*
2252		 * reset the root
2253		 *
2254		 * dtInitRoot() acquires txlock on the root
2255		 */
2256		dtInitRoot(tid, ip, PARENT(ip));
2257
2258		DT_PUTPAGE(fmp);
2259
2260		return 0;
2261	}
2262
2263	/*
2264	 *	free the non-root leaf page
2265	 */
2266	/*
2267	 * acquire a transaction lock on the page
2268	 *
2269	 * write FREEXTENT|NOREDOPAGE log record
2270	 * N.B. linelock is overlaid as freed extent descriptor, and
2271	 * the buffer page is freed;
2272	 */
2273	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2274	pxdlock = (struct pxd_lock *) & tlck->lock;
2275	pxdlock->flag = mlckFREEPXD;
2276	pxdlock->pxd = fp->header.self;
2277	pxdlock->index = 1;
2278
2279	/* update sibling pointers */
2280	if ((rc = dtRelink(tid, ip, fp))) {
2281		BT_PUTPAGE(fmp);
2282		return rc;
2283	}
2284
2285	xlen = lengthPXD(&fp->header.self);
2286
2287	/* Free quota allocation. */
2288	dquot_free_block(ip, xlen);
2289
2290	/* free/invalidate its buffer page */
2291	discard_metapage(fmp);
2292
2293	/*
2294	 *	propagate page deletion up the directory tree
2295	 *
2296	 * If the delete from the parent page makes it empty,
2297	 * continue all the way up the tree.
2298	 * stop if the root page is reached (which is never deleted) or
2299	 * if the entry deletion does not empty the page.
2300	 */
2301	while ((parent = BT_POP(btstack)) != NULL) {
2302		/* pin the parent page <sp> */
2303		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2304		if (rc)
2305			return rc;
2306
2307		/*
2308		 * free the extent of the child page deleted
2309		 */
2310		index = parent->index;
2311
2312		/*
2313		 * delete the entry for the child page from parent
2314		 */
2315		nextindex = p->header.nextindex;
2316
2317		/*
2318		 * the parent has the single entry being deleted:
2319		 *
2320		 * free the parent page which has become empty.
2321		 */
2322		if (nextindex == 1) {
2323			/*
2324			 * keep the root internal page which has become empty
2325			 */
2326			if (p->header.flag & BT_ROOT) {
2327				/*
2328				 * reset the root
2329				 *
2330				 * dtInitRoot() acquires txlock on the root
2331				 */
2332				dtInitRoot(tid, ip, PARENT(ip));
2333
2334				DT_PUTPAGE(mp);
2335
2336				return 0;
2337			}
2338			/*
2339			 * free the parent page
2340			 */
2341			else {
2342				/*
2343				 * acquire a transaction lock on the page
2344				 *
2345				 * write FREEXTENT|NOREDOPAGE log record
2346				 */
2347				tlck =
2348				    txMaplock(tid, ip,
2349					      tlckDTREE | tlckFREE);
2350				pxdlock = (struct pxd_lock *) & tlck->lock;
2351				pxdlock->flag = mlckFREEPXD;
2352				pxdlock->pxd = p->header.self;
2353				pxdlock->index = 1;
2354
2355				/* update sibling pointers */
2356				if ((rc = dtRelink(tid, ip, p))) {
2357					DT_PUTPAGE(mp);
2358					return rc;
2359				}
2360
2361				xlen = lengthPXD(&p->header.self);
2362
2363				/* Free quota allocation */
2364				dquot_free_block(ip, xlen);
2365
2366				/* free/invalidate its buffer page */
2367				discard_metapage(mp);
2368
2369				/* propagate up */
2370				continue;
2371			}
2372		}
2373
2374		/*
2375		 * the parent has other entries remaining:
2376		 *
2377		 * delete the router entry from the parent page.
2378		 */
2379		BT_MARK_DIRTY(mp, ip);
2380		/*
2381		 * acquire a transaction lock on the page
2382		 *
2383		 * action: router entry deletion
2384		 */
2385		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2386		dtlck = (struct dt_lock *) & tlck->lock;
2387
2388		/* linelock header */
2389		if (dtlck->index >= dtlck->maxcnt)
2390			dtlck = (struct dt_lock *) txLinelock(dtlck);
2391		lv = & dtlck->lv[dtlck->index];
2392		lv->offset = 0;
2393		lv->length = 1;
2394		dtlck->index++;
2395
2396		/* linelock stbl of non-root leaf page */
2397		if (!(p->header.flag & BT_ROOT)) {
2398			if (dtlck->index < dtlck->maxcnt)
2399				lv++;
2400			else {
2401				dtlck = (struct dt_lock *) txLinelock(dtlck);
2402				lv = & dtlck->lv[0];
2403			}
2404			i = index >> L2DTSLOTSIZE;
2405			lv->offset = p->header.stblindex + i;
2406			lv->length =
2407			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2408			    i + 1;
2409			dtlck->index++;
2410		}
2411
2412		/* free the router entry */
2413		dtDeleteEntry(p, index, &dtlck);
2414
2415		/* reset key of new leftmost entry of level (for consistency) */
2416		if (index == 0 &&
2417		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2418			dtTruncateEntry(p, 0, &dtlck);
2419
2420		/* unpin the parent page */
2421		DT_PUTPAGE(mp);
2422
2423		/* exit propagation up */
2424		break;
2425	}
2426
2427	if (!DO_INDEX(ip))
2428		ip->i_size -= PSIZE;
2429
2430	return 0;
2431}
2432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2433/*
2434 *	dtRelink()
2435 *
2436 * function:
2437 *	link around a freed page.
2438 *
2439 * parameter:
2440 *	fp:	page to be freed
2441 *
2442 * return:
2443 */
2444static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2445{
2446	int rc;
2447	struct metapage *mp;
2448	s64 nextbn, prevbn;
2449	struct tlock *tlck;
2450	struct dt_lock *dtlck;
2451	struct lv *lv;
2452
2453	nextbn = le64_to_cpu(p->header.next);
2454	prevbn = le64_to_cpu(p->header.prev);
2455
2456	/* update prev pointer of the next page */
2457	if (nextbn != 0) {
2458		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2459		if (rc)
2460			return rc;
2461
2462		BT_MARK_DIRTY(mp, ip);
2463		/*
2464		 * acquire a transaction lock on the next page
2465		 *
2466		 * action: update prev pointer;
2467		 */
2468		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2469		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2470			tlck, ip, mp);
2471		dtlck = (struct dt_lock *) & tlck->lock;
2472
2473		/* linelock header */
2474		if (dtlck->index >= dtlck->maxcnt)
2475			dtlck = (struct dt_lock *) txLinelock(dtlck);
2476		lv = & dtlck->lv[dtlck->index];
2477		lv->offset = 0;
2478		lv->length = 1;
2479		dtlck->index++;
2480
2481		p->header.prev = cpu_to_le64(prevbn);
2482		DT_PUTPAGE(mp);
2483	}
2484
2485	/* update next pointer of the previous page */
2486	if (prevbn != 0) {
2487		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2488		if (rc)
2489			return rc;
2490
2491		BT_MARK_DIRTY(mp, ip);
2492		/*
2493		 * acquire a transaction lock on the prev page
2494		 *
2495		 * action: update next pointer;
2496		 */
2497		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2498		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2499			tlck, ip, mp);
2500		dtlck = (struct dt_lock *) & tlck->lock;
2501
2502		/* linelock header */
2503		if (dtlck->index >= dtlck->maxcnt)
2504			dtlck = (struct dt_lock *) txLinelock(dtlck);
2505		lv = & dtlck->lv[dtlck->index];
2506		lv->offset = 0;
2507		lv->length = 1;
2508		dtlck->index++;
2509
2510		p->header.next = cpu_to_le64(nextbn);
2511		DT_PUTPAGE(mp);
2512	}
2513
2514	return 0;
2515}
2516
2517
2518/*
2519 *	dtInitRoot()
2520 *
2521 * initialize directory root (inline in inode)
2522 */
2523void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2524{
2525	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2526	dtroot_t *p;
2527	int fsi;
2528	struct dtslot *f;
2529	struct tlock *tlck;
2530	struct dt_lock *dtlck;
2531	struct lv *lv;
2532	u16 xflag_save;
2533
2534	/*
2535	 * If this was previously an non-empty directory, we need to remove
2536	 * the old directory table.
2537	 */
2538	if (DO_INDEX(ip)) {
2539		if (!jfs_dirtable_inline(ip)) {
2540			struct tblock *tblk = tid_to_tblock(tid);
2541			/*
2542			 * We're playing games with the tid's xflag.  If
2543			 * we're removing a regular file, the file's xtree
2544			 * is committed with COMMIT_PMAP, but we always
2545			 * commit the directories xtree with COMMIT_PWMAP.
2546			 */
2547			xflag_save = tblk->xflag;
2548			tblk->xflag = 0;
2549			/*
2550			 * xtTruncate isn't guaranteed to fully truncate
2551			 * the xtree.  The caller needs to check i_size
2552			 * after committing the transaction to see if
2553			 * additional truncation is needed.  The
2554			 * COMMIT_Stale flag tells caller that we
2555			 * initiated the truncation.
2556			 */
2557			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2558			set_cflag(COMMIT_Stale, ip);
2559
2560			tblk->xflag = xflag_save;
2561		} else
2562			ip->i_size = 1;
2563
2564		jfs_ip->next_index = 2;
2565	} else
2566		ip->i_size = IDATASIZE;
2567
2568	/*
2569	 * acquire a transaction lock on the root
2570	 *
2571	 * action: directory initialization;
2572	 */
2573	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2574		      tlckDTREE | tlckENTRY | tlckBTROOT);
2575	dtlck = (struct dt_lock *) & tlck->lock;
2576
2577	/* linelock root */
2578	ASSERT(dtlck->index == 0);
2579	lv = & dtlck->lv[0];
2580	lv->offset = 0;
2581	lv->length = DTROOTMAXSLOT;
2582	dtlck->index++;
2583
2584	p = &jfs_ip->i_dtroot;
2585
2586	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2587
2588	p->header.nextindex = 0;
2589
2590	/* init freelist */
2591	fsi = 1;
2592	f = &p->slot[fsi];
2593
2594	/* init data area of root */
2595	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2596		f->next = fsi;
2597	f->next = -1;
2598
2599	p->header.freelist = 1;
2600	p->header.freecnt = 8;
2601
2602	/* init '..' entry */
2603	p->header.idotdot = cpu_to_le32(idotdot);
2604
2605	return;
2606}
2607
2608/*
2609 *	add_missing_indices()
2610 *
2611 * function: Fix dtree page in which one or more entries has an invalid index.
2612 *	     fsck.jfs should really fix this, but it currently does not.
2613 *	     Called from jfs_readdir when bad index is detected.
2614 */
2615static void add_missing_indices(struct inode *inode, s64 bn)
2616{
2617	struct ldtentry *d;
2618	struct dt_lock *dtlck;
2619	int i;
2620	uint index;
2621	struct lv *lv;
2622	struct metapage *mp;
2623	dtpage_t *p;
2624	int rc;
2625	s8 *stbl;
2626	tid_t tid;
2627	struct tlock *tlck;
2628
2629	tid = txBegin(inode->i_sb, 0);
2630
2631	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2632
2633	if (rc) {
2634		printk(KERN_ERR "DT_GETPAGE failed!\n");
2635		goto end;
2636	}
2637	BT_MARK_DIRTY(mp, inode);
2638
2639	ASSERT(p->header.flag & BT_LEAF);
2640
2641	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2642	if (BT_IS_ROOT(mp))
2643		tlck->type |= tlckBTROOT;
2644
2645	dtlck = (struct dt_lock *) &tlck->lock;
2646
2647	stbl = DT_GETSTBL(p);
2648	for (i = 0; i < p->header.nextindex; i++) {
2649		d = (struct ldtentry *) &p->slot[stbl[i]];
2650		index = le32_to_cpu(d->index);
2651		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2652			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2653			if (dtlck->index >= dtlck->maxcnt)
2654				dtlck = (struct dt_lock *) txLinelock(dtlck);
2655			lv = &dtlck->lv[dtlck->index];
2656			lv->offset = stbl[i];
2657			lv->length = 1;
2658			dtlck->index++;
2659		}
2660	}
2661
2662	DT_PUTPAGE(mp);
2663	(void) txCommit(tid, 1, &inode, 0);
2664end:
2665	txEnd(tid);
2666}
2667
2668/*
2669 * Buffer to hold directory entry info while traversing a dtree page
2670 * before being fed to the filldir function
2671 */
2672struct jfs_dirent {
2673	loff_t position;
2674	int ino;
2675	u16 name_len;
2676	char name[];
2677};
2678
2679/*
2680 * function to determine next variable-sized jfs_dirent in buffer
2681 */
2682static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2683{
2684	return (struct jfs_dirent *)
2685		((char *)dirent +
2686		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2687		   sizeof (loff_t) - 1) &
2688		  ~(sizeof (loff_t) - 1)));
2689}
2690
2691/*
2692 *	jfs_readdir()
2693 *
2694 * function: read directory entries sequentially
2695 *	from the specified entry offset
2696 *
2697 * parameter:
2698 *
2699 * return: offset = (pn, index) of start entry
2700 *	of next jfs_readdir()/dtRead()
2701 */
2702int jfs_readdir(struct file *file, struct dir_context *ctx)
2703{
2704	struct inode *ip = file_inode(file);
2705	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
2706	int rc = 0;
2707	loff_t dtpos;	/* legacy OS/2 style position */
2708	struct dtoffset {
2709		s16 pn;
2710		s16 index;
2711		s32 unused;
2712	} *dtoffset = (struct dtoffset *) &dtpos;
2713	s64 bn;
2714	struct metapage *mp;
2715	dtpage_t *p;
2716	int index;
2717	s8 *stbl;
2718	struct btstack btstack;
2719	int i, next;
2720	struct ldtentry *d;
2721	struct dtslot *t;
2722	int d_namleft, len, outlen;
2723	unsigned long dirent_buf;
2724	char *name_ptr;
2725	u32 dir_index;
2726	int do_index = 0;
2727	uint loop_count = 0;
2728	struct jfs_dirent *jfs_dirent;
2729	int jfs_dirents;
2730	int overflow, fix_page, page_fixed = 0;
2731	static int unique_pos = 2;	/* If we can't fix broken index */
2732
2733	if (ctx->pos == DIREND)
2734		return 0;
2735
2736	if (DO_INDEX(ip)) {
2737		/*
2738		 * persistent index is stored in directory entries.
2739		 * Special cases:	 0 = .
2740		 *			 1 = ..
2741		 *			-1 = End of directory
2742		 */
2743		do_index = 1;
2744
2745		dir_index = (u32) ctx->pos;
2746
2747		/*
2748		 * NFSv4 reserves cookies 1 and 2 for . and .. so the value
2749		 * we return to the vfs is one greater than the one we use
2750		 * internally.
2751		 */
2752		if (dir_index)
2753			dir_index--;
2754
2755		if (dir_index > 1) {
2756			struct dir_table_slot dirtab_slot;
2757
2758			if (dtEmpty(ip) ||
2759			    (dir_index >= JFS_IP(ip)->next_index)) {
2760				/* Stale position.  Directory has shrunk */
2761				ctx->pos = DIREND;
2762				return 0;
2763			}
2764		      repeat:
2765			rc = read_index(ip, dir_index, &dirtab_slot);
2766			if (rc) {
2767				ctx->pos = DIREND;
2768				return rc;
2769			}
2770			if (dirtab_slot.flag == DIR_INDEX_FREE) {
2771				if (loop_count++ > JFS_IP(ip)->next_index) {
2772					jfs_err("jfs_readdir detected infinite loop!");
2773					ctx->pos = DIREND;
2774					return 0;
2775				}
2776				dir_index = le32_to_cpu(dirtab_slot.addr2);
2777				if (dir_index == -1) {
2778					ctx->pos = DIREND;
2779					return 0;
2780				}
2781				goto repeat;
2782			}
2783			bn = addressDTS(&dirtab_slot);
2784			index = dirtab_slot.slot;
2785			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2786			if (rc) {
2787				ctx->pos = DIREND;
2788				return 0;
2789			}
2790			if (p->header.flag & BT_INTERNAL) {
2791				jfs_err("jfs_readdir: bad index table");
2792				DT_PUTPAGE(mp);
2793				ctx->pos = DIREND;
2794				return 0;
2795			}
2796		} else {
2797			if (dir_index == 0) {
2798				/*
2799				 * self "."
2800				 */
2801				ctx->pos = 1;
2802				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
2803					return 0;
2804			}
2805			/*
2806			 * parent ".."
2807			 */
2808			ctx->pos = 2;
2809			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
2810				return 0;
2811
2812			/*
2813			 * Find first entry of left-most leaf
2814			 */
2815			if (dtEmpty(ip)) {
2816				ctx->pos = DIREND;
2817				return 0;
2818			}
2819
2820			if ((rc = dtReadFirst(ip, &btstack)))
2821				return rc;
2822
2823			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2824		}
2825	} else {
2826		/*
2827		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
2828		 *
2829		 * pn = 0; index = 1:	First entry "."
2830		 * pn = 0; index = 2:	Second entry ".."
2831		 * pn > 0:		Real entries, pn=1 -> leftmost page
2832		 * pn = index = -1:	No more entries
2833		 */
2834		dtpos = ctx->pos;
2835		if (dtpos < 2) {
2836			/* build "." entry */
2837			ctx->pos = 1;
2838			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
2839				return 0;
2840			dtoffset->index = 2;
2841			ctx->pos = dtpos;
2842		}
2843
2844		if (dtoffset->pn == 0) {
2845			if (dtoffset->index == 2) {
2846				/* build ".." entry */
2847				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
2848					return 0;
2849			} else {
2850				jfs_err("jfs_readdir called with invalid offset!");
2851			}
2852			dtoffset->pn = 1;
2853			dtoffset->index = 0;
2854			ctx->pos = dtpos;
2855		}
2856
2857		if (dtEmpty(ip)) {
2858			ctx->pos = DIREND;
2859			return 0;
2860		}
2861
2862		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
2863			jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext",
2864				rc);
2865			ctx->pos = DIREND;
2866			return 0;
2867		}
2868		/* get start leaf page and index */
2869		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2870
2871		/* offset beyond directory eof ? */
2872		if (bn < 0) {
2873			ctx->pos = DIREND;
2874			return 0;
2875		}
2876	}
2877
2878	dirent_buf = __get_free_page(GFP_KERNEL);
2879	if (dirent_buf == 0) {
2880		DT_PUTPAGE(mp);
2881		jfs_warn("jfs_readdir: __get_free_page failed!");
2882		ctx->pos = DIREND;
2883		return -ENOMEM;
2884	}
2885
2886	while (1) {
2887		jfs_dirent = (struct jfs_dirent *) dirent_buf;
2888		jfs_dirents = 0;
2889		overflow = fix_page = 0;
2890
2891		stbl = DT_GETSTBL(p);
2892
2893		for (i = index; i < p->header.nextindex; i++) {
2894			if (stbl[i] < 0 || stbl[i] > 127) {
2895				jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld",
2896					i, stbl[i], (long)ip->i_ino, (long long)bn);
2897				free_page(dirent_buf);
2898				DT_PUTPAGE(mp);
2899				return -EIO;
2900			}
2901
2902			d = (struct ldtentry *) & p->slot[stbl[i]];
2903
2904			if (((long) jfs_dirent + d->namlen + 1) >
2905			    (dirent_buf + PAGE_SIZE)) {
2906				/* DBCS codepages could overrun dirent_buf */
2907				index = i;
2908				overflow = 1;
2909				break;
2910			}
2911
2912			d_namleft = d->namlen;
2913			name_ptr = jfs_dirent->name;
2914			jfs_dirent->ino = le32_to_cpu(d->inumber);
2915
2916			if (do_index) {
2917				len = min(d_namleft, DTLHDRDATALEN);
2918				jfs_dirent->position = le32_to_cpu(d->index);
2919				/*
2920				 * d->index should always be valid, but it
2921				 * isn't.  fsck.jfs doesn't create the
2922				 * directory index for the lost+found
2923				 * directory.  Rather than let it go,
2924				 * we can try to fix it.
2925				 */
2926				if ((jfs_dirent->position < 2) ||
2927				    (jfs_dirent->position >=
2928				     JFS_IP(ip)->next_index)) {
2929					if (!page_fixed && !isReadOnly(ip)) {
2930						fix_page = 1;
2931						/*
2932						 * setting overflow and setting
2933						 * index to i will cause the
2934						 * same page to be processed
2935						 * again starting here
2936						 */
2937						overflow = 1;
2938						index = i;
2939						break;
2940					}
2941					jfs_dirent->position = unique_pos++;
2942				}
2943				/*
2944				 * We add 1 to the index because we may
2945				 * use a value of 2 internally, and NFSv4
2946				 * doesn't like that.
2947				 */
2948				jfs_dirent->position++;
2949			} else {
2950				jfs_dirent->position = dtpos;
2951				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
2952			}
2953
2954			/* copy the name of head/only segment */
2955			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
2956						   codepage);
2957			jfs_dirent->name_len = outlen;
2958
2959			/* copy name in the additional segment(s) */
2960			next = d->next;
2961			while (next >= 0) {
2962				t = (struct dtslot *) & p->slot[next];
2963				name_ptr += outlen;
2964				d_namleft -= len;
2965				/* Sanity Check */
2966				if (d_namleft == 0) {
2967					jfs_error(ip->i_sb,
2968						  "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
2969						  (long)ip->i_ino,
2970						  (long long)bn,
2971						  i);
2972					goto skip_one;
2973				}
2974				len = min(d_namleft, DTSLOTDATALEN);
2975				outlen = jfs_strfromUCS_le(name_ptr, t->name,
2976							   len, codepage);
2977				jfs_dirent->name_len += outlen;
2978
2979				next = t->next;
2980			}
2981
2982			jfs_dirents++;
2983			jfs_dirent = next_jfs_dirent(jfs_dirent);
2984skip_one:
2985			if (!do_index)
2986				dtoffset->index++;
2987		}
2988
2989		if (!overflow) {
2990			/* Point to next leaf page */
2991			if (p->header.flag & BT_ROOT)
2992				bn = 0;
2993			else {
2994				bn = le64_to_cpu(p->header.next);
2995				index = 0;
2996				/* update offset (pn:index) for new page */
2997				if (!do_index) {
2998					dtoffset->pn++;
2999					dtoffset->index = 0;
3000				}
3001			}
3002			page_fixed = 0;
3003		}
3004
3005		/* unpin previous leaf page */
3006		DT_PUTPAGE(mp);
3007
3008		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3009		while (jfs_dirents--) {
3010			ctx->pos = jfs_dirent->position;
3011			if (!dir_emit(ctx, jfs_dirent->name,
3012				    jfs_dirent->name_len,
3013				    jfs_dirent->ino, DT_UNKNOWN))
3014				goto out;
3015			jfs_dirent = next_jfs_dirent(jfs_dirent);
3016		}
3017
3018		if (fix_page) {
3019			add_missing_indices(ip, bn);
3020			page_fixed = 1;
3021		}
3022
3023		if (!overflow && (bn == 0)) {
3024			ctx->pos = DIREND;
3025			break;
3026		}
3027
3028		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3029		if (rc) {
3030			free_page(dirent_buf);
3031			return rc;
3032		}
3033	}
3034
3035      out:
3036	free_page(dirent_buf);
3037
3038	return rc;
3039}
3040
3041
3042/*
3043 *	dtReadFirst()
3044 *
3045 * function: get the leftmost page of the directory
3046 */
3047static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3048{
3049	int rc = 0;
3050	s64 bn;
3051	int psize = 288;	/* initial in-line directory */
3052	struct metapage *mp;
3053	dtpage_t *p;
3054	s8 *stbl;
3055	struct btframe *btsp;
3056	pxd_t *xd;
3057
3058	BT_CLR(btstack);	/* reset stack */
3059
3060	/*
3061	 *	descend leftmost path of the tree
3062	 *
3063	 * by convention, root bn = 0.
3064	 */
3065	for (bn = 0;;) {
3066		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3067		if (rc)
3068			return rc;
3069
3070		/*
3071		 * leftmost leaf page
3072		 */
3073		if (p->header.flag & BT_LEAF) {
3074			/* return leftmost entry */
3075			btsp = btstack->top;
3076			btsp->bn = bn;
3077			btsp->index = 0;
3078			btsp->mp = mp;
3079
3080			return 0;
3081		}
3082
3083		/*
3084		 * descend down to leftmost child page
3085		 */
3086		if (BT_STACK_FULL(btstack)) {
3087			DT_PUTPAGE(mp);
3088			jfs_error(ip->i_sb, "btstack overrun\n");
3089			BT_STACK_DUMP(btstack);
3090			return -EIO;
3091		}
3092		/* push (bn, index) of the parent page/entry */
3093		BT_PUSH(btstack, bn, 0);
3094
3095		/* get the leftmost entry */
3096		stbl = DT_GETSTBL(p);
3097
3098		if (stbl[0] < 0 || stbl[0] > 127) {
3099			DT_PUTPAGE(mp);
3100			jfs_error(ip->i_sb, "stbl[0] out of bound\n");
3101			return -EIO;
3102		}
3103
3104		xd = (pxd_t *) & p->slot[stbl[0]];
3105
3106		/* get the child page block address */
3107		bn = addressPXD(xd);
3108		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3109
3110		/* unpin the parent page */
3111		DT_PUTPAGE(mp);
3112	}
3113}
3114
3115
3116/*
3117 *	dtReadNext()
3118 *
3119 * function: get the page of the specified offset (pn:index)
3120 *
3121 * return: if (offset > eof), bn = -1;
3122 *
3123 * note: if index > nextindex of the target leaf page,
3124 * start with 1st entry of next leaf page;
3125 */
3126static int dtReadNext(struct inode *ip, loff_t * offset,
3127		      struct btstack * btstack)
3128{
3129	int rc = 0;
3130	struct dtoffset {
3131		s16 pn;
3132		s16 index;
3133		s32 unused;
3134	} *dtoffset = (struct dtoffset *) offset;
3135	s64 bn;
3136	struct metapage *mp;
3137	dtpage_t *p;
3138	int index;
3139	int pn;
3140	s8 *stbl;
3141	struct btframe *btsp, *parent;
3142	pxd_t *xd;
3143
3144	/*
3145	 * get leftmost leaf page pinned
3146	 */
3147	if ((rc = dtReadFirst(ip, btstack)))
3148		return rc;
3149
3150	/* get leaf page */
3151	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3152
3153	/* get the start offset (pn:index) */
3154	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3155	index = dtoffset->index;
3156
3157	/* start at leftmost page ? */
3158	if (pn == 0) {
3159		/* offset beyond eof ? */
3160		if (index < p->header.nextindex)
3161			goto out;
3162
3163		if (p->header.flag & BT_ROOT) {
3164			bn = -1;
3165			goto out;
3166		}
3167
3168		/* start with 1st entry of next leaf page */
3169		dtoffset->pn++;
3170		dtoffset->index = index = 0;
3171		goto a;
3172	}
3173
3174	/* start at non-leftmost page: scan parent pages for large pn */
3175	if (p->header.flag & BT_ROOT) {
3176		bn = -1;
3177		goto out;
3178	}
3179
3180	/* start after next leaf page ? */
3181	if (pn > 1)
3182		goto b;
3183
3184	/* get leaf page pn = 1 */
3185      a:
3186	bn = le64_to_cpu(p->header.next);
3187
3188	/* unpin leaf page */
3189	DT_PUTPAGE(mp);
3190
3191	/* offset beyond eof ? */
3192	if (bn == 0) {
3193		bn = -1;
3194		goto out;
3195	}
3196
3197	goto c;
3198
3199	/*
3200	 * scan last internal page level to get target leaf page
3201	 */
3202      b:
3203	/* unpin leftmost leaf page */
3204	DT_PUTPAGE(mp);
3205
3206	/* get left most parent page */
3207	btsp = btstack->top;
3208	parent = btsp - 1;
3209	bn = parent->bn;
3210	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3211	if (rc)
3212		return rc;
3213
3214	/* scan parent pages at last internal page level */
3215	while (pn >= p->header.nextindex) {
3216		pn -= p->header.nextindex;
3217
3218		/* get next parent page address */
3219		bn = le64_to_cpu(p->header.next);
3220
3221		/* unpin current parent page */
3222		DT_PUTPAGE(mp);
3223
3224		/* offset beyond eof ? */
3225		if (bn == 0) {
3226			bn = -1;
3227			goto out;
3228		}
3229
3230		/* get next parent page */
3231		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3232		if (rc)
3233			return rc;
3234
3235		/* update parent page stack frame */
3236		parent->bn = bn;
3237	}
3238
3239	/* get leaf page address */
3240	stbl = DT_GETSTBL(p);
3241	xd = (pxd_t *) & p->slot[stbl[pn]];
3242	bn = addressPXD(xd);
3243
3244	/* unpin parent page */
3245	DT_PUTPAGE(mp);
3246
3247	/*
3248	 * get target leaf page
3249	 */
3250      c:
3251	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3252	if (rc)
3253		return rc;
3254
3255	/*
3256	 * leaf page has been completed:
3257	 * start with 1st entry of next leaf page
3258	 */
3259	if (index >= p->header.nextindex) {
3260		bn = le64_to_cpu(p->header.next);
3261
3262		/* unpin leaf page */
3263		DT_PUTPAGE(mp);
3264
3265		/* offset beyond eof ? */
3266		if (bn == 0) {
3267			bn = -1;
3268			goto out;
3269		}
3270
3271		/* get next leaf page */
3272		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3273		if (rc)
3274			return rc;
3275
3276		/* start with 1st entry of next leaf page */
3277		dtoffset->pn++;
3278		dtoffset->index = 0;
3279	}
3280
3281      out:
3282	/* return target leaf page pinned */
3283	btsp = btstack->top;
3284	btsp->bn = bn;
3285	btsp->index = dtoffset->index;
3286	btsp->mp = mp;
3287
3288	return 0;
3289}
3290
3291
3292/*
3293 *	dtCompare()
3294 *
3295 * function: compare search key with an internal entry
3296 *
3297 * return:
3298 *	< 0 if k is < record
3299 *	= 0 if k is = record
3300 *	> 0 if k is > record
3301 */
3302static int dtCompare(struct component_name * key,	/* search key */
3303		     dtpage_t * p,	/* directory page */
3304		     int si)
3305{				/* entry slot index */
3306	wchar_t *kname;
3307	__le16 *name;
3308	int klen, namlen, len, rc;
3309	struct idtentry *ih;
3310	struct dtslot *t;
3311
3312	/*
3313	 * force the left-most key on internal pages, at any level of
3314	 * the tree, to be less than any search key.
3315	 * this obviates having to update the leftmost key on an internal
3316	 * page when the user inserts a new key in the tree smaller than
3317	 * anything that has been stored.
3318	 *
3319	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3320	 * at any internal page at any level of the tree,
3321	 * it descends to child of the entry anyway -
3322	 * ? make the entry as min size dummy entry)
3323	 *
3324	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3325	 * return (1);
3326	 */
3327
3328	kname = key->name;
3329	klen = key->namlen;
3330
3331	ih = (struct idtentry *) & p->slot[si];
3332	si = ih->next;
3333	name = ih->name;
3334	namlen = ih->namlen;
3335	len = min(namlen, DTIHDRDATALEN);
3336
3337	/* compare with head/only segment */
3338	len = min(klen, len);
3339	if ((rc = UniStrncmp_le(kname, name, len)))
3340		return rc;
3341
3342	klen -= len;
3343	namlen -= len;
3344
3345	/* compare with additional segment(s) */
3346	kname += len;
3347	while (klen > 0 && namlen > 0) {
3348		/* compare with next name segment */
3349		t = (struct dtslot *) & p->slot[si];
3350		len = min(namlen, DTSLOTDATALEN);
3351		len = min(klen, len);
3352		name = t->name;
3353		if ((rc = UniStrncmp_le(kname, name, len)))
3354			return rc;
3355
3356		klen -= len;
3357		namlen -= len;
3358		kname += len;
3359		si = t->next;
3360	}
3361
3362	return (klen - namlen);
3363}
3364
3365
3366
3367
3368/*
3369 *	ciCompare()
3370 *
3371 * function: compare search key with an (leaf/internal) entry
3372 *
3373 * return:
3374 *	< 0 if k is < record
3375 *	= 0 if k is = record
3376 *	> 0 if k is > record
3377 */
3378static int ciCompare(struct component_name * key,	/* search key */
3379		     dtpage_t * p,	/* directory page */
3380		     int si,	/* entry slot index */
3381		     int flag)
3382{
3383	wchar_t *kname, x;
3384	__le16 *name;
3385	int klen, namlen, len, rc;
3386	struct ldtentry *lh;
3387	struct idtentry *ih;
3388	struct dtslot *t;
3389	int i;
3390
3391	/*
3392	 * force the left-most key on internal pages, at any level of
3393	 * the tree, to be less than any search key.
3394	 * this obviates having to update the leftmost key on an internal
3395	 * page when the user inserts a new key in the tree smaller than
3396	 * anything that has been stored.
3397	 *
3398	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3399	 * at any internal page at any level of the tree,
3400	 * it descends to child of the entry anyway -
3401	 * ? make the entry as min size dummy entry)
3402	 *
3403	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3404	 * return (1);
3405	 */
3406
3407	kname = key->name;
3408	klen = key->namlen;
3409
3410	/*
3411	 * leaf page entry
3412	 */
3413	if (p->header.flag & BT_LEAF) {
3414		lh = (struct ldtentry *) & p->slot[si];
3415		si = lh->next;
3416		name = lh->name;
3417		namlen = lh->namlen;
3418		if (flag & JFS_DIR_INDEX)
3419			len = min(namlen, DTLHDRDATALEN);
3420		else
3421			len = min(namlen, DTLHDRDATALEN_LEGACY);
3422	}
3423	/*
3424	 * internal page entry
3425	 */
3426	else {
3427		ih = (struct idtentry *) & p->slot[si];
3428		si = ih->next;
3429		name = ih->name;
3430		namlen = ih->namlen;
3431		len = min(namlen, DTIHDRDATALEN);
3432	}
3433
3434	/* compare with head/only segment */
3435	len = min(klen, len);
3436	for (i = 0; i < len; i++, kname++, name++) {
3437		/* only uppercase if case-insensitive support is on */
3438		if ((flag & JFS_OS2) == JFS_OS2)
3439			x = UniToupper(le16_to_cpu(*name));
3440		else
3441			x = le16_to_cpu(*name);
3442		if ((rc = *kname - x))
3443			return rc;
3444	}
3445
3446	klen -= len;
3447	namlen -= len;
3448
3449	/* compare with additional segment(s) */
3450	while (klen > 0 && namlen > 0) {
3451		/* compare with next name segment */
3452		t = (struct dtslot *) & p->slot[si];
3453		len = min(namlen, DTSLOTDATALEN);
3454		len = min(klen, len);
3455		name = t->name;
3456		for (i = 0; i < len; i++, kname++, name++) {
3457			/* only uppercase if case-insensitive support is on */
3458			if ((flag & JFS_OS2) == JFS_OS2)
3459				x = UniToupper(le16_to_cpu(*name));
3460			else
3461				x = le16_to_cpu(*name);
3462
3463			if ((rc = *kname - x))
3464				return rc;
3465		}
3466
3467		klen -= len;
3468		namlen -= len;
3469		si = t->next;
3470	}
3471
3472	return (klen - namlen);
3473}
3474
3475
3476/*
3477 *	ciGetLeafPrefixKey()
3478 *
3479 * function: compute prefix of suffix compression
3480 *	     from two adjacent leaf entries
3481 *	     across page boundary
3482 *
3483 * return: non-zero on error
3484 *
3485 */
3486static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3487			       int ri, struct component_name * key, int flag)
3488{
3489	int klen, namlen;
3490	wchar_t *pl, *pr, *kname;
3491	struct component_name lkey;
3492	struct component_name rkey;
3493
3494	lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3495					GFP_KERNEL);
3496	if (lkey.name == NULL)
3497		return -ENOMEM;
3498
3499	rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3500					GFP_KERNEL);
3501	if (rkey.name == NULL) {
3502		kfree(lkey.name);
3503		return -ENOMEM;
3504	}
3505
3506	/* get left and right key */
3507	dtGetKey(lp, li, &lkey, flag);
3508	lkey.name[lkey.namlen] = 0;
3509
3510	if ((flag & JFS_OS2) == JFS_OS2)
3511		ciToUpper(&lkey);
3512
3513	dtGetKey(rp, ri, &rkey, flag);
3514	rkey.name[rkey.namlen] = 0;
3515
3516
3517	if ((flag & JFS_OS2) == JFS_OS2)
3518		ciToUpper(&rkey);
3519
3520	/* compute prefix */
3521	klen = 0;
3522	kname = key->name;
3523	namlen = min(lkey.namlen, rkey.namlen);
3524	for (pl = lkey.name, pr = rkey.name;
3525	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3526		*kname = *pr;
3527		if (*pl != *pr) {
3528			key->namlen = klen + 1;
3529			goto free_names;
3530		}
3531	}
3532
3533	/* l->namlen <= r->namlen since l <= r */
3534	if (lkey.namlen < rkey.namlen) {
3535		*kname = *pr;
3536		key->namlen = klen + 1;
3537	} else			/* l->namelen == r->namelen */
3538		key->namlen = klen;
3539
3540free_names:
3541	kfree(lkey.name);
3542	kfree(rkey.name);
3543	return 0;
3544}
3545
3546
3547
3548/*
3549 *	dtGetKey()
3550 *
3551 * function: get key of the entry
3552 */
3553static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3554		     struct component_name * key, int flag)
3555{
3556	int si;
3557	s8 *stbl;
3558	struct ldtentry *lh;
3559	struct idtentry *ih;
3560	struct dtslot *t;
3561	int namlen, len;
3562	wchar_t *kname;
3563	__le16 *name;
3564
3565	/* get entry */
3566	stbl = DT_GETSTBL(p);
3567	si = stbl[i];
3568	if (p->header.flag & BT_LEAF) {
3569		lh = (struct ldtentry *) & p->slot[si];
3570		si = lh->next;
3571		namlen = lh->namlen;
3572		name = lh->name;
3573		if (flag & JFS_DIR_INDEX)
3574			len = min(namlen, DTLHDRDATALEN);
3575		else
3576			len = min(namlen, DTLHDRDATALEN_LEGACY);
3577	} else {
3578		ih = (struct idtentry *) & p->slot[si];
3579		si = ih->next;
3580		namlen = ih->namlen;
3581		name = ih->name;
3582		len = min(namlen, DTIHDRDATALEN);
3583	}
3584
3585	key->namlen = namlen;
3586	kname = key->name;
3587
3588	/*
3589	 * move head/only segment
3590	 */
3591	UniStrncpy_from_le(kname, name, len);
3592
3593	/*
3594	 * move additional segment(s)
3595	 */
3596	while (si >= 0) {
3597		/* get next segment */
3598		t = &p->slot[si];
3599		kname += len;
3600		namlen -= len;
3601		len = min(namlen, DTSLOTDATALEN);
3602		UniStrncpy_from_le(kname, t->name, len);
3603
3604		si = t->next;
3605	}
3606}
3607
3608
3609/*
3610 *	dtInsertEntry()
3611 *
3612 * function: allocate free slot(s) and
3613 *	     write a leaf/internal entry
3614 *
3615 * return: entry slot index
3616 */
3617static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3618			  ddata_t * data, struct dt_lock ** dtlock)
3619{
3620	struct dtslot *h, *t;
3621	struct ldtentry *lh = NULL;
3622	struct idtentry *ih = NULL;
3623	int hsi, fsi, klen, len, nextindex;
3624	wchar_t *kname;
3625	__le16 *name;
3626	s8 *stbl;
3627	pxd_t *xd;
3628	struct dt_lock *dtlck = *dtlock;
3629	struct lv *lv;
3630	int xsi, n;
3631	s64 bn = 0;
3632	struct metapage *mp = NULL;
3633
3634	klen = key->namlen;
3635	kname = key->name;
3636
3637	/* allocate a free slot */
3638	hsi = fsi = p->header.freelist;
3639	h = &p->slot[fsi];
3640	p->header.freelist = h->next;
3641	--p->header.freecnt;
3642
3643	/* open new linelock */
3644	if (dtlck->index >= dtlck->maxcnt)
3645		dtlck = (struct dt_lock *) txLinelock(dtlck);
3646
3647	lv = & dtlck->lv[dtlck->index];
3648	lv->offset = hsi;
3649
3650	/* write head/only segment */
3651	if (p->header.flag & BT_LEAF) {
3652		lh = (struct ldtentry *) h;
3653		lh->next = h->next;
3654		lh->inumber = cpu_to_le32(data->leaf.ino);
3655		lh->namlen = klen;
3656		name = lh->name;
3657		if (data->leaf.ip) {
3658			len = min(klen, DTLHDRDATALEN);
3659			if (!(p->header.flag & BT_ROOT))
3660				bn = addressPXD(&p->header.self);
3661			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3662							  data->leaf.ip,
3663							  bn, index));
3664		} else
3665			len = min(klen, DTLHDRDATALEN_LEGACY);
3666	} else {
3667		ih = (struct idtentry *) h;
3668		ih->next = h->next;
3669		xd = (pxd_t *) ih;
3670		*xd = data->xd;
3671		ih->namlen = klen;
3672		name = ih->name;
3673		len = min(klen, DTIHDRDATALEN);
3674	}
3675
3676	UniStrncpy_to_le(name, kname, len);
3677
3678	n = 1;
3679	xsi = hsi;
3680
3681	/* write additional segment(s) */
3682	t = h;
3683	klen -= len;
3684	while (klen) {
3685		/* get free slot */
3686		fsi = p->header.freelist;
3687		t = &p->slot[fsi];
3688		p->header.freelist = t->next;
3689		--p->header.freecnt;
3690
3691		/* is next slot contiguous ? */
3692		if (fsi != xsi + 1) {
3693			/* close current linelock */
3694			lv->length = n;
3695			dtlck->index++;
3696
3697			/* open new linelock */
3698			if (dtlck->index < dtlck->maxcnt)
3699				lv++;
3700			else {
3701				dtlck = (struct dt_lock *) txLinelock(dtlck);
3702				lv = & dtlck->lv[0];
3703			}
3704
3705			lv->offset = fsi;
3706			n = 0;
3707		}
3708
3709		kname += len;
3710		len = min(klen, DTSLOTDATALEN);
3711		UniStrncpy_to_le(t->name, kname, len);
3712
3713		n++;
3714		xsi = fsi;
3715		klen -= len;
3716	}
3717
3718	/* close current linelock */
3719	lv->length = n;
3720	dtlck->index++;
3721
3722	*dtlock = dtlck;
3723
3724	/* terminate last/only segment */
3725	if (h == t) {
3726		/* single segment entry */
3727		if (p->header.flag & BT_LEAF)
3728			lh->next = -1;
3729		else
3730			ih->next = -1;
3731	} else
3732		/* multi-segment entry */
3733		t->next = -1;
3734
3735	/* if insert into middle, shift right succeeding entries in stbl */
3736	stbl = DT_GETSTBL(p);
3737	nextindex = p->header.nextindex;
3738	if (index < nextindex) {
3739		memmove(stbl + index + 1, stbl + index, nextindex - index);
3740
3741		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
3742			s64 lblock;
3743
3744			/*
3745			 * Need to update slot number for entries that moved
3746			 * in the stbl
3747			 */
3748			mp = NULL;
3749			for (n = index + 1; n <= nextindex; n++) {
3750				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
3751				modify_index(data->leaf.tid, data->leaf.ip,
3752					     le32_to_cpu(lh->index), bn, n,
3753					     &mp, &lblock);
3754			}
3755			if (mp)
3756				release_metapage(mp);
3757		}
3758	}
3759
3760	stbl[index] = hsi;
3761
3762	/* advance next available entry index of stbl */
3763	++p->header.nextindex;
3764}
3765
3766
3767/*
3768 *	dtMoveEntry()
3769 *
3770 * function: move entries from split/left page to new/right page
3771 *
3772 *	nextindex of dst page and freelist/freecnt of both pages
3773 *	are updated.
3774 */
3775static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
3776			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
3777			int do_index)
3778{
3779	int ssi, next;		/* src slot index */
3780	int di;			/* dst entry index */
3781	int dsi;		/* dst slot index */
3782	s8 *sstbl, *dstbl;	/* sorted entry table */
3783	int snamlen, len;
3784	struct ldtentry *slh, *dlh = NULL;
3785	struct idtentry *sih, *dih = NULL;
3786	struct dtslot *h, *s, *d;
3787	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
3788	struct lv *slv, *dlv;
3789	int xssi, ns, nd;
3790	int sfsi;
3791
3792	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
3793	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
3794
3795	dsi = dp->header.freelist;	/* first (whole page) free slot */
3796	sfsi = sp->header.freelist;
3797
3798	/* linelock destination entry slot */
3799	dlv = & ddtlck->lv[ddtlck->index];
3800	dlv->offset = dsi;
3801
3802	/* linelock source entry slot */
3803	slv = & sdtlck->lv[sdtlck->index];
3804	slv->offset = sstbl[si];
3805	xssi = slv->offset - 1;
3806
3807	/*
3808	 * move entries
3809	 */
3810	ns = nd = 0;
3811	for (di = 0; si < sp->header.nextindex; si++, di++) {
3812		ssi = sstbl[si];
3813		dstbl[di] = dsi;
3814
3815		/* is next slot contiguous ? */
3816		if (ssi != xssi + 1) {
3817			/* close current linelock */
3818			slv->length = ns;
3819			sdtlck->index++;
3820
3821			/* open new linelock */
3822			if (sdtlck->index < sdtlck->maxcnt)
3823				slv++;
3824			else {
3825				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
3826				slv = & sdtlck->lv[0];
3827			}
3828
3829			slv->offset = ssi;
3830			ns = 0;
3831		}
3832
3833		/*
3834		 * move head/only segment of an entry
3835		 */
3836		/* get dst slot */
3837		h = d = &dp->slot[dsi];
3838
3839		/* get src slot and move */
3840		s = &sp->slot[ssi];
3841		if (sp->header.flag & BT_LEAF) {
3842			/* get source entry */
3843			slh = (struct ldtentry *) s;
3844			dlh = (struct ldtentry *) h;
3845			snamlen = slh->namlen;
3846
3847			if (do_index) {
3848				len = min(snamlen, DTLHDRDATALEN);
3849				dlh->index = slh->index; /* little-endian */
3850			} else
3851				len = min(snamlen, DTLHDRDATALEN_LEGACY);
3852
3853			memcpy(dlh, slh, 6 + len * 2);
3854
3855			next = slh->next;
3856
3857			/* update dst head/only segment next field */
3858			dsi++;
3859			dlh->next = dsi;
3860		} else {
3861			sih = (struct idtentry *) s;
3862			snamlen = sih->namlen;
3863
3864			len = min(snamlen, DTIHDRDATALEN);
3865			dih = (struct idtentry *) h;
3866			memcpy(dih, sih, 10 + len * 2);
3867			next = sih->next;
3868
3869			dsi++;
3870			dih->next = dsi;
3871		}
3872
3873		/* free src head/only segment */
3874		s->next = sfsi;
3875		s->cnt = 1;
3876		sfsi = ssi;
3877
3878		ns++;
3879		nd++;
3880		xssi = ssi;
3881
3882		/*
3883		 * move additional segment(s) of the entry
3884		 */
3885		snamlen -= len;
3886		while ((ssi = next) >= 0) {
3887			/* is next slot contiguous ? */
3888			if (ssi != xssi + 1) {
3889				/* close current linelock */
3890				slv->length = ns;
3891				sdtlck->index++;
3892
3893				/* open new linelock */
3894				if (sdtlck->index < sdtlck->maxcnt)
3895					slv++;
3896				else {
3897					sdtlck =
3898					    (struct dt_lock *)
3899					    txLinelock(sdtlck);
3900					slv = & sdtlck->lv[0];
3901				}
3902
3903				slv->offset = ssi;
3904				ns = 0;
3905			}
3906
3907			/* get next source segment */
3908			s = &sp->slot[ssi];
3909
3910			/* get next destination free slot */
3911			d++;
3912
3913			len = min(snamlen, DTSLOTDATALEN);
3914			UniStrncpy_le(d->name, s->name, len);
3915
3916			ns++;
3917			nd++;
3918			xssi = ssi;
3919
3920			dsi++;
3921			d->next = dsi;
3922
3923			/* free source segment */
3924			next = s->next;
3925			s->next = sfsi;
3926			s->cnt = 1;
3927			sfsi = ssi;
3928
3929			snamlen -= len;
3930		}		/* end while */
3931
3932		/* terminate dst last/only segment */
3933		if (h == d) {
3934			/* single segment entry */
3935			if (dp->header.flag & BT_LEAF)
3936				dlh->next = -1;
3937			else
3938				dih->next = -1;
3939		} else
3940			/* multi-segment entry */
3941			d->next = -1;
3942	}			/* end for */
3943
3944	/* close current linelock */
3945	slv->length = ns;
3946	sdtlck->index++;
3947	*sdtlock = sdtlck;
3948
3949	dlv->length = nd;
3950	ddtlck->index++;
3951	*ddtlock = ddtlck;
3952
3953	/* update source header */
3954	sp->header.freelist = sfsi;
3955	sp->header.freecnt += nd;
3956
3957	/* update destination header */
3958	dp->header.nextindex = di;
3959
3960	dp->header.freelist = dsi;
3961	dp->header.freecnt -= nd;
3962}
3963
3964
3965/*
3966 *	dtDeleteEntry()
3967 *
3968 * function: free a (leaf/internal) entry
3969 *
3970 * log freelist header, stbl, and each segment slot of entry
3971 * (even though last/only segment next field is modified,
3972 * physical image logging requires all segment slots of
3973 * the entry logged to avoid applying previous updates
3974 * to the same slots)
3975 */
3976static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
3977{
3978	int fsi;		/* free entry slot index */
3979	s8 *stbl;
3980	struct dtslot *t;
3981	int si, freecnt;
3982	struct dt_lock *dtlck = *dtlock;
3983	struct lv *lv;
3984	int xsi, n;
3985
3986	/* get free entry slot index */
3987	stbl = DT_GETSTBL(p);
3988	fsi = stbl[fi];
3989
3990	/* open new linelock */
3991	if (dtlck->index >= dtlck->maxcnt)
3992		dtlck = (struct dt_lock *) txLinelock(dtlck);
3993	lv = & dtlck->lv[dtlck->index];
3994
3995	lv->offset = fsi;
3996
3997	/* get the head/only segment */
3998	t = &p->slot[fsi];
3999	if (p->header.flag & BT_LEAF)
4000		si = ((struct ldtentry *) t)->next;
4001	else
4002		si = ((struct idtentry *) t)->next;
4003	t->next = si;
4004	t->cnt = 1;
4005
4006	n = freecnt = 1;
4007	xsi = fsi;
4008
4009	/* find the last/only segment */
4010	while (si >= 0) {
4011		/* is next slot contiguous ? */
4012		if (si != xsi + 1) {
4013			/* close current linelock */
4014			lv->length = n;
4015			dtlck->index++;
4016
4017			/* open new linelock */
4018			if (dtlck->index < dtlck->maxcnt)
4019				lv++;
4020			else {
4021				dtlck = (struct dt_lock *) txLinelock(dtlck);
4022				lv = & dtlck->lv[0];
4023			}
4024
4025			lv->offset = si;
4026			n = 0;
4027		}
4028
4029		n++;
4030		xsi = si;
4031		freecnt++;
4032
4033		t = &p->slot[si];
4034		t->cnt = 1;
4035		si = t->next;
4036	}
4037
4038	/* close current linelock */
4039	lv->length = n;
4040	dtlck->index++;
4041
4042	*dtlock = dtlck;
4043
4044	/* update freelist */
4045	t->next = p->header.freelist;
4046	p->header.freelist = fsi;
4047	p->header.freecnt += freecnt;
4048
4049	/* if delete from middle,
4050	 * shift left the succedding entries in the stbl
4051	 */
4052	si = p->header.nextindex;
4053	if (fi < si - 1)
4054		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4055
4056	p->header.nextindex--;
4057}
4058
4059
4060/*
4061 *	dtTruncateEntry()
4062 *
4063 * function: truncate a (leaf/internal) entry
4064 *
4065 * log freelist header, stbl, and each segment slot of entry
4066 * (even though last/only segment next field is modified,
4067 * physical image logging requires all segment slots of
4068 * the entry logged to avoid applying previous updates
4069 * to the same slots)
4070 */
4071static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4072{
4073	int tsi;		/* truncate entry slot index */
4074	s8 *stbl;
4075	struct dtslot *t;
4076	int si, freecnt;
4077	struct dt_lock *dtlck = *dtlock;
4078	struct lv *lv;
4079	int fsi, xsi, n;
4080
4081	/* get free entry slot index */
4082	stbl = DT_GETSTBL(p);
4083	tsi = stbl[ti];
4084
4085	/* open new linelock */
4086	if (dtlck->index >= dtlck->maxcnt)
4087		dtlck = (struct dt_lock *) txLinelock(dtlck);
4088	lv = & dtlck->lv[dtlck->index];
4089
4090	lv->offset = tsi;
4091
4092	/* get the head/only segment */
4093	t = &p->slot[tsi];
4094	ASSERT(p->header.flag & BT_INTERNAL);
4095	((struct idtentry *) t)->namlen = 0;
4096	si = ((struct idtentry *) t)->next;
4097	((struct idtentry *) t)->next = -1;
4098
4099	n = 1;
4100	freecnt = 0;
4101	fsi = si;
4102	xsi = tsi;
4103
4104	/* find the last/only segment */
4105	while (si >= 0) {
4106		/* is next slot contiguous ? */
4107		if (si != xsi + 1) {
4108			/* close current linelock */
4109			lv->length = n;
4110			dtlck->index++;
4111
4112			/* open new linelock */
4113			if (dtlck->index < dtlck->maxcnt)
4114				lv++;
4115			else {
4116				dtlck = (struct dt_lock *) txLinelock(dtlck);
4117				lv = & dtlck->lv[0];
4118			}
4119
4120			lv->offset = si;
4121			n = 0;
4122		}
4123
4124		n++;
4125		xsi = si;
4126		freecnt++;
4127
4128		t = &p->slot[si];
4129		t->cnt = 1;
4130		si = t->next;
4131	}
4132
4133	/* close current linelock */
4134	lv->length = n;
4135	dtlck->index++;
4136
4137	*dtlock = dtlck;
4138
4139	/* update freelist */
4140	if (freecnt == 0)
4141		return;
4142	t->next = p->header.freelist;
4143	p->header.freelist = fsi;
4144	p->header.freecnt += freecnt;
4145}
4146
4147
4148/*
4149 *	dtLinelockFreelist()
4150 */
4151static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4152			       int m,	/* max slot index */
4153			       struct dt_lock ** dtlock)
4154{
4155	int fsi;		/* free entry slot index */
4156	struct dtslot *t;
4157	int si;
4158	struct dt_lock *dtlck = *dtlock;
4159	struct lv *lv;
4160	int xsi, n;
4161
4162	/* get free entry slot index */
4163	fsi = p->header.freelist;
4164
4165	/* open new linelock */
4166	if (dtlck->index >= dtlck->maxcnt)
4167		dtlck = (struct dt_lock *) txLinelock(dtlck);
4168	lv = & dtlck->lv[dtlck->index];
4169
4170	lv->offset = fsi;
4171
4172	n = 1;
4173	xsi = fsi;
4174
4175	t = &p->slot[fsi];
4176	si = t->next;
4177
4178	/* find the last/only segment */
4179	while (si < m && si >= 0) {
4180		/* is next slot contiguous ? */
4181		if (si != xsi + 1) {
4182			/* close current linelock */
4183			lv->length = n;
4184			dtlck->index++;
4185
4186			/* open new linelock */
4187			if (dtlck->index < dtlck->maxcnt)
4188				lv++;
4189			else {
4190				dtlck = (struct dt_lock *) txLinelock(dtlck);
4191				lv = & dtlck->lv[0];
4192			}
4193
4194			lv->offset = si;
4195			n = 0;
4196		}
4197
4198		n++;
4199		xsi = si;
4200
4201		t = &p->slot[si];
4202		si = t->next;
4203	}
4204
4205	/* close current linelock */
4206	lv->length = n;
4207	dtlck->index++;
4208
4209	*dtlock = dtlck;
4210}
4211
4212
4213/*
4214 * NAME: dtModify
4215 *
4216 * FUNCTION: Modify the inode number part of a directory entry
4217 *
4218 * PARAMETERS:
4219 *	tid	- Transaction id
4220 *	ip	- Inode of parent directory
4221 *	key	- Name of entry to be modified
4222 *	orig_ino	- Original inode number expected in entry
4223 *	new_ino	- New inode number to put into entry
4224 *	flag	- JFS_RENAME
4225 *
4226 * RETURNS:
4227 *	-ESTALE	- If entry found does not match orig_ino passed in
4228 *	-ENOENT	- If no entry can be found to match key
4229 *	0	- If successfully modified entry
4230 */
4231int dtModify(tid_t tid, struct inode *ip,
4232	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4233{
4234	int rc;
4235	s64 bn;
4236	struct metapage *mp;
4237	dtpage_t *p;
4238	int index;
4239	struct btstack btstack;
4240	struct tlock *tlck;
4241	struct dt_lock *dtlck;
4242	struct lv *lv;
4243	s8 *stbl;
4244	int entry_si;		/* entry slot index */
4245	struct ldtentry *entry;
4246
4247	/*
4248	 *	search for the entry to modify:
4249	 *
4250	 * dtSearch() returns (leaf page pinned, index at which to modify).
4251	 */
4252	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4253		return rc;
4254
4255	/* retrieve search result */
4256	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4257
4258	BT_MARK_DIRTY(mp, ip);
4259	/*
4260	 * acquire a transaction lock on the leaf page of named entry
4261	 */
4262	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4263	dtlck = (struct dt_lock *) & tlck->lock;
4264
4265	/* get slot index of the entry */
4266	stbl = DT_GETSTBL(p);
4267	entry_si = stbl[index];
4268
4269	/* linelock entry */
4270	ASSERT(dtlck->index == 0);
4271	lv = & dtlck->lv[0];
4272	lv->offset = entry_si;
4273	lv->length = 1;
4274	dtlck->index++;
4275
4276	/* get the head/only segment */
4277	entry = (struct ldtentry *) & p->slot[entry_si];
4278
4279	/* substitute the inode number of the entry */
4280	entry->inumber = cpu_to_le32(new_ino);
4281
4282	/* unpin the leaf page */
4283	DT_PUTPAGE(mp);
4284
4285	return 0;
4286}