   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  fs/ext4/extents_status.c
   4 *
   5 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
   6 * Modified by
   7 *	Allison Henderson <achender@linux.vnet.ibm.com>
   8 *	Hugh Dickins <hughd@google.com>
   9 *	Zheng Liu <wenqing.lz@taobao.com>
  10 *
  11 * Ext4 extents status tree core functions.
  12 */
  13#include <linux/list_sort.h>
  14#include <linux/proc_fs.h>
  15#include <linux/seq_file.h>
  16#include "ext4.h"
  17
  18#include <trace/events/ext4.h>
  19
  20/*
  21 * According to a previous discussion at the Ext4 Developer Workshop, we
  22 * will introduce a new structure called an io tree to track all extent
  23 * status in order to solve some problems that we have met
  24 * (e.g. reservation space warnings), and to provide extent-level locking.
  25 * The delay extent tree is the first step toward this goal.  It was
  26 * originally built by Yongqiang Yang.  At that time it was called the
  27 * delay extent tree, whose only goal was to track delayed extents in
  28 * memory to simplify the implementation of fiemap and bigalloc, and to
  29 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it was still
  30 * called the delay extent tree in the first commit.  But to better reflect
  31 * what it does, it has since been renamed the extent status tree.
  32 *
  33 * Step1:
  34 * Currently the first step has been done.  All delayed extents are
  35 * tracked in the tree.  The tree maintains a delayed extent from the time
  36 * a delayed allocation is issued until the extent is written out or
  37 * invalidated.  Therefore the implementations of fiemap and bigalloc
  38 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
  39 *
  40 * The following comment describes the implementation of the extent
  41 * status tree and future work.
  42 *
  43 * Step2:
  44 * In this step all extent status is tracked by the extent status tree.
  45 * Thus, we can first try to look up a block mapping in this tree before
  46 * finding it in the extent tree.  Hence, the single extent cache can be
  47 * removed because the extent status tree can do a better job.  Extents in
  48 * the status tree are loaded on demand.  Therefore, the extent status tree
  49 * may not contain all of the extents in a file.  Meanwhile we define a
  50 * shrinker to reclaim memory from the extent status tree because a
  51 * fragmented extent tree will make the status tree cost too much memory.
  52 * Written/unwritten/hole extents in the tree will be reclaimed by this
  53 * shrinker when we are under high memory pressure.  Delayed extents will
  54 * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
  55 */
  56
  57/*
  58 * Extent status tree implementation for ext4.
  59 *
  60 *
  61 * ==========================================================================
  62 * Extent status tree tracks all extent status.
  63 *
  64 * 1. Why do we need to implement an extent status tree?
  65 *
  66 * Without an extent status tree, ext4 identifies a delayed extent by
  67 * looking up the page cache; this has several deficiencies: complicated,
  68 * buggy, and inefficient code.
  69 *
  70 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
  71 * block or a range of blocks belongs to a delayed extent.
  72 *
  73 * Let us have a look at how they work without an extent status tree.
  74 *   --	FIEMAP
  75 *	FIEMAP looks up the page cache to distinguish delayed allocations from holes.
  76 *
  77 *   --	SEEK_HOLE/DATA
  78 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
  79 *
  80 *   --	bigalloc
  81 *	bigalloc looks up the page cache to figure out whether a block is
  82 *	already under delayed allocation or not, to determine whether
  83 *	quota reservation is needed for the cluster.
  84 *
  85 *   --	writeout
  86 *	Writeout looks up the whole page cache to see if a buffer is
  87 *	mapped.  Even if there are not very many delayed buffers, this is
  88 *	time consuming.
  89 *
  90 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
  91 * bigalloc and writeout can figure out whether a block or a range of
  92 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
  93 * or not by searching the extent status tree.
  94 *
  95 *
  96 * ==========================================================================
  97 * 2. Ext4 extent status tree implementation
  98 *
  99 *   --	extent
 100 *	An extent is a range of blocks which are contiguous logically and
 101 *	physically.  Unlike an extent in the extent tree, this extent is an
 102 *	in-memory struct; there is no corresponding on-disk data.  There
 103 *	is no limit on the length of an extent, so an extent can contain as
 104 *	many blocks as are contiguous logically and physically.
 105 *
 106 *   --	extent status tree
 107 *	Every inode has an extent status tree and all allocated blocks
 108 *	are added to the tree with their status.  The extents in the
 109 *	tree are ordered by logical block number.
 110 *
 111 *   --	operations on an extent status tree
 112 *	There are three important operations on an extent status tree: finding
 113 *	the next extent, adding an extent (a range of blocks) and removing one.
 114 *
 115 *   --	race on an extent status tree
 116 *	The extent status tree is protected by inode->i_es_lock.
 117 *
 118 *   --	memory consumption
 119 *      A fragmented extent tree will make the extent status tree cost too
 120 *      much memory.  Hence, we will reclaim written/unwritten/hole extents
 121 *      from the tree under heavy memory pressure.
 122 *
 123 *
 124 * ==========================================================================
 125 * 3. Performance analysis
 126 *
 127 *   --	overhead
 128 *	1. There is a cached extent for write access, so if writes are
 129 *	not very random, adding-space operations run in O(1) time.
 130 *
 131 *   --	gain
 132 *	2. Code is much simpler, more readable, more maintainable and
 133 *	more efficient.
 134 *
 135 *
 136 * ==========================================================================
 137 * 4. TODO list
 138 *
 139 *   -- Refactor delayed space reservation
 140 *
 141 *   -- Extent-level locking
 142 */
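/*
 * Illustrative sketch: a hypothetical mapping-layer caller driving the
 * three core operations described above (lookup, insert, remove) through
 * the public helpers defined in this file.  The helper name and the
 * simplified locking/error handling are assumptions for illustration
 * only; real callers such as ext4_map_blocks() hold i_data_sem and
 * handle failures properly.
 */
#ifdef ES_EXAMPLE_SKETCH__
static int example_es_core_ops(struct inode *inode, ext4_lblk_t lblk,
			       ext4_lblk_t len, ext4_fsblk_t pblk)
{
	struct extent_status es;
	ext4_lblk_t next;
	int err;

	/* 1. Try the in-memory status tree before the on-disk extent tree. */
	if (ext4_es_lookup_extent(inode, lblk, &next, &es))
		return 0;	/* cache hit, es describes the mapping */

	/* 2. Record a freshly resolved mapping as a written extent. */
	err = ext4_es_insert_extent(inode, lblk, len, pblk,
				    EXTENT_STATUS_WRITTEN);
	if (err)
		return err;

	/* 3. Drop the cached range again, e.g. on truncate or punch hole. */
	return ext4_es_remove_extent(inode, lblk, len);
}
#endif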
 143
 144static struct kmem_cache *ext4_es_cachep;
 145static struct kmem_cache *ext4_pending_cachep;
 146
 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 149			      ext4_lblk_t end, int *reserved);
 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 152		       struct ext4_inode_info *locked_ei);
 153static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
 154			     ext4_lblk_t len);
 155
 156int __init ext4_init_es(void)
 157{
 158	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
 159	if (ext4_es_cachep == NULL)
 160		return -ENOMEM;
 161	return 0;
 162}
 163
 164void ext4_exit_es(void)
 165{
 166	kmem_cache_destroy(ext4_es_cachep);
 167}
 168
 169void ext4_es_init_tree(struct ext4_es_tree *tree)
 170{
 171	tree->root = RB_ROOT;
 172	tree->cache_es = NULL;
 173}
 174
 175#ifdef ES_DEBUG__
 176static void ext4_es_print_tree(struct inode *inode)
 177{
 178	struct ext4_es_tree *tree;
 179	struct rb_node *node;
 180
 181	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
 182	tree = &EXT4_I(inode)->i_es_tree;
 183	node = rb_first(&tree->root);
 184	while (node) {
 185		struct extent_status *es;
 186		es = rb_entry(node, struct extent_status, rb_node);
 187		printk(KERN_DEBUG " [%u/%u) %llu %x",
 188		       es->es_lblk, es->es_len,
 189		       ext4_es_pblock(es), ext4_es_status(es));
 190		node = rb_next(node);
 191	}
 192	printk(KERN_DEBUG "\n");
 193}
 194#else
 195#define ext4_es_print_tree(inode)
 196#endif
 197
 198static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
 199{
 200	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
 201	return es->es_lblk + es->es_len - 1;
 202}
 203
 204/*
 205 * Search through the tree for an extent at the given offset.  If
 206 * it can't be found, try to find the next extent.
 207 */
 208static struct extent_status *__es_tree_search(struct rb_root *root,
 209					      ext4_lblk_t lblk)
 210{
 211	struct rb_node *node = root->rb_node;
 212	struct extent_status *es = NULL;
 213
 214	while (node) {
 215		es = rb_entry(node, struct extent_status, rb_node);
 216		if (lblk < es->es_lblk)
 217			node = node->rb_left;
 218		else if (lblk > ext4_es_end(es))
 219			node = node->rb_right;
 220		else
 221			return es;
 222	}
 223
 224	if (es && lblk < es->es_lblk)
 225		return es;
 226
 227	if (es && lblk > ext4_es_end(es)) {
 228		node = rb_next(&es->rb_node);
 229		return node ? rb_entry(node, struct extent_status, rb_node) :
 230			      NULL;
 231	}
 232
 233	return NULL;
 234}
 235
 236/*
 237 * ext4_es_find_extent_range - find extent with specified status within block
 238 *                             range or next extent following block range in
 239 *                             extents status tree
 240 *
 241 * @inode - file containing the range
 242 * @matching_fn - pointer to function that matches extents with desired status
 243 * @lblk - logical block defining start of range
 244 * @end - logical block defining end of range
 245 * @es - extent found, if any
 246 *
 247 * Find the first extent within the block range specified by @lblk and @end
 248 * in the extents status tree that satisfies @matching_fn.  If a match
 249 * is found, it's returned in @es.  If not, and a matching extent is found
 250 * beyond the block range, it's returned in @es.  If no match is found, an
 251 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
 252 * are 0.
 253 */
 254static void __es_find_extent_range(struct inode *inode,
 255				   int (*matching_fn)(struct extent_status *es),
 256				   ext4_lblk_t lblk, ext4_lblk_t end,
 257				   struct extent_status *es)
 258{
 259	struct ext4_es_tree *tree = NULL;
 260	struct extent_status *es1 = NULL;
 261	struct rb_node *node;
 262
 263	WARN_ON(es == NULL);
 264	WARN_ON(end < lblk);
 265
 266	tree = &EXT4_I(inode)->i_es_tree;
 267
 268	/* see if the extent has been cached */
 269	es->es_lblk = es->es_len = es->es_pblk = 0;
 270	if (tree->cache_es) {
 271		es1 = tree->cache_es;
 272		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
 273			es_debug("%u cached by [%u/%u) %llu %x\n",
 274				 lblk, es1->es_lblk, es1->es_len,
 275				 ext4_es_pblock(es1), ext4_es_status(es1));
 276			goto out;
 277		}
 278	}
 279
 280	es1 = __es_tree_search(&tree->root, lblk);
 281
 282out:
 283	if (es1 && !matching_fn(es1)) {
 284		while ((node = rb_next(&es1->rb_node)) != NULL) {
 285			es1 = rb_entry(node, struct extent_status, rb_node);
 286			if (es1->es_lblk > end) {
 287				es1 = NULL;
 288				break;
 289			}
 290			if (matching_fn(es1))
 291				break;
 292		}
 293	}
 294
 295	if (es1 && matching_fn(es1)) {
 296		tree->cache_es = es1;
 297		es->es_lblk = es1->es_lblk;
 298		es->es_len = es1->es_len;
 299		es->es_pblk = es1->es_pblk;
 300	}
 301
 302}
 303
 304/*
 305 * Locking for __es_find_extent_range() for external use
 306 */
 307void ext4_es_find_extent_range(struct inode *inode,
 308			       int (*matching_fn)(struct extent_status *es),
 309			       ext4_lblk_t lblk, ext4_lblk_t end,
 310			       struct extent_status *es)
 311{
 312	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 313		return;
 314
 315	trace_ext4_es_find_extent_range_enter(inode, lblk);
 316
 317	read_lock(&EXT4_I(inode)->i_es_lock);
 318	__es_find_extent_range(inode, matching_fn, lblk, end, es);
 319	read_unlock(&EXT4_I(inode)->i_es_lock);
 320
 321	trace_ext4_es_find_extent_range_exit(inode, es);
 322}
 323
 324/*
 325 * __es_scan_range - search block range for block with specified status
 326 *                   in extents status tree
 327 *
 328 * @inode - file containing the range
 329 * @matching_fn - pointer to function that matches extents with desired status
 330 * @lblk - logical block defining start of range
 331 * @end - logical block defining end of range
 332 *
 333 * Returns true if at least one block in the specified block range satisfies
 334 * the criterion specified by @matching_fn, and false if not.  If at least
 335 * one extent has the specified status, then there is at least one block
 336 * in the range with that status.  Should only be called by code that has
 337 * taken i_es_lock.
 338 */
 339static bool __es_scan_range(struct inode *inode,
 340			    int (*matching_fn)(struct extent_status *es),
 341			    ext4_lblk_t start, ext4_lblk_t end)
 342{
 343	struct extent_status es;
 344
 345	__es_find_extent_range(inode, matching_fn, start, end, &es);
 346	if (es.es_len == 0)
 347		return false;   /* no matching extent in the tree */
 348	else if (es.es_lblk <= start &&
 349		 start < es.es_lblk + es.es_len)
 350		return true;
 351	else if (start <= es.es_lblk && es.es_lblk <= end)
 352		return true;
 353	else
 354		return false;
 355}
 356/*
 357 * Locking for __es_scan_range() for external use
 358 */
 359bool ext4_es_scan_range(struct inode *inode,
 360			int (*matching_fn)(struct extent_status *es),
 361			ext4_lblk_t lblk, ext4_lblk_t end)
 362{
 363	bool ret;
 364
 365	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 366		return false;
 367
 368	read_lock(&EXT4_I(inode)->i_es_lock);
 369	ret = __es_scan_range(inode, matching_fn, lblk, end);
 370	read_unlock(&EXT4_I(inode)->i_es_lock);
 371
 372	return ret;
 373}
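/*
 * Illustrative sketch: the matching_fn hook lets a caller query a single
 * status.  The two helpers below are hypothetical; any function with this
 * signature can be handed to ext4_es_find_extent_range(),
 * ext4_es_scan_range() or ext4_es_scan_clu().
 */
#ifdef ES_EXAMPLE_SKETCH__
static int example_match_delayed(struct extent_status *es)
{
	return ext4_es_is_delayed(es);
}

static bool example_range_has_delayed(struct inode *inode,
				      ext4_lblk_t lblk, ext4_lblk_t end)
{
	return ext4_es_scan_range(inode, &example_match_delayed, lblk, end);
}
#endif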
 374
 375/*
 376 * __es_scan_clu - search cluster for block with specified status in
 377 *                 extents status tree
 378 *
 379 * @inode - file containing the cluster
 380 * @matching_fn - pointer to function that matches extents with desired status
 381 * @lblk - logical block in cluster to be searched
 382 *
 383 * Returns true if at least one extent in the cluster containing @lblk
 384 * satisfies the criterion specified by @matching_fn, and false if not.  If at
 385 * least one extent has the specified status, then there is at least one block
 386 * in the cluster with that status.  Should only be called by code that has
 387 * taken i_es_lock.
 388 */
 389static bool __es_scan_clu(struct inode *inode,
 390			  int (*matching_fn)(struct extent_status *es),
 391			  ext4_lblk_t lblk)
 392{
 393	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 394	ext4_lblk_t lblk_start, lblk_end;
 395
 396	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 397	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 398
 399	return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
 400}
 401
 402/*
 403 * Locking for __es_scan_clu() for external use
 404 */
 405bool ext4_es_scan_clu(struct inode *inode,
 406		      int (*matching_fn)(struct extent_status *es),
 407		      ext4_lblk_t lblk)
 408{
 409	bool ret;
 410
 411	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 412		return false;
 413
 414	read_lock(&EXT4_I(inode)->i_es_lock);
 415	ret = __es_scan_clu(inode, matching_fn, lblk);
 416	read_unlock(&EXT4_I(inode)->i_es_lock);
 417
 418	return ret;
 419}
 420
 421static void ext4_es_list_add(struct inode *inode)
 422{
 423	struct ext4_inode_info *ei = EXT4_I(inode);
 424	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 425
 426	if (!list_empty(&ei->i_es_list))
 427		return;
 428
 429	spin_lock(&sbi->s_es_lock);
 430	if (list_empty(&ei->i_es_list)) {
 431		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
 432		sbi->s_es_nr_inode++;
 433	}
 434	spin_unlock(&sbi->s_es_lock);
 435}
 436
 437static void ext4_es_list_del(struct inode *inode)
 438{
 439	struct ext4_inode_info *ei = EXT4_I(inode);
 440	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 441
 442	spin_lock(&sbi->s_es_lock);
 443	if (!list_empty(&ei->i_es_list)) {
 444		list_del_init(&ei->i_es_list);
 445		sbi->s_es_nr_inode--;
 446		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
 447	}
 448	spin_unlock(&sbi->s_es_lock);
 449}
 450
 451static struct extent_status *
 452ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
 453		     ext4_fsblk_t pblk)
 454{
 455	struct extent_status *es;
 456	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
 457	if (es == NULL)
 458		return NULL;
 459	es->es_lblk = lblk;
 460	es->es_len = len;
 461	es->es_pblk = pblk;
 462
 463	/*
 464	 * We don't count delayed extents because we never try to reclaim them
 465	 */
 466	if (!ext4_es_is_delayed(es)) {
 467		if (!EXT4_I(inode)->i_es_shk_nr++)
 468			ext4_es_list_add(inode);
 469		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
 470					s_es_stats.es_stats_shk_cnt);
 471	}
 472
 473	EXT4_I(inode)->i_es_all_nr++;
 474	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
 475
 476	return es;
 477}
 478
 479static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 480{
 481	EXT4_I(inode)->i_es_all_nr--;
 482	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
 483
 484	/* Decrease the shrink counter when this es is not delayed */
 485	if (!ext4_es_is_delayed(es)) {
 486		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
 487		if (!--EXT4_I(inode)->i_es_shk_nr)
 488			ext4_es_list_del(inode);
 489		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
 490					s_es_stats.es_stats_shk_cnt);
 491	}
 492
 493	kmem_cache_free(ext4_es_cachep, es);
 494}
 495
 496/*
 497 * Check whether or not two extents can be merged
 498 * Condition:
 499 *  - logical block number is contiguous
 500 *  - physical block number is contiguous
 501 *  - status is equal
 502 */
 503static int ext4_es_can_be_merged(struct extent_status *es1,
 504				 struct extent_status *es2)
 505{
 506	if (ext4_es_type(es1) != ext4_es_type(es2))
 507		return 0;
 508
 509	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
 510		pr_warn("ES assertion failed when merging extents. "
 511			"The sum of lengths of es1 (%d) and es2 (%d) "
 512			"is bigger than allowed file size (%d)\n",
 513			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
 514		WARN_ON(1);
 515		return 0;
 516	}
 517
 518	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
 519		return 0;
 520
 521	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
 522	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
 523		return 1;
 524
 525	if (ext4_es_is_hole(es1))
 526		return 1;
 527
 528	/* check that the delayed extent does not carry the unwritten status */
 529	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
 530		return 1;
 531
 532	return 0;
 533}
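/*
 * For example, under the conditions above a written extent [0/8) mapped at
 * physical block 100 and a written extent [8/4) mapped at physical block
 * 108 can be merged into a single written extent [0/12) at physical block
 * 100, while a delayed extent never merges with a written one because
 * their types differ.
 */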
 534
 535static struct extent_status *
 536ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
 537{
 538	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 539	struct extent_status *es1;
 540	struct rb_node *node;
 541
 542	node = rb_prev(&es->rb_node);
 543	if (!node)
 544		return es;
 545
 546	es1 = rb_entry(node, struct extent_status, rb_node);
 547	if (ext4_es_can_be_merged(es1, es)) {
 548		es1->es_len += es->es_len;
 549		if (ext4_es_is_referenced(es))
 550			ext4_es_set_referenced(es1);
 551		rb_erase(&es->rb_node, &tree->root);
 552		ext4_es_free_extent(inode, es);
 553		es = es1;
 554	}
 555
 556	return es;
 557}
 558
 559static struct extent_status *
 560ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 561{
 562	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 563	struct extent_status *es1;
 564	struct rb_node *node;
 565
 566	node = rb_next(&es->rb_node);
 567	if (!node)
 568		return es;
 569
 570	es1 = rb_entry(node, struct extent_status, rb_node);
 571	if (ext4_es_can_be_merged(es, es1)) {
 572		es->es_len += es1->es_len;
 573		if (ext4_es_is_referenced(es1))
 574			ext4_es_set_referenced(es);
 575		rb_erase(node, &tree->root);
 576		ext4_es_free_extent(inode, es1);
 577	}
 578
 579	return es;
 580}
 581
 582#ifdef ES_AGGRESSIVE_TEST
 583#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */
 584
 585static void ext4_es_insert_extent_ext_check(struct inode *inode,
 586					    struct extent_status *es)
 587{
 588	struct ext4_ext_path *path = NULL;
 589	struct ext4_extent *ex;
 590	ext4_lblk_t ee_block;
 591	ext4_fsblk_t ee_start;
 592	unsigned short ee_len;
 593	int depth, ee_status, es_status;
 594
 595	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
 596	if (IS_ERR(path))
 597		return;
 598
 599	depth = ext_depth(inode);
 600	ex = path[depth].p_ext;
 601
 602	if (ex) {
 603
 604		ee_block = le32_to_cpu(ex->ee_block);
 605		ee_start = ext4_ext_pblock(ex);
 606		ee_len = ext4_ext_get_actual_len(ex);
 607
 608		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
 609		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
 610
 611		/*
 612		 * Make sure ex and es do not overlap when we try to insert
 613		 * a delayed/hole extent.
 614		 */
 615		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
 616			if (in_range(es->es_lblk, ee_block, ee_len)) {
 617				pr_warn("ES insert assertion failed for "
 618					"inode: %lu we can find an extent "
 619					"at block [%d/%d/%llu/%c], but we "
 620					"want to add a delayed/hole extent "
 621					"[%d/%d/%llu/%x]\n",
 622					inode->i_ino, ee_block, ee_len,
 623					ee_start, ee_status ? 'u' : 'w',
 624					es->es_lblk, es->es_len,
 625					ext4_es_pblock(es), ext4_es_status(es));
 626			}
 627			goto out;
 628		}
 629
 630		/*
 631		 * We don't check ee_block == es->es_lblk, etc. because es
 632		 * might be a part of a whole extent, and vice versa.
 633		 */
 634		if (es->es_lblk < ee_block ||
 635		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
 636			pr_warn("ES insert assertion failed for inode: %lu "
 637				"ex_status [%d/%d/%llu/%c] != "
 638				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
 639				ee_block, ee_len, ee_start,
 640				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
 641				ext4_es_pblock(es), es_status ? 'u' : 'w');
 642			goto out;
 643		}
 644
 645		if (ee_status ^ es_status) {
 646			pr_warn("ES insert assertion failed for inode: %lu "
 647				"ex_status [%d/%d/%llu/%c] != "
 648				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
 649				ee_block, ee_len, ee_start,
 650				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
 651				ext4_es_pblock(es), es_status ? 'u' : 'w');
 652		}
 653	} else {
 654		/*
 655		 * We can't find an extent on disk.  So we need to make sure
 656		 * that we don't want to add a written/unwritten extent.
 657		 */
 658		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
 659			pr_warn("ES insert assertion failed for inode: %lu "
 660				"can't find an extent at block %d but we want "
 661				"to add a written/unwritten extent "
 662				"[%d/%d/%llu/%x]\n", inode->i_ino,
 663				es->es_lblk, es->es_lblk, es->es_len,
 664				ext4_es_pblock(es), ext4_es_status(es));
 665		}
 666	}
 667out:
 668	ext4_free_ext_path(path);
 669}
 670
 671static void ext4_es_insert_extent_ind_check(struct inode *inode,
 672					    struct extent_status *es)
 673{
 674	struct ext4_map_blocks map;
 675	int retval;
 676
 677	/*
 678	 * Here we call ext4_ind_map_blocks to look up a block mapping because
 679	 * the 'Indirect' structure is defined in indirect.c, so we can't
 680	 * access the direct/indirect tree from outside it.  It would be too
 681	 * ugly to define this function in the indirect.c file.
 682	 */
 683
 684	map.m_lblk = es->es_lblk;
 685	map.m_len = es->es_len;
 686
 687	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
 688	if (retval > 0) {
 689		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
 690			/*
 691			 * We want to add a delayed/hole extent but this
 692			 * block has been allocated.
 693			 */
 694			pr_warn("ES insert assertion failed for inode: %lu "
 695				"We can find blocks but we want to add a "
 696				"delayed/hole extent [%d/%d/%llu/%x]\n",
 697				inode->i_ino, es->es_lblk, es->es_len,
 698				ext4_es_pblock(es), ext4_es_status(es));
 699			return;
 700		} else if (ext4_es_is_written(es)) {
 701			if (retval != es->es_len) {
 702				pr_warn("ES insert assertion failed for "
 703					"inode: %lu retval %d != es_len %d\n",
 704					inode->i_ino, retval, es->es_len);
 705				return;
 706			}
 707			if (map.m_pblk != ext4_es_pblock(es)) {
 708				pr_warn("ES insert assertion failed for "
 709					"inode: %lu m_pblk %llu != "
 710					"es_pblk %llu\n",
 711					inode->i_ino, map.m_pblk,
 712					ext4_es_pblock(es));
 713				return;
 714			}
 715		} else {
 716			/*
 717			 * We don't need to check unwritten extents because an
 718			 * indirect-based file doesn't have them.
 719			 */
 720			BUG();
 721		}
 722	} else if (retval == 0) {
 723		if (ext4_es_is_written(es)) {
 724			pr_warn("ES insert assertion failed for inode: %lu "
 725				"We can't find the block but we want to add "
 726				"a written extent [%d/%d/%llu/%x]\n",
 727				inode->i_ino, es->es_lblk, es->es_len,
 728				ext4_es_pblock(es), ext4_es_status(es));
 729			return;
 730		}
 731	}
 732}
 733
 734static inline void ext4_es_insert_extent_check(struct inode *inode,
 735					       struct extent_status *es)
 736{
 737	/*
 738	 * We don't need to worry about the race condition because
 739	 * the caller takes the i_data_sem lock.
 740	 */
 741	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 742	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 743		ext4_es_insert_extent_ext_check(inode, es);
 744	else
 745		ext4_es_insert_extent_ind_check(inode, es);
 746}
 747#else
 748static inline void ext4_es_insert_extent_check(struct inode *inode,
 749					       struct extent_status *es)
 750{
 751}
 752#endif
 753
 754static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
 755{
 756	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 757	struct rb_node **p = &tree->root.rb_node;
 758	struct rb_node *parent = NULL;
 759	struct extent_status *es;
 760
 761	while (*p) {
 762		parent = *p;
 763		es = rb_entry(parent, struct extent_status, rb_node);
 764
 765		if (newes->es_lblk < es->es_lblk) {
 766			if (ext4_es_can_be_merged(newes, es)) {
 767				/*
 768				 * Here we can modify es_lblk directly
 769				 * because it doesn't overlap.
 770				 */
 771				es->es_lblk = newes->es_lblk;
 772				es->es_len += newes->es_len;
 773				if (ext4_es_is_written(es) ||
 774				    ext4_es_is_unwritten(es))
 775					ext4_es_store_pblock(es,
 776							     newes->es_pblk);
 777				es = ext4_es_try_to_merge_left(inode, es);
 778				goto out;
 779			}
 780			p = &(*p)->rb_left;
 781		} else if (newes->es_lblk > ext4_es_end(es)) {
 782			if (ext4_es_can_be_merged(es, newes)) {
 783				es->es_len += newes->es_len;
 784				es = ext4_es_try_to_merge_right(inode, es);
 785				goto out;
 786			}
 787			p = &(*p)->rb_right;
 788		} else {
 789			BUG();
 790			return -EINVAL;
 791		}
 792	}
 793
 794	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
 795				  newes->es_pblk);
 796	if (!es)
 797		return -ENOMEM;
 798	rb_link_node(&es->rb_node, parent, p);
 799	rb_insert_color(&es->rb_node, &tree->root);
 800
 801out:
 802	tree->cache_es = es;
 803	return 0;
 804}
 805
 806/*
 807 * ext4_es_insert_extent() adds information to an inode's extent
 808 * status tree.
 809 *
 810 * Return 0 on success, error code on failure.
 811 */
 812int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 813			  ext4_lblk_t len, ext4_fsblk_t pblk,
 814			  unsigned int status)
 815{
 816	struct extent_status newes;
 817	ext4_lblk_t end = lblk + len - 1;
 818	int err = 0;
 819	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 820
 821	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 822		return 0;
 823
 824	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
 825		 lblk, len, pblk, status, inode->i_ino);
 826
 827	if (!len)
 828		return 0;
 829
 830	BUG_ON(end < lblk);
 831
 832	if ((status & EXTENT_STATUS_DELAYED) &&
 833	    (status & EXTENT_STATUS_WRITTEN)) {
 834		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
 835				" delayed and written which can potentially "
 836				" cause data loss.", lblk, len);
 837		WARN_ON(1);
 838	}
 839
 840	newes.es_lblk = lblk;
 841	newes.es_len = len;
 842	ext4_es_store_pblock_status(&newes, pblk, status);
 843	trace_ext4_es_insert_extent(inode, &newes);
 844
 845	ext4_es_insert_extent_check(inode, &newes);
 846
 847	write_lock(&EXT4_I(inode)->i_es_lock);
 848	err = __es_remove_extent(inode, lblk, end, NULL);
 849	if (err != 0)
 850		goto error;
 851retry:
 852	err = __es_insert_extent(inode, &newes);
 853	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
 854					  128, EXT4_I(inode)))
 855		goto retry;
 856	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
 857		err = 0;
 858
 859	if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
 860	    (status & EXTENT_STATUS_WRITTEN ||
 861	     status & EXTENT_STATUS_UNWRITTEN))
 862		__revise_pending(inode, lblk, len);
 863
 864error:
 865	write_unlock(&EXT4_I(inode)->i_es_lock);
 866
 867	ext4_es_print_tree(inode);
 868
 869	return err;
 870}
 871
 872/*
 873 * ext4_es_cache_extent() inserts information into the extent status
 874 * tree if and only if there isn't information about the range in
 875 * question already.
 876 */
 877void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 878			  ext4_lblk_t len, ext4_fsblk_t pblk,
 879			  unsigned int status)
 880{
 881	struct extent_status *es;
 882	struct extent_status newes;
 883	ext4_lblk_t end = lblk + len - 1;
 884
 885	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 886		return;
 887
 888	newes.es_lblk = lblk;
 889	newes.es_len = len;
 890	ext4_es_store_pblock_status(&newes, pblk, status);
 891	trace_ext4_es_cache_extent(inode, &newes);
 892
 893	if (!len)
 894		return;
 895
 896	BUG_ON(end < lblk);
 897
 898	write_lock(&EXT4_I(inode)->i_es_lock);
 899
 900	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
 901	if (!es || es->es_lblk > end)
 902		__es_insert_extent(inode, &newes);
 903	write_unlock(&EXT4_I(inode)->i_es_lock);
 904}
 905
 906/*
 907 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 908 *
 909 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 910 *
 911 * Return: 1 if found, 0 if not
 912 */
 913int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 914			  ext4_lblk_t *next_lblk,
 915			  struct extent_status *es)
 916{
 917	struct ext4_es_tree *tree;
 918	struct ext4_es_stats *stats;
 919	struct extent_status *es1 = NULL;
 920	struct rb_node *node;
 921	int found = 0;
 922
 923	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 924		return 0;
 925
 926	trace_ext4_es_lookup_extent_enter(inode, lblk);
 927	es_debug("lookup extent in block %u\n", lblk);
 928
 929	tree = &EXT4_I(inode)->i_es_tree;
 930	read_lock(&EXT4_I(inode)->i_es_lock);
 931
 932	/* first, look for the extent in the cache */
 933	es->es_lblk = es->es_len = es->es_pblk = 0;
 934	if (tree->cache_es) {
 935		es1 = tree->cache_es;
 936		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
 937			es_debug("%u cached by [%u/%u)\n",
 938				 lblk, es1->es_lblk, es1->es_len);
 939			found = 1;
 940			goto out;
 941		}
 942	}
 943
 944	node = tree->root.rb_node;
 945	while (node) {
 946		es1 = rb_entry(node, struct extent_status, rb_node);
 947		if (lblk < es1->es_lblk)
 948			node = node->rb_left;
 949		else if (lblk > ext4_es_end(es1))
 950			node = node->rb_right;
 951		else {
 952			found = 1;
 953			break;
 954		}
 955	}
 956
 957out:
 958	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
 959	if (found) {
 960		BUG_ON(!es1);
 961		es->es_lblk = es1->es_lblk;
 962		es->es_len = es1->es_len;
 963		es->es_pblk = es1->es_pblk;
 964		if (!ext4_es_is_referenced(es1))
 965			ext4_es_set_referenced(es1);
 966		percpu_counter_inc(&stats->es_stats_cache_hits);
 967		if (next_lblk) {
 968			node = rb_next(&es1->rb_node);
 969			if (node) {
 970				es1 = rb_entry(node, struct extent_status,
 971					       rb_node);
 972				*next_lblk = es1->es_lblk;
 973			} else
 974				*next_lblk = 0;
 975		}
 976	} else {
 977		percpu_counter_inc(&stats->es_stats_cache_misses);
 978	}
 979
 980	read_unlock(&EXT4_I(inode)->i_es_lock);
 981
 982	trace_ext4_es_lookup_extent_exit(inode, es, found);
 983	return found;
 984}
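/*
 * Illustrative sketch of the lookup contract documented above.  The helper
 * below is hypothetical; the real users are ext4_map_blocks() and
 * ext4_da_map_blocks().
 */
#ifdef ES_EXAMPLE_SKETCH__
static ext4_fsblk_t example_lookup_pblk(struct inode *inode, ext4_lblk_t lblk)
{
	struct extent_status es;
	ext4_lblk_t next = 0;

	if (!ext4_es_lookup_extent(inode, lblk, &next, &es))
		return 0;	/* miss: fall back to the extent tree lookup */

	/*
	 * @lblk is covered by the cached extent in @es; @next now holds the
	 * start of the following cached extent, or 0 if nothing is cached
	 * beyond @es.
	 */
	if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es))
		return ext4_es_pblock(&es) + lblk - es.es_lblk;

	return 0;	/* delayed or hole: no physical block to report */
}
#endif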
 985
 986struct rsvd_count {
 987	int ndelonly;
 988	bool first_do_lblk_found;
 989	ext4_lblk_t first_do_lblk;
 990	ext4_lblk_t last_do_lblk;
 991	struct extent_status *left_es;
 992	bool partial;
 993	ext4_lblk_t lclu;
 994};
 995
 996/*
 997 * init_rsvd - initialize reserved count data before removing block range
 998 *	       in file from extent status tree
 999 *
1000 * @inode - file containing range
1001 * @lblk - first block in range
1002 * @es - pointer to first extent in range
1003 * @rc - pointer to reserved count data
1004 *
1005 * Assumes es is not NULL
1006 */
1007static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1008		      struct extent_status *es, struct rsvd_count *rc)
1009{
1010	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1011	struct rb_node *node;
1012
1013	rc->ndelonly = 0;
1014
1015	/*
1016	 * for bigalloc, note the first delonly block in the range has not
1017	 * been found, record the extent containing the block to the left of
1018	 * the region to be removed, if any, and note that there's no partial
1019	 * cluster to track
1020	 */
1021	if (sbi->s_cluster_ratio > 1) {
1022		rc->first_do_lblk_found = false;
1023		if (lblk > es->es_lblk) {
1024			rc->left_es = es;
1025		} else {
1026			node = rb_prev(&es->rb_node);
1027			rc->left_es = node ? rb_entry(node,
1028						      struct extent_status,
1029						      rb_node) : NULL;
1030		}
1031		rc->partial = false;
1032	}
1033}
1034
1035/*
1036 * count_rsvd - count the clusters containing delayed and not unwritten
1037 *		(delonly) blocks in a range within an extent and add to
1038 *	        the running tally in rsvd_count
1039 *
1040 * @inode - file containing extent
1041 * @lblk - first block in range
1042 * @len - length of range in blocks
1043 * @es - pointer to extent containing clusters to be counted
1044 * @rc - pointer to reserved count data
1045 *
1046 * Tracks partial clusters found at the beginning and end of extents so
1047 * they aren't overcounted when they span adjacent extents
1048 */
1049static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1050		       struct extent_status *es, struct rsvd_count *rc)
1051{
1052	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1053	ext4_lblk_t i, end, nclu;
1054
1055	if (!ext4_es_is_delonly(es))
1056		return;
1057
1058	WARN_ON(len <= 0);
1059
1060	if (sbi->s_cluster_ratio == 1) {
1061		rc->ndelonly += (int) len;
1062		return;
1063	}
1064
1065	/* bigalloc */
1066
1067	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1068	end = lblk + (ext4_lblk_t) len - 1;
1069	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1070
1071	/* record the first block of the first delonly extent seen */
1072	if (!rc->first_do_lblk_found) {
1073		rc->first_do_lblk = i;
1074		rc->first_do_lblk_found = true;
1075	}
1076
1077	/* update the last lblk in the region seen so far */
1078	rc->last_do_lblk = end;
1079
1080	/*
1081	 * if we're tracking a partial cluster and the current extent
1082	 * doesn't start with it, count it and stop tracking
1083	 */
1084	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1085		rc->ndelonly++;
1086		rc->partial = false;
1087	}
1088
1089	/*
1090	 * if the first cluster doesn't start on a cluster boundary but
1091	 * ends on one, count it
1092	 */
1093	if (EXT4_LBLK_COFF(sbi, i) != 0) {
1094		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1095			rc->ndelonly++;
1096			rc->partial = false;
1097			i = EXT4_LBLK_CFILL(sbi, i) + 1;
1098		}
1099	}
1100
1101	/*
1102	 * if the current cluster starts on a cluster boundary, count the
1103	 * number of whole delonly clusters in the extent
1104	 */
1105	if ((i + sbi->s_cluster_ratio - 1) <= end) {
1106		nclu = (end - i + 1) >> sbi->s_cluster_bits;
1107		rc->ndelonly += nclu;
1108		i += nclu << sbi->s_cluster_bits;
1109	}
1110
1111	/*
1112	 * start tracking a partial cluster if there's a partial at the end
1113	 * of the current extent and we're not already tracking one
1114	 */
1115	if (!rc->partial && i <= end) {
1116		rc->partial = true;
1117		rc->lclu = EXT4_B2C(sbi, i);
1118	}
1119}
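/*
 * Worked example, assuming bigalloc with s_cluster_ratio == 4: for a
 * delonly extent covering blocks 2..9 that lies entirely inside the
 * removed range, count_rsvd() counts the leading partial cluster 0
 * (blocks 2-3, since block 9 reaches past EXT4_LBLK_CFILL(sbi, 2) == 3),
 * then the single whole cluster 1 (blocks 4-7), and finally records
 * cluster 2 (blocks 8-9) in rc->partial/rc->lclu so that a following
 * extent or the final get_rsvd() pass can decide whether to count it.
 */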
1120
1121/*
1122 * __pr_tree_search - search for a pending cluster reservation
1123 *
1124 * @root - root of pending reservation tree
1125 * @lclu - logical cluster to search for
1126 *
1127 * Returns the pending reservation for the cluster identified by @lclu
1128 * if found.  If not, returns a reservation for the next cluster if any,
1129 * and if not, returns NULL.
1130 */
1131static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1132						    ext4_lblk_t lclu)
1133{
1134	struct rb_node *node = root->rb_node;
1135	struct pending_reservation *pr = NULL;
1136
1137	while (node) {
1138		pr = rb_entry(node, struct pending_reservation, rb_node);
1139		if (lclu < pr->lclu)
1140			node = node->rb_left;
1141		else if (lclu > pr->lclu)
1142			node = node->rb_right;
1143		else
1144			return pr;
1145	}
1146	if (pr && lclu < pr->lclu)
1147		return pr;
1148	if (pr && lclu > pr->lclu) {
1149		node = rb_next(&pr->rb_node);
1150		return node ? rb_entry(node, struct pending_reservation,
1151				       rb_node) : NULL;
1152	}
1153	return NULL;
1154}
1155
1156/*
1157 * get_rsvd - calculates and returns the number of cluster reservations to be
1158 *	      released when removing a block range from the extent status tree
1159 *	      and releases any pending reservations within the range
1160 *
1161 * @inode - file containing block range
1162 * @end - last block in range
1163 * @right_es - pointer to extent containing next block beyond end or NULL
1164 * @rc - pointer to reserved count data
1165 *
1166 * The number of reservations to be released is equal to the number of
1167 * clusters containing delayed and not unwritten (delonly) blocks within
1168 * the range, minus the number of clusters still containing delonly blocks
1169 * at the ends of the range, and minus the number of pending reservations
1170 * within the range.
1171 */
1172static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1173			     struct extent_status *right_es,
1174			     struct rsvd_count *rc)
1175{
1176	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1177	struct pending_reservation *pr;
1178	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1179	struct rb_node *node;
1180	ext4_lblk_t first_lclu, last_lclu;
1181	bool left_delonly, right_delonly, count_pending;
1182	struct extent_status *es;
1183
1184	if (sbi->s_cluster_ratio > 1) {
1185		/* count any remaining partial cluster */
1186		if (rc->partial)
1187			rc->ndelonly++;
1188
1189		if (rc->ndelonly == 0)
1190			return 0;
1191
1192		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1193		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1194
1195		/*
1196		 * decrease the delonly count by the number of clusters at the
1197		 * ends of the range that still contain delonly blocks -
1198		 * these clusters still need to be reserved
1199		 */
1200		left_delonly = right_delonly = false;
1201
1202		es = rc->left_es;
1203		while (es && ext4_es_end(es) >=
1204		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1205			if (ext4_es_is_delonly(es)) {
1206				rc->ndelonly--;
1207				left_delonly = true;
1208				break;
1209			}
1210			node = rb_prev(&es->rb_node);
1211			if (!node)
1212				break;
1213			es = rb_entry(node, struct extent_status, rb_node);
1214		}
1215		if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1216			if (end < ext4_es_end(right_es)) {
1217				es = right_es;
1218			} else {
1219				node = rb_next(&right_es->rb_node);
1220				es = node ? rb_entry(node, struct extent_status,
1221						     rb_node) : NULL;
1222			}
1223			while (es && es->es_lblk <=
1224			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1225				if (ext4_es_is_delonly(es)) {
1226					rc->ndelonly--;
1227					right_delonly = true;
1228					break;
1229				}
1230				node = rb_next(&es->rb_node);
1231				if (!node)
1232					break;
1233				es = rb_entry(node, struct extent_status,
1234					      rb_node);
1235			}
1236		}
1237
1238		/*
1239		 * Determine the block range that should be searched for
1240		 * pending reservations, if any.  Clusters on the ends of the
1241		 * original removed range containing delonly blocks are
1242		 * excluded.  They've already been accounted for and it's not
1243		 * possible to determine if an associated pending reservation
1244		 * should be released with the information available in the
1245		 * extents status tree.
1246		 */
1247		if (first_lclu == last_lclu) {
1248			if (left_delonly | right_delonly)
1249				count_pending = false;
1250			else
1251				count_pending = true;
1252		} else {
1253			if (left_delonly)
1254				first_lclu++;
1255			if (right_delonly)
1256				last_lclu--;
1257			if (first_lclu <= last_lclu)
1258				count_pending = true;
1259			else
1260				count_pending = false;
1261		}
1262
1263		/*
1264		 * a pending reservation found between first_lclu and last_lclu
1265		 * represents an allocated cluster that contained at least one
1266		 * delonly block, so the delonly total must be reduced by one
1267		 * for each pending reservation found and released
1268		 */
1269		if (count_pending) {
1270			pr = __pr_tree_search(&tree->root, first_lclu);
1271			while (pr && pr->lclu <= last_lclu) {
1272				rc->ndelonly--;
1273				node = rb_next(&pr->rb_node);
1274				rb_erase(&pr->rb_node, &tree->root);
1275				kmem_cache_free(ext4_pending_cachep, pr);
1276				if (!node)
1277					break;
1278				pr = rb_entry(node, struct pending_reservation,
1279					      rb_node);
1280			}
1281		}
1282	}
1283	return rc->ndelonly;
1284}
1285
1286
1287/*
1288 * __es_remove_extent - removes block range from extent status tree
1289 *
1290 * @inode - file containing range
1291 * @lblk - first block in range
1292 * @end - last block in range
1293 * @reserved - number of cluster reservations released
1294 *
1295 * If @reserved is not NULL and delayed allocation is enabled, counts
1296 * block/cluster reservations freed by removing range and if bigalloc
1297 * enabled cancels pending reservations as needed. Returns 0 on success,
1298 * error code on failure.
1299 */
1300static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1301			      ext4_lblk_t end, int *reserved)
1302{
1303	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1304	struct rb_node *node;
1305	struct extent_status *es;
1306	struct extent_status orig_es;
1307	ext4_lblk_t len1, len2;
1308	ext4_fsblk_t block;
1309	int err;
1310	bool count_reserved = true;
1311	struct rsvd_count rc;
1312
1313	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1314		count_reserved = false;
1315retry:
1316	err = 0;
1317
1318	es = __es_tree_search(&tree->root, lblk);
1319	if (!es)
1320		goto out;
1321	if (es->es_lblk > end)
1322		goto out;
1323
1324	/* Simply invalidate cache_es. */
1325	tree->cache_es = NULL;
1326	if (count_reserved)
1327		init_rsvd(inode, lblk, es, &rc);
1328
1329	orig_es.es_lblk = es->es_lblk;
1330	orig_es.es_len = es->es_len;
1331	orig_es.es_pblk = es->es_pblk;
1332
1333	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1334	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1335	if (len1 > 0)
1336		es->es_len = len1;
1337	if (len2 > 0) {
1338		if (len1 > 0) {
1339			struct extent_status newes;
1340
1341			newes.es_lblk = end + 1;
1342			newes.es_len = len2;
1343			block = 0x7FDEADBEEFULL;
1344			if (ext4_es_is_written(&orig_es) ||
1345			    ext4_es_is_unwritten(&orig_es))
1346				block = ext4_es_pblock(&orig_es) +
1347					orig_es.es_len - len2;
1348			ext4_es_store_pblock_status(&newes, block,
1349						    ext4_es_status(&orig_es));
1350			err = __es_insert_extent(inode, &newes);
1351			if (err) {
1352				es->es_lblk = orig_es.es_lblk;
1353				es->es_len = orig_es.es_len;
1354				if ((err == -ENOMEM) &&
1355				    __es_shrink(EXT4_SB(inode->i_sb),
1356							128, EXT4_I(inode)))
1357					goto retry;
1358				goto out;
1359			}
1360		} else {
1361			es->es_lblk = end + 1;
1362			es->es_len = len2;
1363			if (ext4_es_is_written(es) ||
1364			    ext4_es_is_unwritten(es)) {
1365				block = orig_es.es_pblk + orig_es.es_len - len2;
1366				ext4_es_store_pblock(es, block);
1367			}
1368		}
1369		if (count_reserved)
1370			count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
1371				   &orig_es, &rc);
1372		goto out_get_reserved;
1373	}
1374
1375	if (len1 > 0) {
1376		if (count_reserved)
1377			count_rsvd(inode, lblk, orig_es.es_len - len1,
1378				   &orig_es, &rc);
1379		node = rb_next(&es->rb_node);
1380		if (node)
1381			es = rb_entry(node, struct extent_status, rb_node);
1382		else
1383			es = NULL;
1384	}
1385
1386	while (es && ext4_es_end(es) <= end) {
1387		if (count_reserved)
1388			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1389		node = rb_next(&es->rb_node);
1390		rb_erase(&es->rb_node, &tree->root);
1391		ext4_es_free_extent(inode, es);
1392		if (!node) {
1393			es = NULL;
1394			break;
1395		}
1396		es = rb_entry(node, struct extent_status, rb_node);
1397	}
1398
1399	if (es && es->es_lblk < end + 1) {
1400		ext4_lblk_t orig_len = es->es_len;
1401
1402		len1 = ext4_es_end(es) - end;
1403		if (count_reserved)
1404			count_rsvd(inode, es->es_lblk, orig_len - len1,
1405				   es, &rc);
1406		es->es_lblk = end + 1;
1407		es->es_len = len1;
1408		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1409			block = es->es_pblk + orig_len - len1;
1410			ext4_es_store_pblock(es, block);
1411		}
1412	}
1413
1414out_get_reserved:
1415	if (count_reserved)
1416		*reserved = get_rsvd(inode, end, es, &rc);
1417out:
1418	return err;
1419}
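/*
 * For example, removing blocks 10..19 from a single written extent [0/32)
 * with __es_remove_extent() keeps the head as [0/10) (len1 == 10) and
 * inserts a new tail [20/12) (len2 == 12) whose physical start is shifted
 * by the same 20 blocks; if inserting the tail fails with -ENOMEM, the
 * original extent is restored and the shrinker is kicked before retrying.
 */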
1420
1421/*
1422 * ext4_es_remove_extent - removes block range from extent status tree
1423 *
1424 * @inode - file containing range
1425 * @lblk - first block in range
1426 * @len - number of blocks to remove
1427 *
1428 * Reduces block/cluster reservation count and for bigalloc cancels pending
1429 * reservations as needed. Returns 0 on success, error code on failure.
1430 */
1431int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1432			  ext4_lblk_t len)
1433{
1434	ext4_lblk_t end;
1435	int err = 0;
1436	int reserved = 0;
1437
1438	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1439		return 0;
1440
1441	trace_ext4_es_remove_extent(inode, lblk, len);
1442	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1443		 lblk, len, inode->i_ino);
1444
1445	if (!len)
1446		return err;
1447
1448	end = lblk + len - 1;
1449	BUG_ON(end < lblk);
1450
1451	/*
1452	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1453	 * so that we are sure __es_shrink() is done with the inode before it
1454	 * is reclaimed.
1455	 */
1456	write_lock(&EXT4_I(inode)->i_es_lock);
1457	err = __es_remove_extent(inode, lblk, end, &reserved);
1458	write_unlock(&EXT4_I(inode)->i_es_lock);
1459	ext4_es_print_tree(inode);
1460	ext4_da_release_space(inode, reserved);
1461	return err;
1462}
1463
1464static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1465		       struct ext4_inode_info *locked_ei)
1466{
1467	struct ext4_inode_info *ei;
1468	struct ext4_es_stats *es_stats;
1469	ktime_t start_time;
1470	u64 scan_time;
1471	int nr_to_walk;
1472	int nr_shrunk = 0;
1473	int retried = 0, nr_skipped = 0;
1474
1475	es_stats = &sbi->s_es_stats;
1476	start_time = ktime_get();
1477
1478retry:
1479	spin_lock(&sbi->s_es_lock);
1480	nr_to_walk = sbi->s_es_nr_inode;
1481	while (nr_to_walk-- > 0) {
1482		if (list_empty(&sbi->s_es_list)) {
1483			spin_unlock(&sbi->s_es_lock);
1484			goto out;
1485		}
1486		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1487				      i_es_list);
1488		/* Move the inode to the tail */
1489		list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1490
1491		/*
1492		 * Normally we try hard to avoid shrinking precached inodes,
1493		 * but we will as a last resort.
1494		 */
1495		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1496						EXT4_STATE_EXT_PRECACHED)) {
1497			nr_skipped++;
1498			continue;
1499		}
1500
1501		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1502			nr_skipped++;
1503			continue;
1504		}
1505		/*
1506		 * Now we hold i_es_lock which protects us from inode reclaim
1507		 * freeing inode under us
1508		 */
1509		spin_unlock(&sbi->s_es_lock);
1510
1511		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1512		write_unlock(&ei->i_es_lock);
1513
1514		if (nr_to_scan <= 0)
1515			goto out;
1516		spin_lock(&sbi->s_es_lock);
1517	}
1518	spin_unlock(&sbi->s_es_lock);
1519
1520	/*
1521	 * If we skipped any inodes, and we weren't able to make any
1522	 * forward progress, try again to scan precached inodes.
1523	 */
1524	if ((nr_shrunk == 0) && nr_skipped && !retried) {
1525		retried++;
1526		goto retry;
1527	}
1528
1529	if (locked_ei && nr_shrunk == 0)
1530		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1531
1532out:
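	/*
	 * The statistics below keep an exponentially weighted moving
	 * average: a new sample S replaces the old average A with
	 * (S + 3 * A) / 4, so e.g. a 4000 ns scan against an 8000 ns
	 * average yields a new average of 7000 ns.
	 */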
1533	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1534	if (likely(es_stats->es_stats_scan_time))
1535		es_stats->es_stats_scan_time = (scan_time +
1536				es_stats->es_stats_scan_time*3) / 4;
1537	else
1538		es_stats->es_stats_scan_time = scan_time;
1539	if (scan_time > es_stats->es_stats_max_scan_time)
1540		es_stats->es_stats_max_scan_time = scan_time;
1541	if (likely(es_stats->es_stats_shrunk))
1542		es_stats->es_stats_shrunk = (nr_shrunk +
1543				es_stats->es_stats_shrunk*3) / 4;
1544	else
1545		es_stats->es_stats_shrunk = nr_shrunk;
1546
1547	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1548			     nr_skipped, retried);
1549	return nr_shrunk;
1550}
1551
1552static unsigned long ext4_es_count(struct shrinker *shrink,
1553				   struct shrink_control *sc)
1554{
1555	unsigned long nr;
1556	struct ext4_sb_info *sbi;
1557
1558	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1559	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1560	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1561	return nr;
1562}
1563
1564static unsigned long ext4_es_scan(struct shrinker *shrink,
1565				  struct shrink_control *sc)
1566{
1567	struct ext4_sb_info *sbi = container_of(shrink,
1568					struct ext4_sb_info, s_es_shrinker);
1569	int nr_to_scan = sc->nr_to_scan;
1570	int ret, nr_shrunk;
1571
1572	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1573	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1574
1575	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1576
1577	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1578	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1579	return nr_shrunk;
1580}
1581
1582int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1583{
1584	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1585	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1586	struct ext4_inode_info *ei, *max = NULL;
1587	unsigned int inode_cnt = 0;
1588
1589	if (v != SEQ_START_TOKEN)
1590		return 0;
1591
1592	/* here we just find an inode that has the max nr. of objects */
1593	spin_lock(&sbi->s_es_lock);
1594	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1595		inode_cnt++;
1596		if (max && max->i_es_all_nr < ei->i_es_all_nr)
1597			max = ei;
1598		else if (!max)
1599			max = ei;
1600	}
1601	spin_unlock(&sbi->s_es_lock);
1602
1603	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
1604		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1605		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1606	seq_printf(seq, "  %lld/%lld cache hits/misses\n",
1607		   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1608		   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1609	if (inode_cnt)
1610		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
1611
1612	seq_printf(seq, "average:\n  %llu us scan time\n",
1613	    div_u64(es_stats->es_stats_scan_time, 1000));
1614	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
1615	if (inode_cnt)
1616		seq_printf(seq,
1617		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
1618		    "  %llu us max scan time\n",
1619		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1620		    div_u64(es_stats->es_stats_max_scan_time, 1000));
1621
1622	return 0;
1623}
1624
1625int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1626{
1627	int err;
1628
1629	/* Make sure we have enough bits for physical block number */
1630	BUILD_BUG_ON(ES_SHIFT < 48);
1631	INIT_LIST_HEAD(&sbi->s_es_list);
1632	sbi->s_es_nr_inode = 0;
1633	spin_lock_init(&sbi->s_es_lock);
1634	sbi->s_es_stats.es_stats_shrunk = 0;
1635	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1636				  GFP_KERNEL);
1637	if (err)
1638		return err;
1639	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1640				  GFP_KERNEL);
1641	if (err)
1642		goto err1;
1643	sbi->s_es_stats.es_stats_scan_time = 0;
1644	sbi->s_es_stats.es_stats_max_scan_time = 0;
1645	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1646	if (err)
1647		goto err2;
1648	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1649	if (err)
1650		goto err3;
1651
1652	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1653	sbi->s_es_shrinker.count_objects = ext4_es_count;
1654	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1655	err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
1656				sbi->s_sb->s_id);
1657	if (err)
1658		goto err4;
1659
1660	return 0;
1661err4:
1662	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1663err3:
1664	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1665err2:
1666	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1667err1:
1668	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1669	return err;
1670}
1671
1672void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1673{
1674	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1675	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1676	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1677	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1678	unregister_shrinker(&sbi->s_es_shrinker);
1679}
1680
1681/*
1682 * Shrink extents in the given inode from ei->i_es_shrink_lblk till end. Scan at
1683 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1684 *
1685 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1686 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1687 * ei->i_es_shrink_lblk to where we should continue scanning.
1688 */
1689static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1690				 int *nr_to_scan, int *nr_shrunk)
1691{
1692	struct inode *inode = &ei->vfs_inode;
1693	struct ext4_es_tree *tree = &ei->i_es_tree;
1694	struct extent_status *es;
1695	struct rb_node *node;
1696
1697	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1698	if (!es)
1699		goto out_wrap;
1700
1701	while (*nr_to_scan > 0) {
1702		if (es->es_lblk > end) {
1703			ei->i_es_shrink_lblk = end + 1;
1704			return 0;
1705		}
1706
1707		(*nr_to_scan)--;
1708		node = rb_next(&es->rb_node);
1709		/*
1710		 * We can't reclaim delayed extents from the status tree because
1711		 * fiemap, bigalloc, and seek_data/hole need to use them.
1712		 */
1713		if (ext4_es_is_delayed(es))
1714			goto next;
1715		if (ext4_es_is_referenced(es)) {
1716			ext4_es_clear_referenced(es);
1717			goto next;
1718		}
1719
1720		rb_erase(&es->rb_node, &tree->root);
1721		ext4_es_free_extent(inode, es);
1722		(*nr_shrunk)++;
1723next:
1724		if (!node)
1725			goto out_wrap;
1726		es = rb_entry(node, struct extent_status, rb_node);
1727	}
1728	ei->i_es_shrink_lblk = es->es_lblk;
1729	return 1;
1730out_wrap:
1731	ei->i_es_shrink_lblk = 0;
1732	return 0;
1733}
1734
1735static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1736{
1737	struct inode *inode = &ei->vfs_inode;
1738	int nr_shrunk = 0;
1739	ext4_lblk_t start = ei->i_es_shrink_lblk;
1740	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1741				      DEFAULT_RATELIMIT_BURST);
1742
1743	if (ei->i_es_shk_nr == 0)
1744		return 0;
1745
1746	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1747	    __ratelimit(&_rs))
1748		ext4_warning(inode->i_sb, "forced shrink of precached extents");
1749
1750	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1751	    start != 0)
1752		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1753
1754	ei->i_es_tree.cache_es = NULL;
1755	return nr_shrunk;
1756}
1757
1758/*
1759 * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
1760 * discretionary entries from the extent status cache.  (Some entries
1761 * must be present for proper operation.)
1762 */
1763void ext4_clear_inode_es(struct inode *inode)
1764{
1765	struct ext4_inode_info *ei = EXT4_I(inode);
1766	struct extent_status *es;
1767	struct ext4_es_tree *tree;
1768	struct rb_node *node;
1769
1770	write_lock(&ei->i_es_lock);
1771	tree = &EXT4_I(inode)->i_es_tree;
1772	tree->cache_es = NULL;
1773	node = rb_first(&tree->root);
1774	while (node) {
1775		es = rb_entry(node, struct extent_status, rb_node);
1776		node = rb_next(node);
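		/* delayed extents must be kept; everything else is discretionary */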
1777		if (!ext4_es_is_delayed(es)) {
1778			rb_erase(&es->rb_node, &tree->root);
1779			ext4_es_free_extent(inode, es);
1780		}
1781	}
1782	ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1783	write_unlock(&ei->i_es_lock);
1784}
1785
1786#ifdef ES_DEBUG__
1787static void ext4_print_pending_tree(struct inode *inode)
1788{
1789	struct ext4_pending_tree *tree;
1790	struct rb_node *node;
1791	struct pending_reservation *pr;
1792
1793	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1794	tree = &EXT4_I(inode)->i_pending_tree;
1795	node = rb_first(&tree->root);
1796	while (node) {
1797		pr = rb_entry(node, struct pending_reservation, rb_node);
1798		printk(KERN_DEBUG " %u", pr->lclu);
1799		node = rb_next(node);
1800	}
1801	printk(KERN_DEBUG "\n");
1802}
1803#else
1804#define ext4_print_pending_tree(inode)
1805#endif
1806
1807int __init ext4_init_pending(void)
1808{
1809	ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
1810	if (ext4_pending_cachep == NULL)
1811		return -ENOMEM;
1812	return 0;
1813}
1814
1815void ext4_exit_pending(void)
1816{
1817	kmem_cache_destroy(ext4_pending_cachep);
1818}
1819
1820void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1821{
1822	tree->root = RB_ROOT;
1823}
1824
1825/*
1826 * __get_pending - retrieve a pointer to a pending reservation
1827 *
1828 * @inode - file containing the pending cluster reservation
1829 * @lclu - logical cluster of interest
1830 *
1831 * Returns a pointer to a pending reservation if it's a member of
1832 * the set, and NULL if not.  Must be called holding i_es_lock.
1833 */
1834static struct pending_reservation *__get_pending(struct inode *inode,
1835						 ext4_lblk_t lclu)
1836{
1837	struct ext4_pending_tree *tree;
1838	struct rb_node *node;
1839	struct pending_reservation *pr = NULL;
1840
1841	tree = &EXT4_I(inode)->i_pending_tree;
1842	node = (&tree->root)->rb_node;
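	/* standard rbtree lookup keyed by logical cluster number */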
1843
1844	while (node) {
1845		pr = rb_entry(node, struct pending_reservation, rb_node);
1846		if (lclu < pr->lclu)
1847			node = node->rb_left;
1848		else if (lclu > pr->lclu)
1849			node = node->rb_right;
1850		else if (lclu == pr->lclu)
1851			return pr;
1852	}
1853	return NULL;
1854}
1855
1856/*
1857 * __insert_pending - adds a pending cluster reservation to the set of
1858 *                    pending reservations
1859 *
1860 * @inode - file containing the cluster
1861 * @lblk - logical block in the cluster to be added
1862 *
1863 * Returns 0 on successful insertion and -ENOMEM on failure.  If the
1864 * pending reservation is already in the set, returns successfully.
1865 */
1866static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
1867{
1868	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1869	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1870	struct rb_node **p = &tree->root.rb_node;
1871	struct rb_node *parent = NULL;
1872	struct pending_reservation *pr;
1873	ext4_lblk_t lclu;
1874	int ret = 0;
1875
1876	lclu = EXT4_B2C(sbi, lblk);
1877	/* search to find parent for insertion */
1878	while (*p) {
1879		parent = *p;
1880		pr = rb_entry(parent, struct pending_reservation, rb_node);
1881
1882		if (lclu < pr->lclu) {
1883			p = &(*p)->rb_left;
1884		} else if (lclu > pr->lclu) {
1885			p = &(*p)->rb_right;
1886		} else {
1887			/* pending reservation already inserted */
1888			goto out;
1889		}
1890	}
1891
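	/* the caller holds i_es_lock, so the allocation must not sleep */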
1892	pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
1893	if (pr == NULL) {
1894		ret = -ENOMEM;
1895		goto out;
1896	}
1897	pr->lclu = lclu;
1898
1899	rb_link_node(&pr->rb_node, parent, p);
1900	rb_insert_color(&pr->rb_node, &tree->root);
1901
1902out:
1903	return ret;
1904}
1905
1906/*
1907 * __remove_pending - removes a pending cluster reservation from the set
1908 *                    of pending reservations
1909 *
1910 * @inode - file containing the cluster
1911 * @lblk - logical block in the pending cluster reservation to be removed
1912 *
1913 * Does nothing if the pending reservation is not a member of the set.
1914 */
1915static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1916{
1917	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1918	struct pending_reservation *pr;
1919	struct ext4_pending_tree *tree;
1920
1921	pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1922	if (pr != NULL) {
1923		tree = &EXT4_I(inode)->i_pending_tree;
1924		rb_erase(&pr->rb_node, &tree->root);
1925		kmem_cache_free(ext4_pending_cachep, pr);
1926	}
1927}
1928
1929/*
1930 * ext4_remove_pending - removes a pending cluster reservation from the set
1931 *                       of pending reservations
1932 *
1933 * @inode - file containing the cluster
1934 * @lblk - logical block in the pending cluster reservation to be removed
1935 *
1936 * Locking for external use of __remove_pending.
1937 */
1938void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
1939{
1940	struct ext4_inode_info *ei = EXT4_I(inode);
1941
1942	write_lock(&ei->i_es_lock);
1943	__remove_pending(inode, lblk);
1944	write_unlock(&ei->i_es_lock);
1945}
1946
1947/*
1948 * ext4_is_pending - determine whether a cluster has a pending reservation
1949 *                   on it
1950 *
1951 * @inode - file containing the cluster
1952 * @lblk - logical block in the cluster
1953 *
1954 * Returns true if there's a pending reservation for the cluster in the
1955 * set of pending reservations, and false if not.
1956 */
1957bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
1958{
1959	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1960	struct ext4_inode_info *ei = EXT4_I(inode);
1961	bool ret;
1962
1963	read_lock(&ei->i_es_lock);
1964	ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
1965	read_unlock(&ei->i_es_lock);
1966
1967	return ret;
1968}
1969
1970/*
1971 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
1972 *                                tree, adding a pending reservation where
1973 *                                needed
1974 *
1975 * @inode - file containing the newly added block
1976 * @lblk - logical block to be added
1977 * @allocated - indicates whether a physical cluster has been allocated for
1978 *              the logical cluster that contains the block
1979 *
1980 * Returns 0 on success, negative error code on failure.
1981 */
1982int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
1983				 bool allocated)
1984{
1985	struct extent_status newes;
1986	int err = 0;
1987
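	/* the extent status tree is not maintained during fast commit replay */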
1988	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1989		return 0;
1990
1991	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
1992		 lblk, inode->i_ino);
1993
1994	newes.es_lblk = lblk;
1995	newes.es_len = 1;
1996	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
1997	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
1998
1999	ext4_es_insert_extent_check(inode, &newes);
2000
2001	write_lock(&EXT4_I(inode)->i_es_lock);
2002
2003	err = __es_remove_extent(inode, lblk, lblk, NULL);
2004	if (err != 0)
2005		goto error;
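	/*
	 * If inserting the new entry fails for lack of memory, try to make
	 * room by shrinking other inodes' extent status trees, then retry.
	 */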
2006retry:
2007	err = __es_insert_extent(inode, &newes);
2008	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
2009					  128, EXT4_I(inode)))
2010		goto retry;
2011	if (err != 0)
2012		goto error;
2013
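	/* only track a pending reservation when the cluster is already allocated */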
2014	if (allocated)
2015		__insert_pending(inode, lblk);
2016
2017error:
2018	write_unlock(&EXT4_I(inode)->i_es_lock);
2019
2020	ext4_es_print_tree(inode);
2021	ext4_print_pending_tree(inode);
2022
2023	return err;
2024}
2025
2026/*
2027 * __es_delayed_clu - count number of clusters containing blocks that
2028 *                    are delayed only
2029 *
2030 * @inode - file containing block range
2031 * @start - logical block defining start of range
2032 * @end - logical block defining end of range
2033 *
2034 * Returns the number of clusters containing only delayed (not delayed
2035 * and unwritten) blocks in the range specified by @start and @end.  Any
2036 * cluster or part of a cluster within the range that contains a delayed
2037 * and not unwritten block is counted as a whole cluster.
2038 */
2039static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
2040				     ext4_lblk_t end)
2041{
2042	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
2043	struct extent_status *es;
2044	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2045	struct rb_node *node;
2046	ext4_lblk_t first_lclu, last_lclu;
2047	unsigned long long last_counted_lclu;
2048	unsigned int n = 0;
2049
2050	/* guaranteed to be unequal to any ext4_lblk_t value */
2051	last_counted_lclu = ~0ULL;
2052
2053	es = __es_tree_search(&tree->root, start);
2054
2055	while (es && (es->es_lblk <= end)) {
2056		if (ext4_es_is_delonly(es)) {
2057			if (es->es_lblk <= start)
2058				first_lclu = EXT4_B2C(sbi, start);
2059			else
2060				first_lclu = EXT4_B2C(sbi, es->es_lblk);
2061
2062			if (ext4_es_end(es) >= end)
2063				last_lclu = EXT4_B2C(sbi, end);
2064			else
2065				last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
2066
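			/*
			 * Don't double-count a cluster already counted for
			 * the previous delayed extent.
			 */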
2067			if (first_lclu == last_counted_lclu)
2068				n += last_lclu - first_lclu;
2069			else
2070				n += last_lclu - first_lclu + 1;
2071			last_counted_lclu = last_lclu;
2072		}
2073		node = rb_next(&es->rb_node);
2074		if (!node)
2075			break;
2076		es = rb_entry(node, struct extent_status, rb_node);
2077	}
2078
2079	return n;
2080}
2081
2082/*
2083 * ext4_es_delayed_clu - count number of clusters containing blocks that
2084 *                       are delayed only (not delayed and unwritten)
2085 *
2086 * @inode - file containing block range
2087 * @lblk - logical block defining start of range
2088 * @len - number of blocks in range
2089 *
2090 * Locking for external use of __es_delayed_clu().
2091 */
2092unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
2093				 ext4_lblk_t len)
2094{
2095	struct ext4_inode_info *ei = EXT4_I(inode);
2096	ext4_lblk_t end;
2097	unsigned int n;
2098
2099	if (len == 0)
2100		return 0;
2101
2102	end = lblk + len - 1;
2103	WARN_ON(end < lblk);
2104
2105	read_lock(&ei->i_es_lock);
2106
2107	n = __es_delayed_clu(inode, lblk, end);
2108
2109	read_unlock(&ei->i_es_lock);
2110
2111	return n;
2112}
2113
2114/*
2115 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2116 *                    reservations for a specified block range depending
2117 *                    upon the presence or absence of delayed blocks
2118 *                    outside the range within clusters at the ends of the
2119 *                    range
2120 *
2121 * @inode - file containing the range
2122 * @lblk - logical block defining the start of range
2123 * @len  - length of range in blocks
2124 *
2125 * Used after a newly allocated extent is added to the extents status tree.
2126 * Requires that the extents in the range have either written or unwritten
2127 * status.  Must be called while holding i_es_lock.
2128 */
2129static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2130			     ext4_lblk_t len)
2131{
2132	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2133	ext4_lblk_t end = lblk + len - 1;
2134	ext4_lblk_t first, last;
2135	bool f_del = false, l_del = false;
2136
2137	if (len == 0)
2138		return;
2139
2140	/*
2141	 * Two cases - block range within single cluster and block range
2142	 * spanning two or more clusters.  Note that a cluster belonging
2143	 * to a range starting and/or ending on a cluster boundary is treated
2144	 * as if it does not contain a delayed extent.  The new range may
2145	 * have allocated space for previously delayed blocks out to the
2146	 * cluster boundary, requiring that any pre-existing pending
2147	 * reservation be canceled.  Because this code only looks at blocks
2148	 * outside the range, it should revise pending reservations
2149	 * correctly even if the extent represented by the range can't be
2150	 * inserted in the extents status tree due to ENOSPC.
2151	 */
2152
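	/* case 1: the entire range lies within a single cluster */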
2153	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2154		first = EXT4_LBLK_CMASK(sbi, lblk);
2155		if (first != lblk)
2156			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2157						first, lblk - 1);
2158		if (f_del) {
2159			__insert_pending(inode, first);
2160		} else {
2161			last = EXT4_LBLK_CMASK(sbi, end) +
2162			       sbi->s_cluster_ratio - 1;
2163			if (last != end)
2164				l_del = __es_scan_range(inode,
2165							&ext4_es_is_delonly,
2166							end + 1, last);
2167			if (l_del)
2168				__insert_pending(inode, last);
2169			else
2170				__remove_pending(inode, last);
2171		}
2172	} else {
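		/* case 2: the range spans two or more clusters */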
2173		first = EXT4_LBLK_CMASK(sbi, lblk);
2174		if (first != lblk)
2175			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2176						first, lblk - 1);
2177		if (f_del)
2178			__insert_pending(inode, first);
2179		else
2180			__remove_pending(inode, first);
2181
2182		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2183		if (last != end)
2184			l_del = __es_scan_range(inode, &ext4_es_is_delonly,
2185						end + 1, last);
2186		if (l_del)
2187			__insert_pending(inode, last);
2188		else
2189			__remove_pending(inode, last);
2190	}
2191}