v3.5.6
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/module.h>
   8#include <linux/spinlock.h>
   9#include <linux/blkdev.h>
  10#include <linux/swap.h>
  11#include <linux/writeback.h>
  12#include <linux/pagevec.h>
  13#include <linux/prefetch.h>
  14#include <linux/cleancache.h>
  15#include "extent_io.h"
  16#include "extent_map.h"
  17#include "compat.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
  24
  25static struct kmem_cache *extent_state_cache;
  26static struct kmem_cache *extent_buffer_cache;
  27
  28static LIST_HEAD(buffers);
  29static LIST_HEAD(states);
  30
  31#define LEAK_DEBUG 0
  32#if LEAK_DEBUG
  33static DEFINE_SPINLOCK(leak_lock);
  34#endif
  35
  36#define BUFFER_LRU_MAX 64
  37
  38struct tree_entry {
  39	u64 start;
  40	u64 end;
  41	struct rb_node rb_node;
  42};
  43
  44struct extent_page_data {
  45	struct bio *bio;
  46	struct extent_io_tree *tree;
  47	get_extent_t *get_extent;
  48
  49	/* tells writepage not to lock the state bits for this range
  50	 * it still does the unlocking
  51	 */
  52	unsigned int extent_locked:1;
  53
  54	/* tells the submit_bio code to use a WRITE_SYNC */
  55	unsigned int sync_io:1;
  56};
  57
  58static noinline void flush_write_bio(void *data);
  59static inline struct btrfs_fs_info *
  60tree_fs_info(struct extent_io_tree *tree)
  61{
  62	return btrfs_sb(tree->mapping->host->i_sb);
  63}
  64
  65int __init extent_io_init(void)
  66{
  67	extent_state_cache = kmem_cache_create("extent_state",
  68			sizeof(struct extent_state), 0,
  69			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  70	if (!extent_state_cache)
  71		return -ENOMEM;
  72
  73	extent_buffer_cache = kmem_cache_create("extent_buffers",
  74			sizeof(struct extent_buffer), 0,
  75			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  76	if (!extent_buffer_cache)
  77		goto free_state_cache;
  78	return 0;
  79
  80free_state_cache:
  81	kmem_cache_destroy(extent_state_cache);
  82	return -ENOMEM;
  83}
  84
  85void extent_io_exit(void)
  86{
  87	struct extent_state *state;
  88	struct extent_buffer *eb;
  89
  90	while (!list_empty(&states)) {
  91		state = list_entry(states.next, struct extent_state, leak_list);
  92		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
  93		       "state %lu in tree %p refs %d\n",
  94		       (unsigned long long)state->start,
  95		       (unsigned long long)state->end,
  96		       state->state, state->tree, atomic_read(&state->refs));
  97		list_del(&state->leak_list);
  98		kmem_cache_free(extent_state_cache, state);
  99
 100	}
 101
 102	while (!list_empty(&buffers)) {
 103		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 104		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
 105		       "refs %d\n", (unsigned long long)eb->start,
 106		       eb->len, atomic_read(&eb->refs));
 107		list_del(&eb->leak_list);
 108		kmem_cache_free(extent_buffer_cache, eb);
 109	}
 110	if (extent_state_cache)
 111		kmem_cache_destroy(extent_state_cache);
 112	if (extent_buffer_cache)
 113		kmem_cache_destroy(extent_buffer_cache);
 114}
 115
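A minimal usage sketch (hypothetical module init/exit, not taken from this file): the slab caches above are created once when the filesystem module loads and destroyed when it unloads, with extent_io_exit() reporting any leaked states or buffers first.

static int __init example_fs_init(void)
{
	int ret;

	ret = extent_io_init();		/* create the two kmem caches */
	if (ret)
		return ret;
	/* ... register the filesystem, create other caches ... */
	return 0;
}

static void __exit example_fs_exit(void)
{
	/* ... unregister the filesystem ... */
	extent_io_exit();		/* print leak reports, destroy the caches */
}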
 116void extent_io_tree_init(struct extent_io_tree *tree,
 117			 struct address_space *mapping)
 118{
 119	tree->state = RB_ROOT;
 120	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 121	tree->ops = NULL;
 122	tree->dirty_bytes = 0;
 123	spin_lock_init(&tree->lock);
 124	spin_lock_init(&tree->buffer_lock);
 125	tree->mapping = mapping;
 126}
 127
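A hedged sketch of typical setup (hypothetical helper): the tree is normally embedded in a per-inode structure, which the rest of this file reaches as BTRFS_I(inode)->io_tree, and is bound to that inode's page-cache mapping.

static void example_init_inode_io_tree(struct inode *inode)
{
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	extent_io_tree_init(tree, inode->i_mapping);
	/* tree->ops is left NULL here; the owner may install extent_io_ops hooks */
}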
 128static struct extent_state *alloc_extent_state(gfp_t mask)
 129{
 130	struct extent_state *state;
 131#if LEAK_DEBUG
 132	unsigned long flags;
 133#endif
 134
 135	state = kmem_cache_alloc(extent_state_cache, mask);
 136	if (!state)
 137		return state;
 138	state->state = 0;
 139	state->private = 0;
 140	state->tree = NULL;
 141#if LEAK_DEBUG
 142	spin_lock_irqsave(&leak_lock, flags);
 143	list_add(&state->leak_list, &states);
 144	spin_unlock_irqrestore(&leak_lock, flags);
 145#endif
 146	atomic_set(&state->refs, 1);
 147	init_waitqueue_head(&state->wq);
 148	trace_alloc_extent_state(state, mask, _RET_IP_);
 149	return state;
 150}
 151
 152void free_extent_state(struct extent_state *state)
 153{
 154	if (!state)
 155		return;
 156	if (atomic_dec_and_test(&state->refs)) {
 157#if LEAK_DEBUG
 158		unsigned long flags;
 159#endif
 160		WARN_ON(state->tree);
 161#if LEAK_DEBUG
 162		spin_lock_irqsave(&leak_lock, flags);
 163		list_del(&state->leak_list);
 164		spin_unlock_irqrestore(&leak_lock, flags);
 165#endif
 166		trace_free_extent_state(state, _RET_IP_);
 167		kmem_cache_free(extent_state_cache, state);
 168	}
 169}
 170
 171static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 172				   struct rb_node *node)
 173{
 174	struct rb_node **p = &root->rb_node;
 175	struct rb_node *parent = NULL;
 176	struct tree_entry *entry;
 177
 178	while (*p) {
 179		parent = *p;
 180		entry = rb_entry(parent, struct tree_entry, rb_node);
 181
 182		if (offset < entry->start)
 183			p = &(*p)->rb_left;
 184		else if (offset > entry->end)
 185			p = &(*p)->rb_right;
 186		else
 187			return parent;
 188	}
 189
 190	rb_link_node(node, parent, p);
 191	rb_insert_color(node, root);
 192	return NULL;
 193}
 194
 195static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 196				     struct rb_node **prev_ret,
 197				     struct rb_node **next_ret)
 198{
 199	struct rb_root *root = &tree->state;
 200	struct rb_node *n = root->rb_node;
 201	struct rb_node *prev = NULL;
 202	struct rb_node *orig_prev = NULL;
 203	struct tree_entry *entry;
 204	struct tree_entry *prev_entry = NULL;
 205
 206	while (n) {
 207		entry = rb_entry(n, struct tree_entry, rb_node);
 208		prev = n;
 209		prev_entry = entry;
 210
 211		if (offset < entry->start)
 212			n = n->rb_left;
 213		else if (offset > entry->end)
 214			n = n->rb_right;
 215		else
 216			return n;
 217	}
 218
 219	if (prev_ret) {
 220		orig_prev = prev;
 221		while (prev && offset > prev_entry->end) {
 222			prev = rb_next(prev);
 223			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 224		}
 225		*prev_ret = prev;
 226		prev = orig_prev;
 227	}
 228
 229	if (next_ret) {
 230		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 231		while (prev && offset < prev_entry->start) {
 232			prev = rb_prev(prev);
 233			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 234		}
 235		*next_ret = prev;
 236	}
 237	return NULL;
 238}
 239
 240static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 241					  u64 offset)
 242{
 243	struct rb_node *prev = NULL;
 244	struct rb_node *ret;
 245
 246	ret = __etree_search(tree, offset, &prev, NULL);
 247	if (!ret)
 248		return prev;
 249	return ret;
 250}
 251
 252static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 253		     struct extent_state *other)
 254{
 255	if (tree->ops && tree->ops->merge_extent_hook)
 256		tree->ops->merge_extent_hook(tree->mapping->host, new,
 257					     other);
 258}
 259
 260/*
 261 * utility function to look for merge candidates inside a given range.
 262 * Any extents with matching state are merged together into a single
 263 * extent in the tree.  Extents with EXTENT_IO in their state field
 264 * are not merged because the end_io handlers need to be able to do
 265 * operations on them without sleeping (or doing allocations/splits).
 266 *
 267 * This should be called with the tree lock held.
 268 */
 269static void merge_state(struct extent_io_tree *tree,
 270		        struct extent_state *state)
 271{
 272	struct extent_state *other;
 273	struct rb_node *other_node;
 274
 275	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 276		return;
 277
 278	other_node = rb_prev(&state->rb_node);
 279	if (other_node) {
 280		other = rb_entry(other_node, struct extent_state, rb_node);
 281		if (other->end == state->start - 1 &&
 282		    other->state == state->state) {
 283			merge_cb(tree, state, other);
 284			state->start = other->start;
 285			other->tree = NULL;
 286			rb_erase(&other->rb_node, &tree->state);
 287			free_extent_state(other);
 288		}
 289	}
 290	other_node = rb_next(&state->rb_node);
 291	if (other_node) {
 292		other = rb_entry(other_node, struct extent_state, rb_node);
 293		if (other->start == state->end + 1 &&
 294		    other->state == state->state) {
 295			merge_cb(tree, state, other);
 296			state->end = other->end;
 297			other->tree = NULL;
 298			rb_erase(&other->rb_node, &tree->state);
 299			free_extent_state(other);
 300		}
 301	}
 302}
 303
 304static void set_state_cb(struct extent_io_tree *tree,
 305			 struct extent_state *state, int *bits)
 306{
 307	if (tree->ops && tree->ops->set_bit_hook)
 308		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 309}
 310
 311static void clear_state_cb(struct extent_io_tree *tree,
 312			   struct extent_state *state, int *bits)
 313{
 314	if (tree->ops && tree->ops->clear_bit_hook)
 315		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 316}
 317
 318static void set_state_bits(struct extent_io_tree *tree,
 319			   struct extent_state *state, int *bits);
 320
 321/*
 322 * insert an extent_state struct into the tree.  'bits' are set on the
 323 * struct before it is inserted.
 324 *
 325 * This may return -EEXIST if the extent is already there, in which case the
 326 * state struct is freed.
 327 *
 328 * The tree lock is not taken internally.  This is a utility function and
 329 * probably isn't what you want to call (see set/clear_extent_bit).
 330 */
 331static int insert_state(struct extent_io_tree *tree,
 332			struct extent_state *state, u64 start, u64 end,
 333			int *bits)
 334{
 335	struct rb_node *node;
 336
 337	if (end < start) {
 338		printk(KERN_ERR "btrfs end < start %llu %llu\n",
 339		       (unsigned long long)end,
 340		       (unsigned long long)start);
 341		WARN_ON(1);
 342	}
 343	state->start = start;
 344	state->end = end;
 345
 346	set_state_bits(tree, state, bits);
 347
 348	node = tree_insert(&tree->state, end, &state->rb_node);
 349	if (node) {
 350		struct extent_state *found;
 351		found = rb_entry(node, struct extent_state, rb_node);
 352		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
 353		       "%llu %llu\n", (unsigned long long)found->start,
 354		       (unsigned long long)found->end,
 355		       (unsigned long long)start, (unsigned long long)end);
 356		return -EEXIST;
 357	}
 358	state->tree = tree;
 359	merge_state(tree, state);
 360	return 0;
 361}
 362
 363static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 364		     u64 split)
 365{
 366	if (tree->ops && tree->ops->split_extent_hook)
 367		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 368}
 369
 370/*
 371 * split a given extent state struct in two, inserting the preallocated
 372 * struct 'prealloc' as the newly created second half.  'split' indicates an
 373 * offset inside 'orig' where it should be split.
 374 *
 375 * Before calling,
 376 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 377 * are two extent state structs in the tree:
 378 * prealloc: [orig->start, split - 1]
 379 * orig: [ split, orig->end ]
 380 *
 381 * The tree locks are not taken by this function. They need to be held
 382 * by the caller.
 383 */
 384static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 385		       struct extent_state *prealloc, u64 split)
 386{
 387	struct rb_node *node;
 388
 389	split_cb(tree, orig, split);
 390
 391	prealloc->start = orig->start;
 392	prealloc->end = split - 1;
 393	prealloc->state = orig->state;
 394	orig->start = split;
 395
 396	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 397	if (node) {
 398		free_extent_state(prealloc);
 399		return -EEXIST;
 400	}
 401	prealloc->tree = tree;
 402	return 0;
 403}
 404
 405static struct extent_state *next_state(struct extent_state *state)
 406{
 407	struct rb_node *next = rb_next(&state->rb_node);
 408	if (next)
 409		return rb_entry(next, struct extent_state, rb_node);
 410	else
 411		return NULL;
 412}
 413
 414/*
 415 * utility function to clear some bits in an extent state struct.
 416 * it will optionally wake up any one waiting on this state (wake == 1).
 417 *
 418 * If no bits are set on the state struct after clearing things, the
 419 * struct is freed and removed from the tree
 420 */
 421static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 422					    struct extent_state *state,
 423					    int *bits, int wake)
 424{
 425	struct extent_state *next;
 426	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 427
 428	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 429		u64 range = state->end - state->start + 1;
 430		WARN_ON(range > tree->dirty_bytes);
 431		tree->dirty_bytes -= range;
 432	}
 433	clear_state_cb(tree, state, bits);
 434	state->state &= ~bits_to_clear;
 435	if (wake)
 436		wake_up(&state->wq);
 437	if (state->state == 0) {
 438		next = next_state(state);
 439		if (state->tree) {
 440			rb_erase(&state->rb_node, &tree->state);
 441			state->tree = NULL;
 442			free_extent_state(state);
 443		} else {
 444			WARN_ON(1);
 445		}
 446	} else {
 447		merge_state(tree, state);
 448		next = next_state(state);
 449	}
 450	return next;
 451}
 452
 453static struct extent_state *
 454alloc_extent_state_atomic(struct extent_state *prealloc)
 455{
 456	if (!prealloc)
 457		prealloc = alloc_extent_state(GFP_ATOMIC);
 458
 459	return prealloc;
 460}
 461
 462void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 463{
 464	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
 465		    "Extent tree was modified by another "
 466		    "thread while locked.");
 467}
 468
 469/*
 470 * clear some bits on a range in the tree.  This may require splitting
 471 * or inserting elements in the tree, so the gfp mask is used to
 472 * indicate which allocations or sleeping are allowed.
 473 *
 474 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 475 * the given range from the tree regardless of state (ie for truncate).
 476 *
 477 * the range [start, end] is inclusive.
 478 *
 479 * This takes the tree lock, and returns 0 on success and < 0 on error.
 480 */
 481int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 482		     int bits, int wake, int delete,
 483		     struct extent_state **cached_state,
 484		     gfp_t mask)
 485{
 486	struct extent_state *state;
 487	struct extent_state *cached;
 488	struct extent_state *prealloc = NULL;
 489	struct rb_node *node;
 490	u64 last_end;
 491	int err;
 492	int clear = 0;
 493
 494	if (delete)
 495		bits |= ~EXTENT_CTLBITS;
 496	bits |= EXTENT_FIRST_DELALLOC;
 497
 498	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 499		clear = 1;
 500again:
 501	if (!prealloc && (mask & __GFP_WAIT)) {
 502		prealloc = alloc_extent_state(mask);
 503		if (!prealloc)
 504			return -ENOMEM;
 505	}
 506
 507	spin_lock(&tree->lock);
 508	if (cached_state) {
 509		cached = *cached_state;
 510
 511		if (clear) {
 512			*cached_state = NULL;
 513			cached_state = NULL;
 514		}
 515
 516		if (cached && cached->tree && cached->start <= start &&
 517		    cached->end > start) {
 518			if (clear)
 519				atomic_dec(&cached->refs);
 520			state = cached;
 521			goto hit_next;
 522		}
 523		if (clear)
 524			free_extent_state(cached);
 525	}
 526	/*
 527	 * this search will find the extents that end after
 528	 * our range starts
 529	 */
 530	node = tree_search(tree, start);
 531	if (!node)
 532		goto out;
 533	state = rb_entry(node, struct extent_state, rb_node);
 534hit_next:
 535	if (state->start > end)
 536		goto out;
 537	WARN_ON(state->end < start);
 538	last_end = state->end;
 539
 540	/* the state doesn't have the wanted bits, go ahead */
 541	if (!(state->state & bits)) {
 542		state = next_state(state);
 543		goto next;
 544	}
 545
 546	/*
 547	 *     | ---- desired range ---- |
 548	 *  | state | or
 549	 *  | ------------- state -------------- |
 550	 *
 551	 * We need to split the extent we found, and may flip
 552	 * bits on second half.
 553	 *
 554	 * If the extent we found extends past our range, we
 555	 * just split and search again.  It'll get split again
 556	 * the next time though.
 557	 *
 558	 * If the extent we found is inside our range, we clear
 559	 * the desired bit on it.
 560	 */
 561
 562	if (state->start < start) {
 563		prealloc = alloc_extent_state_atomic(prealloc);
 564		BUG_ON(!prealloc);
 565		err = split_state(tree, state, prealloc, start);
 566		if (err)
 567			extent_io_tree_panic(tree, err);
 568
 569		prealloc = NULL;
 570		if (err)
 571			goto out;
 572		if (state->end <= end) {
 573			state = clear_state_bit(tree, state, &bits, wake);
 574			goto next;
 575		}
 576		goto search_again;
 577	}
 578	/*
 579	 * | ---- desired range ---- |
 580	 *                        | state |
 581	 * We need to split the extent, and clear the bit
 582	 * on the first half
 583	 */
 584	if (state->start <= end && state->end > end) {
 585		prealloc = alloc_extent_state_atomic(prealloc);
 586		BUG_ON(!prealloc);
 587		err = split_state(tree, state, prealloc, end + 1);
 588		if (err)
 589			extent_io_tree_panic(tree, err);
 590
 591		if (wake)
 592			wake_up(&state->wq);
 593
 594		clear_state_bit(tree, prealloc, &bits, wake);
 595
 596		prealloc = NULL;
 597		goto out;
 598	}
 599
 600	state = clear_state_bit(tree, state, &bits, wake);
 601next:
 602	if (last_end == (u64)-1)
 603		goto out;
 604	start = last_end + 1;
 605	if (start <= end && state && !need_resched())
 606		goto hit_next;
 607	goto search_again;
 608
 609out:
 610	spin_unlock(&tree->lock);
 611	if (prealloc)
 612		free_extent_state(prealloc);
 613
 614	return 0;
 615
 616search_again:
 617	if (start > end)
 618		goto out;
 619	spin_unlock(&tree->lock);
 620	if (mask & __GFP_WAIT)
 621		cond_resched();
 622	goto again;
 623}
 624
 625static void wait_on_state(struct extent_io_tree *tree,
 626			  struct extent_state *state)
 627		__releases(tree->lock)
 628		__acquires(tree->lock)
 629{
 630	DEFINE_WAIT(wait);
 631	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 632	spin_unlock(&tree->lock);
 633	schedule();
 634	spin_lock(&tree->lock);
 635	finish_wait(&state->wq, &wait);
 636}
 637
 638/*
 639 * waits for one or more bits to clear on a range in the state tree.
 640 * The range [start, end] is inclusive.
 641 * The tree lock is taken by this function
 642 */
 643void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 644{
 645	struct extent_state *state;
 646	struct rb_node *node;
 647
 648	spin_lock(&tree->lock);
 649again:
 650	while (1) {
 651		/*
 652		 * this search will find all the extents that end after
 653		 * our range starts
 654		 */
 655		node = tree_search(tree, start);
 656		if (!node)
 657			break;
 658
 659		state = rb_entry(node, struct extent_state, rb_node);
 660
 661		if (state->start > end)
 662			goto out;
 663
 664		if (state->state & bits) {
 665			start = state->start;
 666			atomic_inc(&state->refs);
 667			wait_on_state(tree, state);
 668			free_extent_state(state);
 669			goto again;
 670		}
 671		start = state->end + 1;
 672
 673		if (start > end)
 674			break;
 675
 676		cond_resched_lock(&tree->lock);
 677	}
 678out:
 679	spin_unlock(&tree->lock);
 680}
 681
 682static void set_state_bits(struct extent_io_tree *tree,
 683			   struct extent_state *state,
 684			   int *bits)
 685{
 686	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 687
 688	set_state_cb(tree, state, bits);
 689	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 690		u64 range = state->end - state->start + 1;
 691		tree->dirty_bytes += range;
 692	}
 693	state->state |= bits_to_set;
 694}
 695
 696static void cache_state(struct extent_state *state,
 697			struct extent_state **cached_ptr)
 698{
 699	if (cached_ptr && !(*cached_ptr)) {
 700		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 701			*cached_ptr = state;
 702			atomic_inc(&state->refs);
 703		}
 704	}
 705}
 706
 707static void uncache_state(struct extent_state **cached_ptr)
 708{
 709	if (cached_ptr && (*cached_ptr)) {
 710		struct extent_state *state = *cached_ptr;
 711		*cached_ptr = NULL;
 712		free_extent_state(state);
 713	}
 714}
 715
 716/*
 717 * set some bits on a range in the tree.  This may require allocations or
 718 * sleeping, so the gfp mask is used to indicate what is allowed.
 719 *
 720 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 721 * part of the range already has the desired bits set.  The start of the
 722 * existing range is returned in failed_start in this case.
 723 *
  724 * [start, end] is inclusive.  This takes the tree lock.
 725 */
 726
 727static int __must_check
 728__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 729		 int bits, int exclusive_bits, u64 *failed_start,
 730		 struct extent_state **cached_state, gfp_t mask)
 731{
 732	struct extent_state *state;
 733	struct extent_state *prealloc = NULL;
 734	struct rb_node *node;
 735	int err = 0;
 736	u64 last_start;
 737	u64 last_end;
 738
 739	bits |= EXTENT_FIRST_DELALLOC;
 740again:
 741	if (!prealloc && (mask & __GFP_WAIT)) {
 742		prealloc = alloc_extent_state(mask);
 743		BUG_ON(!prealloc);
 744	}
 745
 746	spin_lock(&tree->lock);
 747	if (cached_state && *cached_state) {
 748		state = *cached_state;
 749		if (state->start <= start && state->end > start &&
 750		    state->tree) {
 751			node = &state->rb_node;
 752			goto hit_next;
 753		}
 754	}
 755	/*
 756	 * this search will find all the extents that end after
 757	 * our range starts.
 758	 */
 759	node = tree_search(tree, start);
 760	if (!node) {
 761		prealloc = alloc_extent_state_atomic(prealloc);
 762		BUG_ON(!prealloc);
 763		err = insert_state(tree, prealloc, start, end, &bits);
 764		if (err)
 765			extent_io_tree_panic(tree, err);
 766
 767		prealloc = NULL;
 768		goto out;
 769	}
 770	state = rb_entry(node, struct extent_state, rb_node);
 771hit_next:
 772	last_start = state->start;
 773	last_end = state->end;
 774
 775	/*
 776	 * | ---- desired range ---- |
 777	 * | state |
 778	 *
 779	 * Just lock what we found and keep going
 780	 */
 781	if (state->start == start && state->end <= end) {
 782		if (state->state & exclusive_bits) {
 783			*failed_start = state->start;
 784			err = -EEXIST;
 785			goto out;
 786		}
 787
 788		set_state_bits(tree, state, &bits);
 789		cache_state(state, cached_state);
 790		merge_state(tree, state);
 791		if (last_end == (u64)-1)
 792			goto out;
 793		start = last_end + 1;
 794		state = next_state(state);
 795		if (start < end && state && state->start == start &&
 796		    !need_resched())
 797			goto hit_next;
 798		goto search_again;
 799	}
 800
 801	/*
 802	 *     | ---- desired range ---- |
 803	 * | state |
 804	 *   or
 805	 * | ------------- state -------------- |
 806	 *
 807	 * We need to split the extent we found, and may flip bits on
 808	 * second half.
 809	 *
 810	 * If the extent we found extends past our
 811	 * range, we just split and search again.  It'll get split
 812	 * again the next time though.
 813	 *
 814	 * If the extent we found is inside our range, we set the
 815	 * desired bit on it.
 816	 */
 817	if (state->start < start) {
 818		if (state->state & exclusive_bits) {
 819			*failed_start = start;
 820			err = -EEXIST;
 821			goto out;
 822		}
 823
 824		prealloc = alloc_extent_state_atomic(prealloc);
 825		BUG_ON(!prealloc);
 826		err = split_state(tree, state, prealloc, start);
 827		if (err)
 828			extent_io_tree_panic(tree, err);
 829
 830		prealloc = NULL;
 831		if (err)
 832			goto out;
 833		if (state->end <= end) {
 834			set_state_bits(tree, state, &bits);
 835			cache_state(state, cached_state);
 836			merge_state(tree, state);
 837			if (last_end == (u64)-1)
 838				goto out;
 839			start = last_end + 1;
 840			state = next_state(state);
 841			if (start < end && state && state->start == start &&
 842			    !need_resched())
 843				goto hit_next;
 844		}
 845		goto search_again;
 846	}
 847	/*
 848	 * | ---- desired range ---- |
 849	 *     | state | or               | state |
 850	 *
 851	 * There's a hole, we need to insert something in it and
 852	 * ignore the extent we found.
 853	 */
 854	if (state->start > start) {
 855		u64 this_end;
 856		if (end < last_start)
 857			this_end = end;
 858		else
 859			this_end = last_start - 1;
 860
 861		prealloc = alloc_extent_state_atomic(prealloc);
 862		BUG_ON(!prealloc);
 863
 864		/*
  865		 * Avoid freeing 'prealloc' if it can be merged with
 866		 * the later extent.
 867		 */
 868		err = insert_state(tree, prealloc, start, this_end,
 869				   &bits);
 870		if (err)
 871			extent_io_tree_panic(tree, err);
 872
 873		cache_state(prealloc, cached_state);
 874		prealloc = NULL;
 875		start = this_end + 1;
 876		goto search_again;
 877	}
 878	/*
 879	 * | ---- desired range ---- |
 880	 *                        | state |
 881	 * We need to split the extent, and set the bit
 882	 * on the first half
 883	 */
 884	if (state->start <= end && state->end > end) {
 885		if (state->state & exclusive_bits) {
 886			*failed_start = start;
 887			err = -EEXIST;
 888			goto out;
 889		}
 890
 891		prealloc = alloc_extent_state_atomic(prealloc);
 892		BUG_ON(!prealloc);
 893		err = split_state(tree, state, prealloc, end + 1);
 894		if (err)
 895			extent_io_tree_panic(tree, err);
 896
 897		set_state_bits(tree, prealloc, &bits);
 898		cache_state(prealloc, cached_state);
 899		merge_state(tree, prealloc);
 900		prealloc = NULL;
 901		goto out;
 902	}
 903
 904	goto search_again;
 905
 906out:
 907	spin_unlock(&tree->lock);
 908	if (prealloc)
 909		free_extent_state(prealloc);
 910
 911	return err;
 912
 913search_again:
 914	if (start > end)
 915		goto out;
 916	spin_unlock(&tree->lock);
 917	if (mask & __GFP_WAIT)
 918		cond_resched();
 919	goto again;
 920}
 921
 922int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 923		   u64 *failed_start, struct extent_state **cached_state,
 924		   gfp_t mask)
 925{
 926	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 927				cached_state, mask);
 928}
 929
 930
 931/**
 932 * convert_extent_bit - convert all bits in a given range from one bit to another
 933 * @tree:	the io tree to search
 934 * @start:	the start offset in bytes
 935 * @end:	the end offset in bytes (inclusive)
 936 * @bits:	the bits to set in this range
 937 * @clear_bits:	the bits to clear in this range
 938 * @mask:	the allocation mask
 939 *
 940 * This will go through and set bits for the given range.  If any states exist
 941 * already in this range they are set with the given bit and cleared of the
 942 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 943 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 944 * boundary bits like LOCK.
 945 */
 946int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 947		       int bits, int clear_bits, gfp_t mask)
 948{
 949	struct extent_state *state;
 950	struct extent_state *prealloc = NULL;
 951	struct rb_node *node;
 952	int err = 0;
 953	u64 last_start;
 954	u64 last_end;
 955
 956again:
 957	if (!prealloc && (mask & __GFP_WAIT)) {
 958		prealloc = alloc_extent_state(mask);
 959		if (!prealloc)
 960			return -ENOMEM;
 961	}
 962
 963	spin_lock(&tree->lock);
 964	/*
 965	 * this search will find all the extents that end after
 966	 * our range starts.
 967	 */
 968	node = tree_search(tree, start);
 969	if (!node) {
 970		prealloc = alloc_extent_state_atomic(prealloc);
 971		if (!prealloc) {
 972			err = -ENOMEM;
 973			goto out;
 974		}
 975		err = insert_state(tree, prealloc, start, end, &bits);
 976		prealloc = NULL;
 977		if (err)
 978			extent_io_tree_panic(tree, err);
 979		goto out;
 980	}
 981	state = rb_entry(node, struct extent_state, rb_node);
 982hit_next:
 983	last_start = state->start;
 984	last_end = state->end;
 985
 986	/*
 987	 * | ---- desired range ---- |
 988	 * | state |
 989	 *
 990	 * Just lock what we found and keep going
 991	 */
 992	if (state->start == start && state->end <= end) {
 993		set_state_bits(tree, state, &bits);
 994		state = clear_state_bit(tree, state, &clear_bits, 0);
 995		if (last_end == (u64)-1)
 996			goto out;
 997		start = last_end + 1;
 998		if (start < end && state && state->start == start &&
 999		    !need_resched())
1000			goto hit_next;
1001		goto search_again;
1002	}
1003
1004	/*
1005	 *     | ---- desired range ---- |
1006	 * | state |
1007	 *   or
1008	 * | ------------- state -------------- |
1009	 *
1010	 * We need to split the extent we found, and may flip bits on
1011	 * second half.
1012	 *
1013	 * If the extent we found extends past our
1014	 * range, we just split and search again.  It'll get split
1015	 * again the next time though.
1016	 *
1017	 * If the extent we found is inside our range, we set the
1018	 * desired bit on it.
1019	 */
1020	if (state->start < start) {
1021		prealloc = alloc_extent_state_atomic(prealloc);
1022		if (!prealloc) {
1023			err = -ENOMEM;
1024			goto out;
1025		}
1026		err = split_state(tree, state, prealloc, start);
1027		if (err)
1028			extent_io_tree_panic(tree, err);
1029		prealloc = NULL;
1030		if (err)
1031			goto out;
1032		if (state->end <= end) {
1033			set_state_bits(tree, state, &bits);
1034			state = clear_state_bit(tree, state, &clear_bits, 0);
1035			if (last_end == (u64)-1)
1036				goto out;
1037			start = last_end + 1;
1038			if (start < end && state && state->start == start &&
1039			    !need_resched())
1040				goto hit_next;
1041		}
1042		goto search_again;
1043	}
1044	/*
1045	 * | ---- desired range ---- |
1046	 *     | state | or               | state |
1047	 *
1048	 * There's a hole, we need to insert something in it and
1049	 * ignore the extent we found.
1050	 */
1051	if (state->start > start) {
1052		u64 this_end;
1053		if (end < last_start)
1054			this_end = end;
1055		else
1056			this_end = last_start - 1;
1057
1058		prealloc = alloc_extent_state_atomic(prealloc);
1059		if (!prealloc) {
1060			err = -ENOMEM;
1061			goto out;
1062		}
1063
1064		/*
 1065		 * Avoid freeing 'prealloc' if it can be merged with
1066		 * the later extent.
1067		 */
1068		err = insert_state(tree, prealloc, start, this_end,
1069				   &bits);
1070		if (err)
1071			extent_io_tree_panic(tree, err);
1072		prealloc = NULL;
1073		start = this_end + 1;
1074		goto search_again;
1075	}
1076	/*
1077	 * | ---- desired range ---- |
1078	 *                        | state |
1079	 * We need to split the extent, and set the bit
1080	 * on the first half
1081	 */
1082	if (state->start <= end && state->end > end) {
1083		prealloc = alloc_extent_state_atomic(prealloc);
1084		if (!prealloc) {
1085			err = -ENOMEM;
1086			goto out;
1087		}
1088
1089		err = split_state(tree, state, prealloc, end + 1);
1090		if (err)
1091			extent_io_tree_panic(tree, err);
1092
1093		set_state_bits(tree, prealloc, &bits);
1094		clear_state_bit(tree, prealloc, &clear_bits, 0);
1095		prealloc = NULL;
1096		goto out;
1097	}
1098
1099	goto search_again;
1100
1101out:
1102	spin_unlock(&tree->lock);
1103	if (prealloc)
1104		free_extent_state(prealloc);
1105
1106	return err;
1107
1108search_again:
1109	if (start > end)
1110		goto out;
1111	spin_unlock(&tree->lock);
1112	if (mask & __GFP_WAIT)
1113		cond_resched();
1114	goto again;
1115}
1116
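Per the comment above, convert_extent_bit is meant for mergeable bits, e.g. flipping a range from DELALLOC to DIRTY. A minimal, hypothetical wrapper showing the calling convention:

static int example_delalloc_to_dirty(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	/* set EXTENT_DIRTY and clear EXTENT_DELALLOC over [start, end] */
	return convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				  EXTENT_DELALLOC, GFP_NOFS);
}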
1117/* wrappers around set/clear extent bit */
1118int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1119		     gfp_t mask)
1120{
1121	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1122			      NULL, mask);
1123}
1124
1125int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1126		    int bits, gfp_t mask)
1127{
1128	return set_extent_bit(tree, start, end, bits, NULL,
1129			      NULL, mask);
1130}
1131
1132int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1133		      int bits, gfp_t mask)
1134{
1135	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1136}
1137
1138int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1139			struct extent_state **cached_state, gfp_t mask)
1140{
1141	return set_extent_bit(tree, start, end,
1142			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1143			      NULL, cached_state, mask);
1144}
1145
1146int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1147		       gfp_t mask)
1148{
1149	return clear_extent_bit(tree, start, end,
1150				EXTENT_DIRTY | EXTENT_DELALLOC |
1151				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1152}
1153
1154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1155		     gfp_t mask)
1156{
1157	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1158			      NULL, mask);
1159}
1160
1161int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1162			struct extent_state **cached_state, gfp_t mask)
1163{
1164	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1165			      cached_state, mask);
1166}
1167
1168int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1169			  struct extent_state **cached_state, gfp_t mask)
1170{
1171	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1172				cached_state, mask);
1173}
1174
1175/*
 1176 * either insert or lock state struct between start and end.  Use mask to tell
1177 * us if waiting is desired.
1178 */
1179int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1180		     int bits, struct extent_state **cached_state)
1181{
1182	int err;
1183	u64 failed_start;
1184	while (1) {
1185		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1186				       EXTENT_LOCKED, &failed_start,
1187				       cached_state, GFP_NOFS);
1188		if (err == -EEXIST) {
1189			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1190			start = failed_start;
1191		} else
1192			break;
1193		WARN_ON(start > end);
1194	}
1195	return err;
1196}
1197
1198int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1199{
1200	return lock_extent_bits(tree, start, end, 0, NULL);
1201}
1202
1203int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1204{
1205	int err;
1206	u64 failed_start;
1207
1208	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1209			       &failed_start, NULL, GFP_NOFS);
1210	if (err == -EEXIST) {
1211		if (failed_start > start)
1212			clear_extent_bit(tree, start, failed_start - 1,
1213					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1214		return 0;
1215	}
1216	return 1;
1217}
1218
1219int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1220			 struct extent_state **cached, gfp_t mask)
1221{
1222	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1223				mask);
1224}
1225
1226int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1227{
1228	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1229				GFP_NOFS);
1230}
1231
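The lock helpers above usually bracket work on a byte range, with the cached extent_state threaded through to skip a second tree search on unlock; find_lock_delalloc_range further down uses exactly this shape. A hypothetical sketch:

static int example_range_is_delalloc(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	struct extent_state *cached = NULL;
	int delalloc;

	lock_extent_bits(tree, start, end, 0, &cached);
	/* [start, end] now has EXTENT_LOCKED set; inspect it while locked */
	delalloc = test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, cached);
	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
	return delalloc;
}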
1232/*
1233 * helper function to set both pages and extents in the tree writeback
1234 */
1235static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1236{
1237	unsigned long index = start >> PAGE_CACHE_SHIFT;
1238	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1239	struct page *page;
1240
1241	while (index <= end_index) {
1242		page = find_get_page(tree->mapping, index);
1243		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1244		set_page_writeback(page);
1245		page_cache_release(page);
1246		index++;
1247	}
1248	return 0;
1249}
1250
1251/* find the first state struct with 'bits' set after 'start', and
 1252 * return it.  tree->lock must be held.  NULL will be returned if
1253 * nothing was found after 'start'
1254 */
1255struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1256						 u64 start, int bits)
1257{
1258	struct rb_node *node;
1259	struct extent_state *state;
1260
1261	/*
1262	 * this search will find all the extents that end after
1263	 * our range starts.
1264	 */
1265	node = tree_search(tree, start);
1266	if (!node)
1267		goto out;
1268
1269	while (1) {
1270		state = rb_entry(node, struct extent_state, rb_node);
1271		if (state->end >= start && (state->state & bits))
1272			return state;
1273
1274		node = rb_next(node);
1275		if (!node)
1276			break;
1277	}
1278out:
1279	return NULL;
1280}
1281
1282/*
1283 * find the first offset in the io tree with 'bits' set. zero is
1284 * returned if we find something, and *start_ret and *end_ret are
1285 * set to reflect the state struct that was found.
1286 *
1287 * If nothing was found, 1 is returned. If found something, return 0.
1288 */
1289int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1290			  u64 *start_ret, u64 *end_ret, int bits)
1291{
1292	struct extent_state *state;
1293	int ret = 1;
1294
1295	spin_lock(&tree->lock);
1296	state = find_first_extent_bit_state(tree, start, bits);
1297	if (state) {
1298		*start_ret = state->start;
1299		*end_ret = state->end;
1300		ret = 0;
1301	}
1302	spin_unlock(&tree->lock);
1303	return ret;
1304}
1305
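A hypothetical sketch of walking every range with a given bit set, relying on the 0-on-found convention described above:

static void example_walk_dirty_ranges(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 end;

	while (find_first_extent_bit(tree, start, &start, &end,
				     EXTENT_DIRTY) == 0) {
		/* [start, end] has EXTENT_DIRTY set */
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
}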
1306/*
1307 * find a contiguous range of bytes in the file marked as delalloc, not
1308 * more than 'max_bytes'.  start and end are used to return the range,
1309 *
1310 * 1 is returned if we find something, 0 if nothing was in the tree
1311 */
1312static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1313					u64 *start, u64 *end, u64 max_bytes,
1314					struct extent_state **cached_state)
1315{
1316	struct rb_node *node;
1317	struct extent_state *state;
1318	u64 cur_start = *start;
1319	u64 found = 0;
1320	u64 total_bytes = 0;
1321
1322	spin_lock(&tree->lock);
1323
1324	/*
1325	 * this search will find all the extents that end after
1326	 * our range starts.
1327	 */
1328	node = tree_search(tree, cur_start);
1329	if (!node) {
1330		if (!found)
1331			*end = (u64)-1;
1332		goto out;
1333	}
1334
1335	while (1) {
1336		state = rb_entry(node, struct extent_state, rb_node);
1337		if (found && (state->start != cur_start ||
1338			      (state->state & EXTENT_BOUNDARY))) {
1339			goto out;
1340		}
1341		if (!(state->state & EXTENT_DELALLOC)) {
1342			if (!found)
1343				*end = state->end;
1344			goto out;
1345		}
1346		if (!found) {
1347			*start = state->start;
1348			*cached_state = state;
1349			atomic_inc(&state->refs);
1350		}
1351		found++;
1352		*end = state->end;
1353		cur_start = state->end + 1;
1354		node = rb_next(node);
1355		if (!node)
1356			break;
1357		total_bytes += state->end - state->start + 1;
1358		if (total_bytes >= max_bytes)
1359			break;
1360	}
1361out:
1362	spin_unlock(&tree->lock);
1363	return found;
1364}
1365
1366static noinline void __unlock_for_delalloc(struct inode *inode,
1367					   struct page *locked_page,
1368					   u64 start, u64 end)
1369{
1370	int ret;
1371	struct page *pages[16];
1372	unsigned long index = start >> PAGE_CACHE_SHIFT;
1373	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1374	unsigned long nr_pages = end_index - index + 1;
1375	int i;
1376
1377	if (index == locked_page->index && end_index == index)
1378		return;
1379
1380	while (nr_pages > 0) {
1381		ret = find_get_pages_contig(inode->i_mapping, index,
1382				     min_t(unsigned long, nr_pages,
1383				     ARRAY_SIZE(pages)), pages);
1384		for (i = 0; i < ret; i++) {
1385			if (pages[i] != locked_page)
1386				unlock_page(pages[i]);
1387			page_cache_release(pages[i]);
1388		}
1389		nr_pages -= ret;
1390		index += ret;
1391		cond_resched();
1392	}
1393}
1394
1395static noinline int lock_delalloc_pages(struct inode *inode,
1396					struct page *locked_page,
1397					u64 delalloc_start,
1398					u64 delalloc_end)
1399{
1400	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1401	unsigned long start_index = index;
1402	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1403	unsigned long pages_locked = 0;
1404	struct page *pages[16];
1405	unsigned long nrpages;
1406	int ret;
1407	int i;
1408
1409	/* the caller is responsible for locking the start index */
1410	if (index == locked_page->index && index == end_index)
1411		return 0;
1412
1413	/* skip the page at the start index */
1414	nrpages = end_index - index + 1;
1415	while (nrpages > 0) {
1416		ret = find_get_pages_contig(inode->i_mapping, index,
1417				     min_t(unsigned long,
1418				     nrpages, ARRAY_SIZE(pages)), pages);
1419		if (ret == 0) {
1420			ret = -EAGAIN;
1421			goto done;
1422		}
1423		/* now we have an array of pages, lock them all */
1424		for (i = 0; i < ret; i++) {
1425			/*
1426			 * the caller is taking responsibility for
1427			 * locked_page
1428			 */
1429			if (pages[i] != locked_page) {
1430				lock_page(pages[i]);
1431				if (!PageDirty(pages[i]) ||
1432				    pages[i]->mapping != inode->i_mapping) {
1433					ret = -EAGAIN;
1434					unlock_page(pages[i]);
1435					page_cache_release(pages[i]);
1436					goto done;
1437				}
1438			}
1439			page_cache_release(pages[i]);
1440			pages_locked++;
1441		}
1442		nrpages -= ret;
1443		index += ret;
1444		cond_resched();
1445	}
1446	ret = 0;
1447done:
1448	if (ret && pages_locked) {
1449		__unlock_for_delalloc(inode, locked_page,
1450			      delalloc_start,
1451			      ((u64)(start_index + pages_locked - 1)) <<
1452			      PAGE_CACHE_SHIFT);
1453	}
1454	return ret;
1455}
1456
1457/*
1458 * find a contiguous range of bytes in the file marked as delalloc, not
1459 * more than 'max_bytes'.  start and end are used to return the range,
1460 *
1461 * 1 is returned if we find something, 0 if nothing was in the tree
1462 */
1463static noinline u64 find_lock_delalloc_range(struct inode *inode,
1464					     struct extent_io_tree *tree,
1465					     struct page *locked_page,
1466					     u64 *start, u64 *end,
1467					     u64 max_bytes)
1468{
1469	u64 delalloc_start;
1470	u64 delalloc_end;
1471	u64 found;
1472	struct extent_state *cached_state = NULL;
1473	int ret;
1474	int loops = 0;
1475
1476again:
1477	/* step one, find a bunch of delalloc bytes starting at start */
1478	delalloc_start = *start;
1479	delalloc_end = 0;
1480	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1481				    max_bytes, &cached_state);
1482	if (!found || delalloc_end <= *start) {
1483		*start = delalloc_start;
1484		*end = delalloc_end;
1485		free_extent_state(cached_state);
1486		return found;
1487	}
1488
1489	/*
1490	 * start comes from the offset of locked_page.  We have to lock
1491	 * pages in order, so we can't process delalloc bytes before
1492	 * locked_page
1493	 */
1494	if (delalloc_start < *start)
1495		delalloc_start = *start;
1496
1497	/*
1498	 * make sure to limit the number of pages we try to lock down
1499	 * if we're looping.
1500	 */
1501	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1502		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1503
1504	/* step two, lock all the pages after the page that has start */
1505	ret = lock_delalloc_pages(inode, locked_page,
1506				  delalloc_start, delalloc_end);
1507	if (ret == -EAGAIN) {
1508		/* some of the pages are gone, lets avoid looping by
1509		 * shortening the size of the delalloc range we're searching
1510		 */
1511		free_extent_state(cached_state);
1512		if (!loops) {
1513			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1514			max_bytes = PAGE_CACHE_SIZE - offset;
1515			loops = 1;
1516			goto again;
1517		} else {
1518			found = 0;
1519			goto out_failed;
1520		}
1521	}
1522	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1523
1524	/* step three, lock the state bits for the whole range */
1525	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1526
1527	/* then test to make sure it is all still delalloc */
1528	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1529			     EXTENT_DELALLOC, 1, cached_state);
1530	if (!ret) {
1531		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1532				     &cached_state, GFP_NOFS);
1533		__unlock_for_delalloc(inode, locked_page,
1534			      delalloc_start, delalloc_end);
1535		cond_resched();
1536		goto again;
1537	}
1538	free_extent_state(cached_state);
1539	*start = delalloc_start;
1540	*end = delalloc_end;
1541out_failed:
1542	return found;
1543}
1544
1545int extent_clear_unlock_delalloc(struct inode *inode,
1546				struct extent_io_tree *tree,
1547				u64 start, u64 end, struct page *locked_page,
1548				unsigned long op)
1549{
1550	int ret;
1551	struct page *pages[16];
1552	unsigned long index = start >> PAGE_CACHE_SHIFT;
1553	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1554	unsigned long nr_pages = end_index - index + 1;
1555	int i;
1556	int clear_bits = 0;
1557
1558	if (op & EXTENT_CLEAR_UNLOCK)
1559		clear_bits |= EXTENT_LOCKED;
1560	if (op & EXTENT_CLEAR_DIRTY)
1561		clear_bits |= EXTENT_DIRTY;
1562
1563	if (op & EXTENT_CLEAR_DELALLOC)
1564		clear_bits |= EXTENT_DELALLOC;
1565
1566	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1567	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1568		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1569		    EXTENT_SET_PRIVATE2)))
1570		return 0;
1571
1572	while (nr_pages > 0) {
1573		ret = find_get_pages_contig(inode->i_mapping, index,
1574				     min_t(unsigned long,
1575				     nr_pages, ARRAY_SIZE(pages)), pages);
1576		for (i = 0; i < ret; i++) {
1577
1578			if (op & EXTENT_SET_PRIVATE2)
1579				SetPagePrivate2(pages[i]);
1580
1581			if (pages[i] == locked_page) {
1582				page_cache_release(pages[i]);
1583				continue;
1584			}
1585			if (op & EXTENT_CLEAR_DIRTY)
1586				clear_page_dirty_for_io(pages[i]);
1587			if (op & EXTENT_SET_WRITEBACK)
1588				set_page_writeback(pages[i]);
1589			if (op & EXTENT_END_WRITEBACK)
1590				end_page_writeback(pages[i]);
1591			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1592				unlock_page(pages[i]);
1593			page_cache_release(pages[i]);
1594		}
1595		nr_pages -= ret;
1596		index += ret;
1597		cond_resched();
1598	}
1599	return 0;
1600}
1601
1602/*
1603 * count the number of bytes in the tree that have a given bit(s)
1604 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1605 * cached.  The total number found is returned.
1606 */
1607u64 count_range_bits(struct extent_io_tree *tree,
1608		     u64 *start, u64 search_end, u64 max_bytes,
1609		     unsigned long bits, int contig)
1610{
1611	struct rb_node *node;
1612	struct extent_state *state;
1613	u64 cur_start = *start;
1614	u64 total_bytes = 0;
1615	u64 last = 0;
1616	int found = 0;
1617
1618	if (search_end <= cur_start) {
1619		WARN_ON(1);
1620		return 0;
1621	}
1622
1623	spin_lock(&tree->lock);
1624	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1625		total_bytes = tree->dirty_bytes;
1626		goto out;
1627	}
1628	/*
1629	 * this search will find all the extents that end after
1630	 * our range starts.
1631	 */
1632	node = tree_search(tree, cur_start);
1633	if (!node)
1634		goto out;
1635
1636	while (1) {
1637		state = rb_entry(node, struct extent_state, rb_node);
1638		if (state->start > search_end)
1639			break;
1640		if (contig && found && state->start > last + 1)
1641			break;
1642		if (state->end >= cur_start && (state->state & bits) == bits) {
1643			total_bytes += min(search_end, state->end) + 1 -
1644				       max(cur_start, state->start);
1645			if (total_bytes >= max_bytes)
1646				break;
1647			if (!found) {
1648				*start = max(cur_start, state->start);
1649				found = 1;
1650			}
1651			last = state->end;
1652		} else if (contig && found) {
1653			break;
1654		}
1655		node = rb_next(node);
1656		if (!node)
1657			break;
1658	}
1659out:
1660	spin_unlock(&tree->lock);
1661	return total_bytes;
1662}
1663
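clean_io_failure further down uses count_range_bits with max_bytes == 1 as a cheap emptiness test on the per-inode failure tree; a hypothetical sketch of that style of call:

static int example_tree_has_dirty(struct extent_io_tree *tree)
{
	u64 start = 0;

	/* counts at most one byte; non-zero means some EXTENT_DIRTY range exists */
	return count_range_bits(tree, &start, (u64)-1, 1, EXTENT_DIRTY, 0) != 0;
}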
1664/*
1665 * set the private field for a given byte offset in the tree.  If there isn't
1666 * an extent_state there already, this does nothing.
1667 */
1668int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1669{
1670	struct rb_node *node;
1671	struct extent_state *state;
1672	int ret = 0;
1673
1674	spin_lock(&tree->lock);
1675	/*
1676	 * this search will find all the extents that end after
1677	 * our range starts.
1678	 */
1679	node = tree_search(tree, start);
1680	if (!node) {
1681		ret = -ENOENT;
1682		goto out;
1683	}
1684	state = rb_entry(node, struct extent_state, rb_node);
1685	if (state->start != start) {
1686		ret = -ENOENT;
1687		goto out;
1688	}
1689	state->private = private;
1690out:
1691	spin_unlock(&tree->lock);
1692	return ret;
1693}
1694
1695int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1696{
1697	struct rb_node *node;
1698	struct extent_state *state;
1699	int ret = 0;
1700
1701	spin_lock(&tree->lock);
1702	/*
1703	 * this search will find all the extents that end after
1704	 * our range starts.
1705	 */
1706	node = tree_search(tree, start);
1707	if (!node) {
1708		ret = -ENOENT;
1709		goto out;
1710	}
1711	state = rb_entry(node, struct extent_state, rb_node);
1712	if (state->start != start) {
1713		ret = -ENOENT;
1714		goto out;
1715	}
1716	*private = state->private;
1717out:
1718	spin_unlock(&tree->lock);
1719	return ret;
1720}
1721
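The private field holds one u64 per extent_state; the failure-handling code below round-trips a pointer through it by casting. A hypothetical sketch of the read side:

static void *example_lookup_private(struct extent_io_tree *tree, u64 start)
{
	u64 private;

	/* start must exactly match an existing extent_state->start */
	if (get_state_private(tree, start, &private))
		return NULL;
	return (void *)(unsigned long)private;
}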
1722/*
1723 * searches a range in the state tree for a given mask.
1724 * If 'filled' == 1, this returns 1 only if every extent in the tree
1725 * has the bits set.  Otherwise, 1 is returned if any bit in the
1726 * range is found set.
1727 */
1728int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1729		   int bits, int filled, struct extent_state *cached)
1730{
1731	struct extent_state *state = NULL;
1732	struct rb_node *node;
1733	int bitset = 0;
1734
1735	spin_lock(&tree->lock);
1736	if (cached && cached->tree && cached->start <= start &&
1737	    cached->end > start)
1738		node = &cached->rb_node;
1739	else
1740		node = tree_search(tree, start);
1741	while (node && start <= end) {
1742		state = rb_entry(node, struct extent_state, rb_node);
1743
1744		if (filled && state->start > start) {
1745			bitset = 0;
1746			break;
1747		}
1748
1749		if (state->start > end)
1750			break;
1751
1752		if (state->state & bits) {
1753			bitset = 1;
1754			if (!filled)
1755				break;
1756		} else if (filled) {
1757			bitset = 0;
1758			break;
1759		}
1760
1761		if (state->end == (u64)-1)
1762			break;
1763
1764		start = state->end + 1;
1765		if (start > end)
1766			break;
1767		node = rb_next(node);
1768		if (!node) {
1769			if (filled)
1770				bitset = 0;
1771			break;
1772		}
1773	}
1774	spin_unlock(&tree->lock);
1775	return bitset;
1776}
1777
1778/*
1779 * helper function to set a given page up to date if all the
1780 * extents in the tree for that page are up to date
1781 */
1782static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1783{
1784	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1785	u64 end = start + PAGE_CACHE_SIZE - 1;
1786	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1787		SetPageUptodate(page);
1788}
1789
1790/*
1791 * helper function to unlock a page if all the extents in the tree
1792 * for that page are unlocked
1793 */
1794static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1795{
1796	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1797	u64 end = start + PAGE_CACHE_SIZE - 1;
1798	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1799		unlock_page(page);
1800}
1801
1802/*
1803 * helper function to end page writeback if all the extents
1804 * in the tree for that page are done with writeback
1805 */
1806static void check_page_writeback(struct extent_io_tree *tree,
1807				 struct page *page)
1808{
1809	end_page_writeback(page);
1810}
1811
1812/*
1813 * When IO fails, either with EIO or csum verification fails, we
1814 * try other mirrors that might have a good copy of the data.  This
1815 * io_failure_record is used to record state as we go through all the
1816 * mirrors.  If another mirror has good data, the page is set up to date
1817 * and things continue.  If a good mirror can't be found, the original
1818 * bio end_io callback is called to indicate things have failed.
1819 */
1820struct io_failure_record {
1821	struct page *page;
1822	u64 start;
1823	u64 len;
1824	u64 logical;
1825	unsigned long bio_flags;
1826	int this_mirror;
1827	int failed_mirror;
1828	int in_validation;
1829};
1830
1831static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1832				int did_repair)
1833{
1834	int ret;
1835	int err = 0;
1836	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1837
1838	set_state_private(failure_tree, rec->start, 0);
1839	ret = clear_extent_bits(failure_tree, rec->start,
1840				rec->start + rec->len - 1,
1841				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1842	if (ret)
1843		err = ret;
1844
1845	if (did_repair) {
1846		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1847					rec->start + rec->len - 1,
1848					EXTENT_DAMAGED, GFP_NOFS);
1849		if (ret && !err)
1850			err = ret;
1851	}
1852
1853	kfree(rec);
1854	return err;
1855}
1856
1857static void repair_io_failure_callback(struct bio *bio, int err)
1858{
1859	complete(bio->bi_private);
1860}
1861
1862/*
1863 * this bypasses the standard btrfs submit functions deliberately, as
1864 * the standard behavior is to write all copies in a raid setup. here we only
1865 * want to write the one bad copy. so we do the mapping for ourselves and issue
1866 * submit_bio directly.
 1867 * to avoid any synchronization issues, wait for the data after writing, which
1868 * actually prevents the read that triggered the error from finishing.
1869 * currently, there can be no more than two copies of every data bit. thus,
1870 * exactly one rewrite is required.
1871 */
1872int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1873			u64 length, u64 logical, struct page *page,
1874			int mirror_num)
1875{
1876	struct bio *bio;
1877	struct btrfs_device *dev;
1878	DECLARE_COMPLETION_ONSTACK(compl);
1879	u64 map_length = 0;
1880	u64 sector;
1881	struct btrfs_bio *bbio = NULL;
1882	int ret;
1883
1884	BUG_ON(!mirror_num);
1885
1886	bio = bio_alloc(GFP_NOFS, 1);
1887	if (!bio)
1888		return -EIO;
1889	bio->bi_private = &compl;
1890	bio->bi_end_io = repair_io_failure_callback;
1891	bio->bi_size = 0;
1892	map_length = length;
1893
1894	ret = btrfs_map_block(map_tree, WRITE, logical,
1895			      &map_length, &bbio, mirror_num);
1896	if (ret) {
1897		bio_put(bio);
1898		return -EIO;
1899	}
1900	BUG_ON(mirror_num != bbio->mirror_num);
1901	sector = bbio->stripes[mirror_num-1].physical >> 9;
1902	bio->bi_sector = sector;
1903	dev = bbio->stripes[mirror_num-1].dev;
1904	kfree(bbio);
1905	if (!dev || !dev->bdev || !dev->writeable) {
1906		bio_put(bio);
1907		return -EIO;
1908	}
1909	bio->bi_bdev = dev->bdev;
1910	bio_add_page(bio, page, length, start-page_offset(page));
1911	btrfsic_submit_bio(WRITE_SYNC, bio);
1912	wait_for_completion(&compl);
1913
1914	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1915		/* try to remap that extent elsewhere? */
1916		bio_put(bio);
1917		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1918		return -EIO;
1919	}
1920
1921	printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1922		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1923		      start, rcu_str_deref(dev->name), sector);
1924
1925	bio_put(bio);
1926	return 0;
1927}
1928
1929int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1930			 int mirror_num)
1931{
1932	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1933	u64 start = eb->start;
1934	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1935	int ret = 0;
1936
1937	for (i = 0; i < num_pages; i++) {
1938		struct page *p = extent_buffer_page(eb, i);
1939		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
1940					start, p, mirror_num);
1941		if (ret)
1942			break;
1943		start += PAGE_CACHE_SIZE;
1944	}
1945
1946	return ret;
1947}
1948
1949/*
1950 * each time an IO finishes, we do a fast check in the IO failure tree
1951 * to see if we need to process or clean up an io_failure_record
1952 */
1953static int clean_io_failure(u64 start, struct page *page)
1954{
1955	u64 private;
1956	u64 private_failure;
1957	struct io_failure_record *failrec;
1958	struct btrfs_mapping_tree *map_tree;
1959	struct extent_state *state;
1960	int num_copies;
1961	int did_repair = 0;
1962	int ret;
1963	struct inode *inode = page->mapping->host;
1964
1965	private = 0;
1966	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1967				(u64)-1, 1, EXTENT_DIRTY, 0);
1968	if (!ret)
1969		return 0;
1970
1971	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1972				&private_failure);
1973	if (ret)
1974		return 0;
1975
1976	failrec = (struct io_failure_record *)(unsigned long) private_failure;
1977	BUG_ON(!failrec->this_mirror);
1978
1979	if (failrec->in_validation) {
1980		/* there was no real error, just free the record */
1981		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1982			 failrec->start);
1983		did_repair = 1;
1984		goto out;
1985	}
1986
1987	spin_lock(&BTRFS_I(inode)->io_tree.lock);
1988	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1989					    failrec->start,
1990					    EXTENT_LOCKED);
1991	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1992
1993	if (state && state->start == failrec->start) {
1994		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1995		num_copies = btrfs_num_copies(map_tree, failrec->logical,
1996						failrec->len);
1997		if (num_copies > 1)  {
1998			ret = repair_io_failure(map_tree, start, failrec->len,
1999						failrec->logical, page,
2000						failrec->failed_mirror);
2001			did_repair = !ret;
2002		}
2003	}
2004
2005out:
2006	if (!ret)
2007		ret = free_io_failure(inode, failrec, did_repair);
2008
2009	return ret;
2010}
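/*
 * note on the record lifecycle: bio_readpage_error() below allocates the
 * io_failure_record, stashes the pointer in the inode's io_failure_tree via
 * set_state_private() and marks the range EXTENT_LOCKED | EXTENT_DIRTY there;
 * once the repair succeeded (or cannot help), free_io_failure() is what
 * releases the record and its tree bits again.
 */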
2011
2012/*
2013 * this is a generic handler for readpage errors (default
2014 * readpage_io_failed_hook). if other copies exist, read those and write back
2015 * good data to the failed position. it does not try to remap the failed
2016 * extent elsewhere, hoping the device will be smart enough to do this as
2017 * needed
2018 */
2019
2020static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2021				u64 start, u64 end, int failed_mirror,
2022				struct extent_state *state)
2023{
2024	struct io_failure_record *failrec = NULL;
2025	u64 private;
2026	struct extent_map *em;
2027	struct inode *inode = page->mapping->host;
2028	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2029	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2030	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2031	struct bio *bio;
2032	int num_copies;
2033	int ret;
2034	int read_mode;
2035	u64 logical;
2036
2037	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2038
2039	ret = get_state_private(failure_tree, start, &private);
2040	if (ret) {
2041		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2042		if (!failrec)
2043			return -ENOMEM;
2044		failrec->start = start;
2045		failrec->len = end - start + 1;
2046		failrec->this_mirror = 0;
2047		failrec->bio_flags = 0;
2048		failrec->in_validation = 0;
2049
2050		read_lock(&em_tree->lock);
2051		em = lookup_extent_mapping(em_tree, start, failrec->len);
2052		if (!em) {
2053			read_unlock(&em_tree->lock);
2054			kfree(failrec);
2055			return -EIO;
2056		}
2057
2058		if (em->start > start || em->start + em->len < start) {
2059			free_extent_map(em);
2060			em = NULL;
2061		}
2062		read_unlock(&em_tree->lock);
2063
2064		if (!em || IS_ERR(em)) {
2065			kfree(failrec);
2066			return -EIO;
2067		}
2068		logical = start - em->start;
2069		logical = em->block_start + logical;
2070		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2071			logical = em->block_start;
2072			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2073			extent_set_compress_type(&failrec->bio_flags,
2074						 em->compress_type);
2075		}
2076		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2077			 "len=%llu\n", logical, start, failrec->len);
2078		failrec->logical = logical;
2079		free_extent_map(em);
2080
2081		/* set the bits in the private failure tree */
2082		ret = set_extent_bits(failure_tree, start, end,
2083					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2084		if (ret >= 0)
2085			ret = set_state_private(failure_tree, start,
2086						(u64)(unsigned long)failrec);
2087		/* set the bits in the inode's tree */
2088		if (ret >= 0)
2089			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2090						GFP_NOFS);
2091		if (ret < 0) {
2092			kfree(failrec);
2093			return ret;
2094		}
2095	} else {
2096		failrec = (struct io_failure_record *)(unsigned long)private;
2097		pr_debug("bio_readpage_error: (found) logical=%llu, "
2098			 "start=%llu, len=%llu, validation=%d\n",
2099			 failrec->logical, failrec->start, failrec->len,
2100			 failrec->in_validation);
2101		/*
2102		 * when data can be on disk more than twice, add to failrec here
2103		 * (e.g. with a list for failed_mirror) to make
2104		 * clean_io_failure() clean all those errors at once.
2105		 */
2106	}
2107	num_copies = btrfs_num_copies(
2108			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
2109			      failrec->logical, failrec->len);
2110	if (num_copies == 1) {
2111		/*
2112		 * we only have a single copy of the data, so don't bother with
2113		 * all the retry and error correction code that follows. no
2114		 * matter what the error is, it is very likely to persist.
2115		 */
2116		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2117			 "state=%p, num_copies=%d, next_mirror %d, "
2118			 "failed_mirror %d\n", state, num_copies,
2119			 failrec->this_mirror, failed_mirror);
2120		free_io_failure(inode, failrec, 0);
2121		return -EIO;
2122	}
2123
2124	if (!state) {
2125		spin_lock(&tree->lock);
2126		state = find_first_extent_bit_state(tree, failrec->start,
2127						    EXTENT_LOCKED);
2128		if (state && state->start != failrec->start)
2129			state = NULL;
2130		spin_unlock(&tree->lock);
2131	}
2132
2133	/*
2134	 * there are two premises:
2135	 *	a) deliver good data to the caller
2136	 *	b) correct the bad sectors on disk
2137	 */
2138	if (failed_bio->bi_vcnt > 1) {
2139		/*
2140		 * to fulfill b), we need to know the exact failing sectors, as
2141		 * we don't want to rewrite any more than the failed ones. thus,
2142		 * we need separate read requests for the failed bio
2143		 *
2144		 * if the following BUG_ON triggers, our validation request got
2145		 * merged. we need separate requests for our algorithm to work.
2146		 */
2147		BUG_ON(failrec->in_validation);
2148		failrec->in_validation = 1;
2149		failrec->this_mirror = failed_mirror;
2150		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2151	} else {
2152		/*
2153		 * we're ready to fulfill a) and b) at the same time. get a good
2154		 * copy of the failed sector and if we succeed, we have set up
2155		 * everything for repair_io_failure to do the rest for us.
2156		 */
2157		if (failrec->in_validation) {
2158			BUG_ON(failrec->this_mirror != failed_mirror);
2159			failrec->in_validation = 0;
2160			failrec->this_mirror = 0;
2161		}
2162		failrec->failed_mirror = failed_mirror;
2163		failrec->this_mirror++;
2164		if (failrec->this_mirror == failed_mirror)
2165			failrec->this_mirror++;
2166		read_mode = READ_SYNC;
2167	}
2168
2169	if (!state || failrec->this_mirror > num_copies) {
2170		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2171			 "next_mirror %d, failed_mirror %d\n", state,
2172			 num_copies, failrec->this_mirror, failed_mirror);
2173		free_io_failure(inode, failrec, 0);
2174		return -EIO;
2175	}
2176
2177	bio = bio_alloc(GFP_NOFS, 1);
2178	if (!bio) {
2179		free_io_failure(inode, failrec, 0);
2180		return -EIO;
2181	}
2182	bio->bi_private = state;
2183	bio->bi_end_io = failed_bio->bi_end_io;
2184	bio->bi_sector = failrec->logical >> 9;
2185	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2186	bio->bi_size = 0;
2187
2188	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2189
2190	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2191		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2192		 failrec->this_mirror, num_copies, failrec->in_validation);
2193
2194	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2195					 failrec->this_mirror,
2196					 failrec->bio_flags, 0);
2197	return ret;
2198}
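/*
 * summary of the retry bookkeeping above: this_mirror is the next copy we
 * will read from, failed_mirror is the copy that produced the error, and
 * in_validation marks a page split out of a multi-page bio that is re-read
 * on its own so the exact failing sector is known before clean_io_failure()
 * rewrites it.
 */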
2199
2200/* lots and lots of room for performance fixes in the end_bio funcs */
2201
2202int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2203{
2204	int uptodate = (err == 0);
2205	struct extent_io_tree *tree;
2206	int ret;
2207
2208	tree = &BTRFS_I(page->mapping->host)->io_tree;
2209
2210	if (tree->ops && tree->ops->writepage_end_io_hook) {
2211		ret = tree->ops->writepage_end_io_hook(page, start,
2212					       end, NULL, uptodate);
2213		if (ret)
2214			uptodate = 0;
2215	}
2216
2217	if (!uptodate) {
2218		ClearPageUptodate(page);
2219		SetPageError(page);
2220	}
2221	return 0;
2222}
2223
2224/*
2225 * after a writepage IO is done, we need to:
2226 * clear the uptodate bits on error
2227 * clear the writeback bits in the extent tree for this IO
2228 * end_page_writeback if the page has no more pending IO
2229 *
2230 * Scheduling is not allowed, so the extent state tree is expected
2231 * to have one and only one object corresponding to this IO.
2232 */
2233static void end_bio_extent_writepage(struct bio *bio, int err)
2234{
2235	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2236	struct extent_io_tree *tree;
2237	u64 start;
2238	u64 end;
2239	int whole_page;
2240
2241	do {
2242		struct page *page = bvec->bv_page;
2243		tree = &BTRFS_I(page->mapping->host)->io_tree;
2244
2245		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2246			 bvec->bv_offset;
2247		end = start + bvec->bv_len - 1;
2248
2249		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2250			whole_page = 1;
2251		else
2252			whole_page = 0;
2253
2254		if (--bvec >= bio->bi_io_vec)
2255			prefetchw(&bvec->bv_page->flags);
2256
2257		if (end_extent_writepage(page, err, start, end))
2258			continue;
2259
2260		if (whole_page)
2261			end_page_writeback(page);
2262		else
2263			check_page_writeback(tree, page);
2264	} while (bvec >= bio->bi_io_vec);
2265
2266	bio_put(bio);
2267}
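/*
 * this handler walks the bio_vec array from back to front, prefetching the
 * next page's flags as it goes; end_bio_extent_readpage() below walks it
 * front to back.  whole_page decides between ending writeback on the page
 * directly and deferring to the partial-page helper check_page_writeback().
 */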
2268
2269/*
2270 * after a readpage IO is done, we need to:
2271 * clear the uptodate bits on error
2272 * set the uptodate bits if things worked
2273 * set the page up to date if all extents in the tree are uptodate
2274 * clear the lock bit in the extent tree
2275 * unlock the page if there are no other extents locked for it
2276 *
2277 * Scheduling is not allowed, so the extent state tree is expected
2278 * to have one and only one object corresponding to this IO.
2279 */
2280static void end_bio_extent_readpage(struct bio *bio, int err)
2281{
2282	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2283	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2284	struct bio_vec *bvec = bio->bi_io_vec;
2285	struct extent_io_tree *tree;
2286	u64 start;
2287	u64 end;
2288	int whole_page;
2289	int mirror;
2290	int ret;
2291
2292	if (err)
2293		uptodate = 0;
2294
2295	do {
2296		struct page *page = bvec->bv_page;
2297		struct extent_state *cached = NULL;
2298		struct extent_state *state;
2299
2300		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2301			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2302			 (long int)bio->bi_bdev);
2303		tree = &BTRFS_I(page->mapping->host)->io_tree;
2304
2305		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2306			bvec->bv_offset;
2307		end = start + bvec->bv_len - 1;
2308
2309		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2310			whole_page = 1;
2311		else
2312			whole_page = 0;
2313
2314		if (++bvec <= bvec_end)
2315			prefetchw(&bvec->bv_page->flags);
2316
2317		spin_lock(&tree->lock);
2318		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2319		if (state && state->start == start) {
2320			/*
2321			 * take a reference on the state, unlock will drop
2322			 * the ref
2323			 */
2324			cache_state(state, &cached);
2325		}
2326		spin_unlock(&tree->lock);
2327
2328		mirror = (int)(unsigned long)bio->bi_bdev;
2329		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2330			ret = tree->ops->readpage_end_io_hook(page, start, end,
2331							      state, mirror);
2332			if (ret)
2333				uptodate = 0;
2334			else
2335				clean_io_failure(start, page);
2336		}
2337
2338		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2339			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2340			if (!ret && !err &&
2341			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2342				uptodate = 1;
2343		} else if (!uptodate) {
2344			/*
2345			 * The generic bio_readpage_error handles errors the
2346			 * following way: If possible, new read requests are
2347			 * created and submitted and will end up in
2348			 * end_bio_extent_readpage as well (if we're lucky, not
2349			 * in the !uptodate case). In that case it returns 0 and
2350			 * we just go on with the next page in our bio. If it
2351			 * can't handle the error it will return -EIO and we
2352			 * remain responsible for that page.
2353			 */
2354			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2355			if (ret == 0) {
2356				uptodate =
2357					test_bit(BIO_UPTODATE, &bio->bi_flags);
2358				if (err)
2359					uptodate = 0;
2360				uncache_state(&cached);
2361				continue;
2362			}
2363		}
2364
2365		if (uptodate && tree->track_uptodate) {
2366			set_extent_uptodate(tree, start, end, &cached,
2367					    GFP_ATOMIC);
2368		}
2369		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2370
2371		if (whole_page) {
2372			if (uptodate) {
2373				SetPageUptodate(page);
2374			} else {
2375				ClearPageUptodate(page);
2376				SetPageError(page);
2377			}
2378			unlock_page(page);
2379		} else {
2380			if (uptodate) {
2381				check_page_uptodate(tree, page);
2382			} else {
2383				ClearPageUptodate(page);
2384				SetPageError(page);
2385			}
2386			check_page_locked(tree, page);
2387		}
2388	} while (bvec <= bvec_end);
2389
2390	bio_put(bio);
2391}
2392
2393struct bio *
2394btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2395		gfp_t gfp_flags)
2396{
2397	struct bio *bio;
2398
2399	bio = bio_alloc(gfp_flags, nr_vecs);
2400
2401	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2402		while (!bio && (nr_vecs /= 2))
2403			bio = bio_alloc(gfp_flags, nr_vecs);
2404	}
2405
2406	if (bio) {
2407		bio->bi_size = 0;
2408		bio->bi_bdev = bdev;
2409		bio->bi_sector = first_sector;
2410	}
2411	return bio;
2412}
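/*
 * if the initial bio_alloc() fails while we are in memory reclaim
 * (PF_MEMALLOC), keep halving nr_vecs until an allocation succeeds or the
 * vector count hits zero; a smaller bio is still better than failing the IO
 * outright.
 */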
2413
2414/*
2415 * Since writes are async, they will only return -ENOMEM.
2416 * Reads can return the full range of I/O error conditions.
2417 */
2418static int __must_check submit_one_bio(int rw, struct bio *bio,
2419				       int mirror_num, unsigned long bio_flags)
2420{
2421	int ret = 0;
2422	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2423	struct page *page = bvec->bv_page;
2424	struct extent_io_tree *tree = bio->bi_private;
2425	u64 start;
2426
2427	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2428
2429	bio->bi_private = NULL;
2430
2431	bio_get(bio);
2432
2433	if (tree->ops && tree->ops->submit_bio_hook)
2434		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2435					   mirror_num, bio_flags, start);
2436	else
2437		btrfsic_submit_bio(rw, bio);
2438
2439	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2440		ret = -EOPNOTSUPP;
2441	bio_put(bio);
2442	return ret;
2443}
2444
2445static int merge_bio(struct extent_io_tree *tree, struct page *page,
2446		     unsigned long offset, size_t size, struct bio *bio,
2447		     unsigned long bio_flags)
2448{
2449	int ret = 0;
2450	if (tree->ops && tree->ops->merge_bio_hook)
2451		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2452						bio_flags);
2453	BUG_ON(ret < 0);
2454	return ret;
2455
2456}
2457
2458static int submit_extent_page(int rw, struct extent_io_tree *tree,
2459			      struct page *page, sector_t sector,
2460			      size_t size, unsigned long offset,
2461			      struct block_device *bdev,
2462			      struct bio **bio_ret,
2463			      unsigned long max_pages,
2464			      bio_end_io_t end_io_func,
2465			      int mirror_num,
2466			      unsigned long prev_bio_flags,
2467			      unsigned long bio_flags)
2468{
2469	int ret = 0;
2470	struct bio *bio;
2471	int nr;
2472	int contig = 0;
2473	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2474	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2475	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2476
2477	if (bio_ret && *bio_ret) {
2478		bio = *bio_ret;
2479		if (old_compressed)
2480			contig = bio->bi_sector == sector;
2481		else
2482			contig = bio->bi_sector + (bio->bi_size >> 9) ==
2483				sector;
2484
2485		if (prev_bio_flags != bio_flags || !contig ||
2486		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
2487		    bio_add_page(bio, page, page_size, offset) < page_size) {
2488			ret = submit_one_bio(rw, bio, mirror_num,
2489					     prev_bio_flags);
2490			if (ret < 0)
2491				return ret;
2492			bio = NULL;
2493		} else {
2494			return 0;
2495		}
2496	}
2497	if (this_compressed)
2498		nr = BIO_MAX_PAGES;
2499	else
2500		nr = bio_get_nr_vecs(bdev);
2501
2502	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2503	if (!bio)
2504		return -ENOMEM;
2505
2506	bio_add_page(bio, page, page_size, offset);
2507	bio->bi_end_io = end_io_func;
2508	bio->bi_private = tree;
2509
2510	if (bio_ret)
2511		*bio_ret = bio;
2512	else
2513		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2514
2515	return ret;
2516}
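/*
 * a minimal sketch of the batching contract around *bio_ret, along the lines
 * of what __extent_read_full_page() below actually does (loop abbreviated):
 *
 *	struct bio *bio = NULL;
 *	unsigned long bio_flags = 0;
 *
 *	for each page/sector in the range:
 *		submit_extent_page(READ, tree, page, sector, size, pg_offset,
 *				   bdev, &bio, max_pages,
 *				   end_bio_extent_readpage, mirror_num,
 *				   bio_flags, this_bio_flag);
 *	if (bio)
 *		submit_one_bio(READ, bio, mirror_num, bio_flags);
 *
 * pages that are contiguous on disk (and that merge_bio() accepts) keep
 * piling into the same bio; anything else flushes the queued bio first and
 * starts a new one.
 */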
2517
2518void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
2519{
2520	if (!PagePrivate(page)) {
2521		SetPagePrivate(page);
2522		page_cache_get(page);
2523		set_page_private(page, (unsigned long)eb);
2524	} else {
2525		WARN_ON(page->private != (unsigned long)eb);
2526	}
2527}
2528
2529void set_page_extent_mapped(struct page *page)
2530{
2531	if (!PagePrivate(page)) {
2532		SetPagePrivate(page);
2533		page_cache_get(page);
2534		set_page_private(page, EXTENT_PAGE_PRIVATE);
2535	}
2536}
2537
2538/*
2539 * basic readpage implementation.  Locked extent state structs are inserted
2540 * into the tree and are removed when the IO is done (by the end_io
2541 * handlers)
2542 * XXX JDM: This needs looking at to ensure proper page locking
2543 */
2544static int __extent_read_full_page(struct extent_io_tree *tree,
2545				   struct page *page,
2546				   get_extent_t *get_extent,
2547				   struct bio **bio, int mirror_num,
2548				   unsigned long *bio_flags)
2549{
2550	struct inode *inode = page->mapping->host;
2551	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2552	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2553	u64 end;
2554	u64 cur = start;
2555	u64 extent_offset;
2556	u64 last_byte = i_size_read(inode);
2557	u64 block_start;
2558	u64 cur_end;
2559	sector_t sector;
2560	struct extent_map *em;
2561	struct block_device *bdev;
2562	struct btrfs_ordered_extent *ordered;
2563	int ret;
2564	int nr = 0;
2565	size_t pg_offset = 0;
2566	size_t iosize;
2567	size_t disk_io_size;
2568	size_t blocksize = inode->i_sb->s_blocksize;
2569	unsigned long this_bio_flag = 0;
2570
2571	set_page_extent_mapped(page);
2572
2573	if (!PageUptodate(page)) {
2574		if (cleancache_get_page(page) == 0) {
2575			BUG_ON(blocksize != PAGE_SIZE);
2576			goto out;
2577		}
2578	}
2579
2580	end = page_end;
2581	while (1) {
2582		lock_extent(tree, start, end);
2583		ordered = btrfs_lookup_ordered_extent(inode, start);
2584		if (!ordered)
2585			break;
2586		unlock_extent(tree, start, end);
2587		btrfs_start_ordered_extent(inode, ordered, 1);
2588		btrfs_put_ordered_extent(ordered);
2589	}
2590
2591	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2592		char *userpage;
2593		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2594
2595		if (zero_offset) {
2596			iosize = PAGE_CACHE_SIZE - zero_offset;
2597			userpage = kmap_atomic(page);
2598			memset(userpage + zero_offset, 0, iosize);
2599			flush_dcache_page(page);
2600			kunmap_atomic(userpage);
2601		}
2602	}
2603	while (cur <= end) {
2604		if (cur >= last_byte) {
2605			char *userpage;
2606			struct extent_state *cached = NULL;
2607
2608			iosize = PAGE_CACHE_SIZE - pg_offset;
2609			userpage = kmap_atomic(page);
2610			memset(userpage + pg_offset, 0, iosize);
2611			flush_dcache_page(page);
2612			kunmap_atomic(userpage);
2613			set_extent_uptodate(tree, cur, cur + iosize - 1,
2614					    &cached, GFP_NOFS);
2615			unlock_extent_cached(tree, cur, cur + iosize - 1,
2616					     &cached, GFP_NOFS);
2617			break;
2618		}
2619		em = get_extent(inode, page, pg_offset, cur,
2620				end - cur + 1, 0);
2621		if (IS_ERR_OR_NULL(em)) {
2622			SetPageError(page);
2623			unlock_extent(tree, cur, end);
2624			break;
2625		}
2626		extent_offset = cur - em->start;
2627		BUG_ON(extent_map_end(em) <= cur);
2628		BUG_ON(end < cur);
2629
2630		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2631			this_bio_flag = EXTENT_BIO_COMPRESSED;
2632			extent_set_compress_type(&this_bio_flag,
2633						 em->compress_type);
2634		}
2635
2636		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2637		cur_end = min(extent_map_end(em) - 1, end);
2638		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2639		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2640			disk_io_size = em->block_len;
2641			sector = em->block_start >> 9;
2642		} else {
2643			sector = (em->block_start + extent_offset) >> 9;
2644			disk_io_size = iosize;
2645		}
2646		bdev = em->bdev;
2647		block_start = em->block_start;
2648		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2649			block_start = EXTENT_MAP_HOLE;
2650		free_extent_map(em);
2651		em = NULL;
2652
2653		/* we've found a hole, just zero and go on */
2654		if (block_start == EXTENT_MAP_HOLE) {
2655			char *userpage;
2656			struct extent_state *cached = NULL;
2657
2658			userpage = kmap_atomic(page);
2659			memset(userpage + pg_offset, 0, iosize);
2660			flush_dcache_page(page);
2661			kunmap_atomic(userpage);
2662
2663			set_extent_uptodate(tree, cur, cur + iosize - 1,
2664					    &cached, GFP_NOFS);
2665			unlock_extent_cached(tree, cur, cur + iosize - 1,
2666			                     &cached, GFP_NOFS);
2667			cur = cur + iosize;
2668			pg_offset += iosize;
2669			continue;
2670		}
2671		/* the get_extent function already copied into the page */
2672		if (test_range_bit(tree, cur, cur_end,
2673				   EXTENT_UPTODATE, 1, NULL)) {
2674			check_page_uptodate(tree, page);
2675			unlock_extent(tree, cur, cur + iosize - 1);
2676			cur = cur + iosize;
2677			pg_offset += iosize;
2678			continue;
2679		}
2680		/* we have an inline extent but it didn't get marked
2681		 * uptodate.  Error out
2682		 */
2683		if (block_start == EXTENT_MAP_INLINE) {
2684			SetPageError(page);
2685			unlock_extent(tree, cur, cur + iosize - 1);
2686			cur = cur + iosize;
2687			pg_offset += iosize;
2688			continue;
2689		}
2690
2691		ret = 0;
2692		if (tree->ops && tree->ops->readpage_io_hook) {
2693			ret = tree->ops->readpage_io_hook(page, cur,
2694							  cur + iosize - 1);
2695		}
2696		if (!ret) {
2697			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2698			pnr -= page->index;
2699			ret = submit_extent_page(READ, tree, page,
2700					 sector, disk_io_size, pg_offset,
2701					 bdev, bio, pnr,
2702					 end_bio_extent_readpage, mirror_num,
2703					 *bio_flags,
2704					 this_bio_flag);
2705			BUG_ON(ret == -ENOMEM);
2706			nr++;
2707			*bio_flags = this_bio_flag;
2708		}
2709		if (ret)
2710			SetPageError(page);
2711		cur = cur + iosize;
2712		pg_offset += iosize;
2713	}
2714out:
2715	if (!nr) {
2716		if (!PageError(page))
2717			SetPageUptodate(page);
2718		unlock_page(page);
2719	}
2720	return 0;
2721}
2722
2723int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2724			    get_extent_t *get_extent, int mirror_num)
2725{
2726	struct bio *bio = NULL;
2727	unsigned long bio_flags = 0;
2728	int ret;
2729
2730	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2731				      &bio_flags);
2732	if (bio)
2733		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2734	return ret;
2735}
2736
2737static noinline void update_nr_written(struct page *page,
2738				      struct writeback_control *wbc,
2739				      unsigned long nr_written)
2740{
2741	wbc->nr_to_write -= nr_written;
2742	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2743	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2744		page->mapping->writeback_index = page->index + nr_written;
2745}
2746
2747/*
2748 * the writepage semantics are similar to regular writepage.  extent
2749 * records are inserted to lock ranges in the tree, and as dirty areas
2750 * are found, they are marked writeback.  Then the lock bits are removed
2751 * and the end_io handler clears the writeback ranges
2752 */
2753static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2754			      void *data)
2755{
2756	struct inode *inode = page->mapping->host;
2757	struct extent_page_data *epd = data;
2758	struct extent_io_tree *tree = epd->tree;
2759	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2760	u64 delalloc_start;
2761	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2762	u64 end;
2763	u64 cur = start;
2764	u64 extent_offset;
2765	u64 last_byte = i_size_read(inode);
2766	u64 block_start;
2767	u64 iosize;
2768	sector_t sector;
2769	struct extent_state *cached_state = NULL;
2770	struct extent_map *em;
2771	struct block_device *bdev;
2772	int ret;
2773	int nr = 0;
2774	size_t pg_offset = 0;
2775	size_t blocksize;
2776	loff_t i_size = i_size_read(inode);
2777	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2778	u64 nr_delalloc;
2779	u64 delalloc_end;
2780	int page_started;
2781	int compressed;
2782	int write_flags;
2783	unsigned long nr_written = 0;
2784	bool fill_delalloc = true;
2785
2786	if (wbc->sync_mode == WB_SYNC_ALL)
2787		write_flags = WRITE_SYNC;
2788	else
2789		write_flags = WRITE;
2790
2791	trace___extent_writepage(page, inode, wbc);
2792
2793	WARN_ON(!PageLocked(page));
2794
2795	ClearPageError(page);
2796
2797	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2798	if (page->index > end_index ||
2799	   (page->index == end_index && !pg_offset)) {
2800		page->mapping->a_ops->invalidatepage(page, 0);
2801		unlock_page(page);
2802		return 0;
2803	}
2804
2805	if (page->index == end_index) {
2806		char *userpage;
2807
2808		userpage = kmap_atomic(page);
2809		memset(userpage + pg_offset, 0,
2810		       PAGE_CACHE_SIZE - pg_offset);
2811		kunmap_atomic(userpage);
2812		flush_dcache_page(page);
2813	}
2814	pg_offset = 0;
2815
2816	set_page_extent_mapped(page);
2817
2818	if (!tree->ops || !tree->ops->fill_delalloc)
2819		fill_delalloc = false;
2820
2821	delalloc_start = start;
2822	delalloc_end = 0;
2823	page_started = 0;
2824	if (!epd->extent_locked && fill_delalloc) {
2825		u64 delalloc_to_write = 0;
2826		/*
2827		 * make sure the wbc mapping index is at least updated
2828		 * to this page.
2829		 */
2830		update_nr_written(page, wbc, 0);
2831
2832		while (delalloc_end < page_end) {
2833			nr_delalloc = find_lock_delalloc_range(inode, tree,
2834						       page,
2835						       &delalloc_start,
2836						       &delalloc_end,
2837						       128 * 1024 * 1024);
2838			if (nr_delalloc == 0) {
2839				delalloc_start = delalloc_end + 1;
2840				continue;
2841			}
2842			ret = tree->ops->fill_delalloc(inode, page,
2843						       delalloc_start,
2844						       delalloc_end,
2845						       &page_started,
2846						       &nr_written);
2847			/* File system has been set read-only */
2848			if (ret) {
2849				SetPageError(page);
2850				goto done;
2851			}
2852			/*
2853			 * delalloc_end is already one less than the total
2854			 * length, so we don't subtract one from
2855			 * PAGE_CACHE_SIZE
2856			 */
2857			delalloc_to_write += (delalloc_end - delalloc_start +
2858					      PAGE_CACHE_SIZE) >>
2859					      PAGE_CACHE_SHIFT;
2860			delalloc_start = delalloc_end + 1;
2861		}
2862		if (wbc->nr_to_write < delalloc_to_write) {
2863			int thresh = 8192;
2864
2865			if (delalloc_to_write < thresh * 2)
2866				thresh = delalloc_to_write;
2867			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2868						 thresh);
2869		}
2870
2871		/* did the fill delalloc function already unlock and start
2872		 * the IO?
2873		 */
2874		if (page_started) {
2875			ret = 0;
2876			/*
2877			 * we've unlocked the page, so we can't update
2878			 * the mapping's writeback index, just update
2879			 * nr_to_write.
2880			 */
2881			wbc->nr_to_write -= nr_written;
2882			goto done_unlocked;
2883		}
2884	}
2885	if (tree->ops && tree->ops->writepage_start_hook) {
2886		ret = tree->ops->writepage_start_hook(page, start,
2887						      page_end);
2888		if (ret) {
2889			/* Fixup worker will requeue */
2890			if (ret == -EBUSY)
2891				wbc->pages_skipped++;
2892			else
2893				redirty_page_for_writepage(wbc, page);
2894			update_nr_written(page, wbc, nr_written);
2895			unlock_page(page);
2896			ret = 0;
2897			goto done_unlocked;
2898		}
2899	}
2900
2901	/*
2902	 * we don't want to touch the inode after unlocking the page,
2903	 * so we update the mapping writeback index now
2904	 */
2905	update_nr_written(page, wbc, nr_written + 1);
2906
2907	end = page_end;
2908	if (last_byte <= start) {
2909		if (tree->ops && tree->ops->writepage_end_io_hook)
2910			tree->ops->writepage_end_io_hook(page, start,
2911							 page_end, NULL, 1);
2912		goto done;
2913	}
2914
2915	blocksize = inode->i_sb->s_blocksize;
2916
2917	while (cur <= end) {
2918		if (cur >= last_byte) {
2919			if (tree->ops && tree->ops->writepage_end_io_hook)
2920				tree->ops->writepage_end_io_hook(page, cur,
2921							 page_end, NULL, 1);
2922			break;
2923		}
2924		em = epd->get_extent(inode, page, pg_offset, cur,
2925				     end - cur + 1, 1);
2926		if (IS_ERR_OR_NULL(em)) {
2927			SetPageError(page);
2928			break;
2929		}
2930
2931		extent_offset = cur - em->start;
2932		BUG_ON(extent_map_end(em) <= cur);
2933		BUG_ON(end < cur);
2934		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2935		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2936		sector = (em->block_start + extent_offset) >> 9;
2937		bdev = em->bdev;
2938		block_start = em->block_start;
2939		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2940		free_extent_map(em);
2941		em = NULL;
2942
2943		/*
2944		 * compressed and inline extents are written through other
2945		 * paths in the FS
2946		 */
2947		if (compressed || block_start == EXTENT_MAP_HOLE ||
2948		    block_start == EXTENT_MAP_INLINE) {
2949			/*
2950			 * end_io notification does not happen here for
2951			 * compressed extents
2952			 */
2953			if (!compressed && tree->ops &&
2954			    tree->ops->writepage_end_io_hook)
2955				tree->ops->writepage_end_io_hook(page, cur,
2956							 cur + iosize - 1,
2957							 NULL, 1);
2958			else if (compressed) {
2959				/* we don't want to end_page_writeback on
2960				 * a compressed extent.  this happens
2961				 * elsewhere
2962				 */
2963				nr++;
2964			}
2965
2966			cur += iosize;
2967			pg_offset += iosize;
2968			continue;
2969		}
2970		/* leave this out until we have a page_mkwrite call */
2971		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2972				   EXTENT_DIRTY, 0, NULL)) {
2973			cur = cur + iosize;
2974			pg_offset += iosize;
2975			continue;
2976		}
2977
2978		if (tree->ops && tree->ops->writepage_io_hook) {
2979			ret = tree->ops->writepage_io_hook(page, cur,
2980						cur + iosize - 1);
2981		} else {
2982			ret = 0;
2983		}
2984		if (ret) {
2985			SetPageError(page);
2986		} else {
2987			unsigned long max_nr = end_index + 1;
2988
2989			set_range_writeback(tree, cur, cur + iosize - 1);
2990			if (!PageWriteback(page)) {
2991				printk(KERN_ERR "btrfs warning page %lu not "
2992				       "writeback, cur %llu end %llu\n",
2993				       page->index, (unsigned long long)cur,
2994				       (unsigned long long)end);
2995			}
2996
2997			ret = submit_extent_page(write_flags, tree, page,
2998						 sector, iosize, pg_offset,
2999						 bdev, &epd->bio, max_nr,
3000						 end_bio_extent_writepage,
3001						 0, 0, 0);
3002			if (ret)
3003				SetPageError(page);
3004		}
3005		cur = cur + iosize;
3006		pg_offset += iosize;
3007		nr++;
3008	}
3009done:
3010	if (nr == 0) {
3011		/* make sure the mapping tag for page dirty gets cleared */
3012		set_page_writeback(page);
3013		end_page_writeback(page);
3014	}
3015	unlock_page(page);
3016
3017done_unlocked:
3018
3019	/* drop our reference on any cached states */
3020	free_extent_state(cached_state);
3021	return 0;
3022}
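/*
 * in short, __extent_writepage() runs in two phases: fill_delalloc() turns
 * the delalloc ranges touching this page into allocated extents (and may
 * start the IO itself, in which case we bail out via done_unlocked), then
 * the main loop maps each block with get_extent() and streams it into
 * epd->bio through submit_extent_page(), skipping holes and the inline or
 * compressed extents that are written through other paths.
 */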
3023
3024static int eb_wait(void *word)
3025{
3026	io_schedule();
3027	return 0;
3028}
3029
3030static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3031{
3032	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3033		    TASK_UNINTERRUPTIBLE);
3034}
3035
3036static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3037				     struct btrfs_fs_info *fs_info,
3038				     struct extent_page_data *epd)
3039{
3040	unsigned long i, num_pages;
3041	int flush = 0;
3042	int ret = 0;
3043
3044	if (!btrfs_try_tree_write_lock(eb)) {
3045		flush = 1;
3046		flush_write_bio(epd);
3047		btrfs_tree_lock(eb);
3048	}
3049
3050	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3051		btrfs_tree_unlock(eb);
3052		if (!epd->sync_io)
3053			return 0;
3054		if (!flush) {
3055			flush_write_bio(epd);
3056			flush = 1;
3057		}
3058		while (1) {
3059			wait_on_extent_buffer_writeback(eb);
3060			btrfs_tree_lock(eb);
3061			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3062				break;
3063			btrfs_tree_unlock(eb);
3064		}
3065	}
3066
3067	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3068		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3069		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3070		spin_lock(&fs_info->delalloc_lock);
3071		if (fs_info->dirty_metadata_bytes >= eb->len)
3072			fs_info->dirty_metadata_bytes -= eb->len;
3073		else
3074			WARN_ON(1);
3075		spin_unlock(&fs_info->delalloc_lock);
3076		ret = 1;
3077	}
3078
3079	btrfs_tree_unlock(eb);
3080
3081	if (!ret)
3082		return ret;
3083
3084	num_pages = num_extent_pages(eb->start, eb->len);
3085	for (i = 0; i < num_pages; i++) {
3086		struct page *p = extent_buffer_page(eb, i);
3087
3088		if (!trylock_page(p)) {
3089			if (!flush) {
3090				flush_write_bio(epd);
3091				flush = 1;
3092			}
3093			lock_page(p);
3094		}
3095	}
3096
3097	return ret;
3098}
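/*
 * returns 1 when the buffer was dirty and is now flagged
 * EXTENT_BUFFER_WRITEBACK with all of its pages locked for IO, 0 when there
 * is nothing to write (the buffer was clean, or it is already under
 * writeback and this is not a sync_io pass).  epd->bio is flushed whenever
 * we would otherwise block on a lock or on writeback.
 */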
3099
3100static void end_extent_buffer_writeback(struct extent_buffer *eb)
3101{
3102	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3103	smp_mb__after_clear_bit();
3104	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3105}
3106
3107static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3108{
3109	int uptodate = err == 0;
3110	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3111	struct extent_buffer *eb;
3112	int done;
3113
3114	do {
3115		struct page *page = bvec->bv_page;
3116
3117		bvec--;
3118		eb = (struct extent_buffer *)page->private;
3119		BUG_ON(!eb);
3120		done = atomic_dec_and_test(&eb->io_pages);
3121
3122		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3123			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3124			ClearPageUptodate(page);
3125			SetPageError(page);
3126		}
3127
3128		end_page_writeback(page);
3129
3130		if (!done)
3131			continue;
3132
3133		end_extent_buffer_writeback(eb);
3134	} while (bvec >= bio->bi_io_vec);
3135
3136	bio_put(bio);
3137
3138}
3139
3140static int write_one_eb(struct extent_buffer *eb,
3141			struct btrfs_fs_info *fs_info,
3142			struct writeback_control *wbc,
3143			struct extent_page_data *epd)
3144{
3145	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3146	u64 offset = eb->start;
3147	unsigned long i, num_pages;
3148	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3149	int ret = 0;
3150
3151	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3152	num_pages = num_extent_pages(eb->start, eb->len);
3153	atomic_set(&eb->io_pages, num_pages);
3154	for (i = 0; i < num_pages; i++) {
3155		struct page *p = extent_buffer_page(eb, i);
3156
3157		clear_page_dirty_for_io(p);
3158		set_page_writeback(p);
3159		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3160					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3161					 -1, end_bio_extent_buffer_writepage,
3162					 0, 0, 0);
3163		if (ret) {
3164			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3165			SetPageError(p);
3166			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3167				end_extent_buffer_writeback(eb);
3168			ret = -EIO;
3169			break;
3170		}
3171		offset += PAGE_CACHE_SIZE;
3172		update_nr_written(p, wbc, 1);
3173		unlock_page(p);
3174	}
3175
3176	if (unlikely(ret)) {
3177		for (; i < num_pages; i++) {
3178			struct page *p = extent_buffer_page(eb, i);
3179			unlock_page(p);
3180		}
3181	}
3182
3183	return ret;
3184}
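/*
 * eb->io_pages is primed with the page count; every completed page
 * decrements it in end_bio_extent_buffer_writepage(), and on a submit
 * failure the pages that were never issued are subtracted here so the last
 * completion (or this error path itself) still wakes waiters through
 * end_extent_buffer_writeback().
 */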
3185
3186int btree_write_cache_pages(struct address_space *mapping,
3187				   struct writeback_control *wbc)
3188{
3189	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3190	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3191	struct extent_buffer *eb, *prev_eb = NULL;
3192	struct extent_page_data epd = {
3193		.bio = NULL,
3194		.tree = tree,
3195		.extent_locked = 0,
3196		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3197	};
3198	int ret = 0;
3199	int done = 0;
3200	int nr_to_write_done = 0;
3201	struct pagevec pvec;
3202	int nr_pages;
3203	pgoff_t index;
3204	pgoff_t end;		/* Inclusive */
3205	int scanned = 0;
3206	int tag;
3207
3208	pagevec_init(&pvec, 0);
3209	if (wbc->range_cyclic) {
3210		index = mapping->writeback_index; /* Start from prev offset */
3211		end = -1;
3212	} else {
3213		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3214		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3215		scanned = 1;
3216	}
3217	if (wbc->sync_mode == WB_SYNC_ALL)
3218		tag = PAGECACHE_TAG_TOWRITE;
3219	else
3220		tag = PAGECACHE_TAG_DIRTY;
3221retry:
3222	if (wbc->sync_mode == WB_SYNC_ALL)
3223		tag_pages_for_writeback(mapping, index, end);
3224	while (!done && !nr_to_write_done && (index <= end) &&
3225	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3226			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3227		unsigned i;
3228
3229		scanned = 1;
3230		for (i = 0; i < nr_pages; i++) {
3231			struct page *page = pvec.pages[i];
3232
3233			if (!PagePrivate(page))
3234				continue;
3235
3236			if (!wbc->range_cyclic && page->index > end) {
3237				done = 1;
3238				break;
3239			}
3240
3241			eb = (struct extent_buffer *)page->private;
3242			if (!eb) {
3243				WARN_ON(1);
3244				continue;
3245			}
3246
3247			if (eb == prev_eb)
3248				continue;
3249
3250			if (!atomic_inc_not_zero(&eb->refs)) {
3251				WARN_ON(1);
3252				continue;
3253			}
3254
3255			prev_eb = eb;
3256			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3257			if (!ret) {
3258				free_extent_buffer(eb);
3259				continue;
3260			}
3261
3262			ret = write_one_eb(eb, fs_info, wbc, &epd);
3263			if (ret) {
3264				done = 1;
3265				free_extent_buffer(eb);
3266				break;
3267			}
3268			free_extent_buffer(eb);
3269
3270			/*
3271			 * the filesystem may choose to bump up nr_to_write.
3272			 * We have to make sure to honor the new nr_to_write
3273			 * at any time
3274			 */
3275			nr_to_write_done = wbc->nr_to_write <= 0;
3276		}
3277		pagevec_release(&pvec);
3278		cond_resched();
3279	}
3280	if (!scanned && !done) {
3281		/*
3282		 * We hit the last page and there is more work to be done: wrap
3283		 * back to the start of the file
3284		 */
3285		scanned = 1;
3286		index = 0;
3287		goto retry;
3288	}
3289	flush_write_bio(&epd);
3290	return ret;
3291}
3292
3293/**
3294 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3295 * @mapping: address space structure to write
3296 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3297 * @writepage: function called for each page
3298 * @data: data passed to writepage function
3299 *
3300 * If a page is already under I/O, write_cache_pages() skips it, even
3301 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3302 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3303 * and msync() need to guarantee that all the data which was dirty at the time
3304 * the call was made get new I/O started against them.  If wbc->sync_mode is
3305 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3306 * existing IO to complete.
3307 */
3308static int extent_write_cache_pages(struct extent_io_tree *tree,
3309			     struct address_space *mapping,
3310			     struct writeback_control *wbc,
3311			     writepage_t writepage, void *data,
3312			     void (*flush_fn)(void *))
3313{
3314	struct inode *inode = mapping->host;
3315	int ret = 0;
3316	int done = 0;
3317	int nr_to_write_done = 0;
3318	struct pagevec pvec;
3319	int nr_pages;
3320	pgoff_t index;
3321	pgoff_t end;		/* Inclusive */
3322	int scanned = 0;
3323	int tag;
3324
3325	/*
3326	 * We have to hold onto the inode so that ordered extents can do their
3327	 * work when the IO finishes.  The alternative to this is failing to add
3328	 * an ordered extent if the igrab() fails there and that is a huge pain
3329	 * to deal with, so instead just hold onto the inode throughout the
3330	 * writepages operation.  If it fails here we are freeing up the inode
3331	 * anyway and we'd rather not waste our time writing out stuff that is
3332	 * going to be truncated anyway.
3333	 */
3334	if (!igrab(inode))
3335		return 0;
3336
3337	pagevec_init(&pvec, 0);
3338	if (wbc->range_cyclic) {
3339		index = mapping->writeback_index; /* Start from prev offset */
3340		end = -1;
3341	} else {
3342		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3343		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3344		scanned = 1;
3345	}
3346	if (wbc->sync_mode == WB_SYNC_ALL)
3347		tag = PAGECACHE_TAG_TOWRITE;
3348	else
3349		tag = PAGECACHE_TAG_DIRTY;
3350retry:
3351	if (wbc->sync_mode == WB_SYNC_ALL)
3352		tag_pages_for_writeback(mapping, index, end);
3353	while (!done && !nr_to_write_done && (index <= end) &&
3354	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3355			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3356		unsigned i;
3357
3358		scanned = 1;
3359		for (i = 0; i < nr_pages; i++) {
3360			struct page *page = pvec.pages[i];
3361
3362			/*
3363			 * At this point we hold neither mapping->tree_lock nor
3364			 * lock on the page itself: the page may be truncated or
3365			 * invalidated (changing page->mapping to NULL), or even
3366			 * swizzled back from swapper_space to tmpfs file
3367			 * mapping
3368			 */
3369			if (tree->ops &&
3370			    tree->ops->write_cache_pages_lock_hook) {
3371				tree->ops->write_cache_pages_lock_hook(page,
3372							       data, flush_fn);
3373			} else {
3374				if (!trylock_page(page)) {
3375					flush_fn(data);
3376					lock_page(page);
3377				}
3378			}
3379
3380			if (unlikely(page->mapping != mapping)) {
3381				unlock_page(page);
3382				continue;
3383			}
3384
3385			if (!wbc->range_cyclic && page->index > end) {
3386				done = 1;
3387				unlock_page(page);
3388				continue;
3389			}
3390
3391			if (wbc->sync_mode != WB_SYNC_NONE) {
3392				if (PageWriteback(page))
3393					flush_fn(data);
3394				wait_on_page_writeback(page);
3395			}
3396
3397			if (PageWriteback(page) ||
3398			    !clear_page_dirty_for_io(page)) {
3399				unlock_page(page);
3400				continue;
3401			}
3402
3403			ret = (*writepage)(page, wbc, data);
3404
3405			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3406				unlock_page(page);
3407				ret = 0;
3408			}
3409			if (ret)
3410				done = 1;
3411
3412			/*
3413			 * the filesystem may choose to bump up nr_to_write.
3414			 * We have to make sure to honor the new nr_to_write
3415			 * at any time
3416			 */
3417			nr_to_write_done = wbc->nr_to_write <= 0;
3418		}
3419		pagevec_release(&pvec);
3420		cond_resched();
3421	}
3422	if (!scanned && !done) {
3423		/*
3424		 * We hit the last page and there is more work to be done: wrap
3425		 * back to the start of the file
3426		 */
3427		scanned = 1;
3428		index = 0;
3429		goto retry;
3430	}
3431	btrfs_add_delayed_iput(inode);
3432	return ret;
3433}
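/*
 * both write_cache_pages loops above follow the same pattern: grab a pagevec
 * of pages tagged TOWRITE (for WB_SYNC_ALL) or DIRTY, write each one, stop
 * once wbc->nr_to_write is used up, and for range_cyclic callers wrap around
 * to index 0 once so pages dirtied behind writeback_index are not missed.
 */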
3434
3435static void flush_epd_write_bio(struct extent_page_data *epd)
3436{
3437	if (epd->bio) {
3438		int rw = WRITE;
3439		int ret;
3440
3441		if (epd->sync_io)
3442			rw = WRITE_SYNC;
3443
3444		ret = submit_one_bio(rw, epd->bio, 0, 0);
3445		BUG_ON(ret < 0); /* -ENOMEM */
3446		epd->bio = NULL;
3447	}
3448}
3449
3450static noinline void flush_write_bio(void *data)
3451{
3452	struct extent_page_data *epd = data;
3453	flush_epd_write_bio(epd);
3454}
3455
3456int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3457			  get_extent_t *get_extent,
3458			  struct writeback_control *wbc)
3459{
3460	int ret;
3461	struct extent_page_data epd = {
3462		.bio = NULL,
3463		.tree = tree,
3464		.get_extent = get_extent,
3465		.extent_locked = 0,
3466		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3467	};
3468
3469	ret = __extent_writepage(page, wbc, &epd);
3470
3471	flush_epd_write_bio(&epd);
3472	return ret;
3473}
3474
3475int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3476			      u64 start, u64 end, get_extent_t *get_extent,
3477			      int mode)
3478{
3479	int ret = 0;
3480	struct address_space *mapping = inode->i_mapping;
3481	struct page *page;
3482	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3483		PAGE_CACHE_SHIFT;
3484
3485	struct extent_page_data epd = {
3486		.bio = NULL,
3487		.tree = tree,
3488		.get_extent = get_extent,
3489		.extent_locked = 1,
3490		.sync_io = mode == WB_SYNC_ALL,
3491	};
3492	struct writeback_control wbc_writepages = {
3493		.sync_mode	= mode,
3494		.nr_to_write	= nr_pages * 2,
3495		.range_start	= start,
3496		.range_end	= end + 1,
3497	};
3498
3499	while (start <= end) {
3500		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3501		if (clear_page_dirty_for_io(page))
3502			ret = __extent_writepage(page, &wbc_writepages, &epd);
3503		else {
3504			if (tree->ops && tree->ops->writepage_end_io_hook)
3505				tree->ops->writepage_end_io_hook(page, start,
3506						 start + PAGE_CACHE_SIZE - 1,
3507						 NULL, 1);
3508			unlock_page(page);
3509		}
3510		page_cache_release(page);
3511		start += PAGE_CACHE_SIZE;
3512	}
3513
3514	flush_epd_write_bio(&epd);
3515	return ret;
3516}
3517
3518int extent_writepages(struct extent_io_tree *tree,
3519		      struct address_space *mapping,
3520		      get_extent_t *get_extent,
3521		      struct writeback_control *wbc)
3522{
3523	int ret = 0;
3524	struct extent_page_data epd = {
3525		.bio = NULL,
3526		.tree = tree,
3527		.get_extent = get_extent,
3528		.extent_locked = 0,
3529		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3530	};
3531
3532	ret = extent_write_cache_pages(tree, mapping, wbc,
3533				       __extent_writepage, &epd,
3534				       flush_write_bio);
3535	flush_epd_write_bio(&epd);
3536	return ret;
3537}
3538
3539int extent_readpages(struct extent_io_tree *tree,
3540		     struct address_space *mapping,
3541		     struct list_head *pages, unsigned nr_pages,
3542		     get_extent_t get_extent)
3543{
3544	struct bio *bio = NULL;
3545	unsigned page_idx;
3546	unsigned long bio_flags = 0;
3547
3548	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3549		struct page *page = list_entry(pages->prev, struct page, lru);
3550
3551		prefetchw(&page->flags);
3552		list_del(&page->lru);
3553		if (!add_to_page_cache_lru(page, mapping,
3554					page->index, GFP_NOFS)) {
3555			__extent_read_full_page(tree, page, get_extent,
3556						&bio, 0, &bio_flags);
3557		}
3558		page_cache_release(page);
3559	}
3560	BUG_ON(!list_empty(pages));
3561	if (bio)
3562		return submit_one_bio(READ, bio, 0, bio_flags);
3563	return 0;
3564}
3565
3566/*
3567 * basic invalidatepage code, this waits on any locked or writeback
3568 * ranges corresponding to the page, and then deletes any extent state
3569 * records from the tree
3570 */
3571int extent_invalidatepage(struct extent_io_tree *tree,
3572			  struct page *page, unsigned long offset)
3573{
3574	struct extent_state *cached_state = NULL;
3575	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3576	u64 end = start + PAGE_CACHE_SIZE - 1;
3577	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3578
3579	start += (offset + blocksize - 1) & ~(blocksize - 1);
3580	if (start > end)
3581		return 0;
3582
3583	lock_extent_bits(tree, start, end, 0, &cached_state);
3584	wait_on_page_writeback(page);
3585	clear_extent_bit(tree, start, end,
3586			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3587			 EXTENT_DO_ACCOUNTING,
3588			 1, 1, &cached_state, GFP_NOFS);
3589	return 0;
3590}
3591
3592/*
3593 * a helper for releasepage; this tests for areas of the page that
3594 * are locked or under IO and drops the related state bits if it is safe
3595 * to drop the page.
3596 */
3597int try_release_extent_state(struct extent_map_tree *map,
3598			     struct extent_io_tree *tree, struct page *page,
3599			     gfp_t mask)
3600{
3601	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3602	u64 end = start + PAGE_CACHE_SIZE - 1;
3603	int ret = 1;
3604
3605	if (test_range_bit(tree, start, end,
3606			   EXTENT_IOBITS, 0, NULL))
3607		ret = 0;
3608	else {
3609		if ((mask & GFP_NOFS) == GFP_NOFS)
3610			mask = GFP_NOFS;
3611		/*
3612		 * at this point we can safely clear everything except the
3613		 * locked bit and the nodatasum bit
3614		 */
3615		ret = clear_extent_bit(tree, start, end,
3616				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3617				 0, 0, NULL, mask);
3618
3619		/* if clear_extent_bit failed for enomem reasons,
3620		 * we can't allow the release to continue.
3621		 */
3622		if (ret < 0)
3623			ret = 0;
3624		else
3625			ret = 1;
3626	}
3627	return ret;
3628}
3629
3630/*
3631 * a helper for releasepage.  As long as there are no locked extents
3632 * in the range corresponding to the page, both state records and extent
3633 * map records are removed
3634 */
3635int try_release_extent_mapping(struct extent_map_tree *map,
3636			       struct extent_io_tree *tree, struct page *page,
3637			       gfp_t mask)
3638{
3639	struct extent_map *em;
3640	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3641	u64 end = start + PAGE_CACHE_SIZE - 1;
3642
3643	if ((mask & __GFP_WAIT) &&
3644	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3645		u64 len;
3646		while (start <= end) {
3647			len = end - start + 1;
3648			write_lock(&map->lock);
3649			em = lookup_extent_mapping(map, start, len);
3650			if (!em) {
3651				write_unlock(&map->lock);
3652				break;
3653			}
3654			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3655			    em->start != start) {
3656				write_unlock(&map->lock);
3657				free_extent_map(em);
3658				break;
3659			}
3660			if (!test_range_bit(tree, em->start,
3661					    extent_map_end(em) - 1,
3662					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3663					    0, NULL)) {
3664				remove_extent_mapping(map, em);
3665				/* once for the rb tree */
3666				free_extent_map(em);
3667			}
3668			start = extent_map_end(em);
3669			write_unlock(&map->lock);
3670
3671			/* once for us */
3672			free_extent_map(em);
3673		}
3674	}
3675	return try_release_extent_state(map, tree, page, mask);
3676}
3677
3678/*
3679 * helper function for fiemap, which doesn't want to see any holes.
3680 * This maps until we find something past 'last'
3681 */
3682static struct extent_map *get_extent_skip_holes(struct inode *inode,
3683						u64 offset,
3684						u64 last,
3685						get_extent_t *get_extent)
3686{
3687	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3688	struct extent_map *em;
3689	u64 len;
3690
3691	if (offset >= last)
3692		return NULL;
3693
3694	while (1) {
3695		len = last - offset;
3696		if (len == 0)
3697			break;
3698		len = (len + sectorsize - 1) & ~(sectorsize - 1);
3699		em = get_extent(inode, NULL, 0, offset, len, 0);
3700		if (IS_ERR_OR_NULL(em))
3701			return em;
3702
3703		/* if this isn't a hole return it */
3704		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3705		    em->block_start != EXTENT_MAP_HOLE) {
3706			return em;
3707		}
3708
3709		/* this is a hole, advance to the next extent */
3710		offset = extent_map_end(em);
3711		free_extent_map(em);
3712		if (offset >= last)
3713			break;
3714	}
3715	return NULL;
3716}
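/*
 * the helper keeps calling get_extent() with a sector-aligned length and
 * discards anything vacant or mapping to EXTENT_MAP_HOLE, so extent_fiemap()
 * below only ever sees real extents (or NULL once we are past 'last').
 */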
3717
3718int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3719		__u64 start, __u64 len, get_extent_t *get_extent)
3720{
3721	int ret = 0;
3722	u64 off = start;
3723	u64 max = start + len;
3724	u32 flags = 0;
3725	u32 found_type;
3726	u64 last;
3727	u64 last_for_get_extent = 0;
3728	u64 disko = 0;
3729	u64 isize = i_size_read(inode);
3730	struct btrfs_key found_key;
3731	struct extent_map *em = NULL;
3732	struct extent_state *cached_state = NULL;
3733	struct btrfs_path *path;
3734	struct btrfs_file_extent_item *item;
3735	int end = 0;
3736	u64 em_start = 0;
3737	u64 em_len = 0;
3738	u64 em_end = 0;
3739	unsigned long emflags;
3740
3741	if (len == 0)
3742		return -EINVAL;
3743
3744	path = btrfs_alloc_path();
3745	if (!path)
3746		return -ENOMEM;
3747	path->leave_spinning = 1;
3748
3749	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3750	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3751
3752	/*
3753	 * lookup the last file extent.  We're not using i_size here
3754	 * because there might be preallocation past i_size
3755	 */
3756	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3757				       path, btrfs_ino(inode), -1, 0);
3758	if (ret < 0) {
3759		btrfs_free_path(path);
3760		return ret;
3761	}
3762	WARN_ON(!ret);
3763	path->slots[0]--;
3764	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3765			      struct btrfs_file_extent_item);
3766	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3767	found_type = btrfs_key_type(&found_key);
3768
3769	/* No extents, but there might be delalloc bits */
3770	if (found_key.objectid != btrfs_ino(inode) ||
3771	    found_type != BTRFS_EXTENT_DATA_KEY) {
3772		/* have to trust i_size as the end */
3773		last = (u64)-1;
3774		last_for_get_extent = isize;
3775	} else {
3776		/*
3777		 * remember the start of the last extent.  There are a
3778		 * bunch of different factors that go into the length of the
3779 * extent, so it's much less complex to remember where it started
3780		 */
3781		last = found_key.offset;
3782		last_for_get_extent = last + 1;
3783	}
3784	btrfs_free_path(path);
3785
3786	/*
3787	 * we might have some extents allocated but more delalloc past those
3788	 * extents.  so, we trust isize unless the start of the last extent is
3789	 * beyond isize
3790	 */
3791	if (last < isize) {
3792		last = (u64)-1;
3793		last_for_get_extent = isize;
3794	}
3795
3796	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3797			 &cached_state);
3798
3799	em = get_extent_skip_holes(inode, start, last_for_get_extent,
3800				   get_extent);
3801	if (!em)
3802		goto out;
3803	if (IS_ERR(em)) {
3804		ret = PTR_ERR(em);
3805		goto out;
3806	}
3807
3808	while (!end) {
3809		u64 offset_in_extent;
3810
3811		/* break if the extent we found is outside the range */
3812		if (em->start >= max || extent_map_end(em) < off)
3813			break;
3814
3815		/*
3816		 * get_extent may return an extent that starts before our
3817		 * requested range.  We have to make sure the ranges
3818		 * we return to fiemap always move forward and don't
3819		 * overlap, so adjust the offsets here
3820		 */
3821		em_start = max(em->start, off);
3822
3823		/*
3824		 * record the offset from the start of the extent
3825		 * for adjusting the disk offset below
3826		 */
3827		offset_in_extent = em_start - em->start;
3828		em_end = extent_map_end(em);
3829		em_len = em_end - em_start;
3830		emflags = em->flags;
3831		disko = 0;
3832		flags = 0;
3833
3834		/*
3835		 * bump off for our next call to get_extent
3836		 */
3837		off = extent_map_end(em);
3838		if (off >= max)
3839			end = 1;
3840
3841		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3842			end = 1;
3843			flags |= FIEMAP_EXTENT_LAST;
3844		} else if (em->block_start == EXTENT_MAP_INLINE) {
3845			flags |= (FIEMAP_EXTENT_DATA_INLINE |
3846				  FIEMAP_EXTENT_NOT_ALIGNED);
3847		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
3848			flags |= (FIEMAP_EXTENT_DELALLOC |
3849				  FIEMAP_EXTENT_UNKNOWN);
3850		} else {
3851			disko = em->block_start + offset_in_extent;
3852		}
3853		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3854			flags |= FIEMAP_EXTENT_ENCODED;
3855
3856		free_extent_map(em);
3857		em = NULL;
3858		if ((em_start >= last) || em_len == (u64)-1 ||
3859		   (last == (u64)-1 && isize <= em_end)) {
3860			flags |= FIEMAP_EXTENT_LAST;
3861			end = 1;
3862		}
3863
3864		/* now scan forward to see if this is really the last extent. */
3865		em = get_extent_skip_holes(inode, off, last_for_get_extent,
3866					   get_extent);
3867		if (IS_ERR(em)) {
3868			ret = PTR_ERR(em);
3869			goto out;
3870		}
3871		if (!em) {
3872			flags |= FIEMAP_EXTENT_LAST;
3873			end = 1;
3874		}
3875		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3876					      em_len, flags);
3877		if (ret)
3878			goto out_free;
3879	}
3880out_free:
3881	free_extent_map(em);
3882out:
3883	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3884			     &cached_state, GFP_NOFS);
3885	return ret;
3886}
3887
3888inline struct page *extent_buffer_page(struct extent_buffer *eb,
3889					      unsigned long i)
3890{
3891	return eb->pages[i];
3892}
3893
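/*
 * Number of PAGE_CACHE_SIZE pages needed to back an extent buffer that
 * begins at @start and is @len bytes long.  The offset of @start within
 * its first page is taken into account, so a short buffer that straddles
 * a page boundary still counts both pages: with 4K pages, start 4094 and
 * len 8 gives two pages.
 */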
3894inline unsigned long num_extent_pages(u64 start, u64 len)
3895{
3896	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3897		(start >> PAGE_CACHE_SHIFT);
3898}
3899
3900static void __free_extent_buffer(struct extent_buffer *eb)
3901{
3902#if LEAK_DEBUG
3903	unsigned long flags;
3904	spin_lock_irqsave(&leak_lock, flags);
3905	list_del(&eb->leak_list);
3906	spin_unlock_irqrestore(&leak_lock, flags);
3907#endif
3908	if (eb->pages && eb->pages != eb->inline_pages)
3909		kfree(eb->pages);
3910	kmem_cache_free(extent_buffer_cache, eb);
3911}
3912
3913static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3914						   u64 start,
3915						   unsigned long len,
3916						   gfp_t mask)
3917{
3918	struct extent_buffer *eb = NULL;
3919#if LEAK_DEBUG
3920	unsigned long flags;
3921#endif
3922
3923	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3924	if (eb == NULL)
3925		return NULL;
3926	eb->start = start;
3927	eb->len = len;
3928	eb->tree = tree;
3929	eb->bflags = 0;
3930	rwlock_init(&eb->lock);
3931	atomic_set(&eb->write_locks, 0);
3932	atomic_set(&eb->read_locks, 0);
3933	atomic_set(&eb->blocking_readers, 0);
3934	atomic_set(&eb->blocking_writers, 0);
3935	atomic_set(&eb->spinning_readers, 0);
3936	atomic_set(&eb->spinning_writers, 0);
3937	eb->lock_nested = 0;
3938	init_waitqueue_head(&eb->write_lock_wq);
3939	init_waitqueue_head(&eb->read_lock_wq);
3940
3941#if LEAK_DEBUG
3942	spin_lock_irqsave(&leak_lock, flags);
3943	list_add(&eb->leak_list, &buffers);
3944	spin_unlock_irqrestore(&leak_lock, flags);
3945#endif
3946	spin_lock_init(&eb->refs_lock);
3947	atomic_set(&eb->refs, 1);
3948	atomic_set(&eb->io_pages, 0);
3949
3950	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
3951		struct page **pages;
3952		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
3953			PAGE_CACHE_SHIFT;
3954		pages = kzalloc(num_pages, mask);
3955		if (!pages) {
3956			__free_extent_buffer(eb);
3957			return NULL;
3958		}
3959		eb->pages = pages;
3960	} else {
3961		eb->pages = eb->inline_pages;
3962	}
3963
3964	return eb;
3965}
3966
3967struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3968{
3969	unsigned long i;
3970	struct page *p;
3971	struct extent_buffer *new;
3972	unsigned long num_pages = num_extent_pages(src->start, src->len);
3973
3974	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
3975	if (new == NULL)
3976		return NULL;
3977
3978	for (i = 0; i < num_pages; i++) {
3979		p = alloc_page(GFP_ATOMIC);
3980		BUG_ON(!p);
3981		attach_extent_buffer_page(new, p);
3982		WARN_ON(PageDirty(p));
3983		SetPageUptodate(p);
3984		new->pages[i] = p;
3985	}
3986
3987	copy_extent_buffer(new, src, 0, 0, src->len);
3988	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
3989	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
3990
3991	return new;
3992}
3993
3994struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3995{
3996	struct extent_buffer *eb;
3997	unsigned long num_pages = num_extent_pages(0, len);
3998	unsigned long i;
3999
4000	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4001	if (!eb)
4002		return NULL;
4003
4004	for (i = 0; i < num_pages; i++) {
4005		eb->pages[i] = alloc_page(GFP_ATOMIC);
4006		if (!eb->pages[i])
4007			goto err;
4008	}
4009	set_extent_buffer_uptodate(eb);
4010	btrfs_set_header_nritems(eb, 0);
4011	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4012
4013	return eb;
4014err:
4015	for (i--; i > 0; i--)
4016		__free_page(eb->pages[i]);
4017	__free_extent_buffer(eb);
4018	return NULL;
4019}
4020
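/*
 * Returns non-zero while any of the buffer's pages are still under I/O
 * (eb->io_pages) or while the DIRTY or WRITEBACK flags are set; the
 * release paths below use it to check that the pages are safe to drop.
 */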
4021static int extent_buffer_under_io(struct extent_buffer *eb)
4022{
4023	return (atomic_read(&eb->io_pages) ||
4024		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4025		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4026}
4027
4028/*
4029 * Helper for releasing an extent buffer's pages, starting at @start_idx.
4030 */
4031static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4032						unsigned long start_idx)
4033{
4034	unsigned long index;
4035	unsigned long num_pages;
4036	struct page *page;
4037	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4038
4039	BUG_ON(extent_buffer_under_io(eb));
4040
4041	num_pages = num_extent_pages(eb->start, eb->len);
4042	index = start_idx + num_pages;
4043	if (start_idx >= index)
4044		return;
4045
4046	do {
4047		index--;
4048		page = extent_buffer_page(eb, index);
4049		if (page && mapped) {
4050			spin_lock(&page->mapping->private_lock);
4051			/*
4052			 * We do this since we'll remove the pages after we've
4053			 * removed the eb from the radix tree, so we could race
4054			 * and have this page now attached to the new eb.  So
4055			 * only clear page_private if it's still connected to
4056			 * this eb.
4057			 */
4058			if (PagePrivate(page) &&
4059			    page->private == (unsigned long)eb) {
4060				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4061				BUG_ON(PageDirty(page));
4062				BUG_ON(PageWriteback(page));
4063				/*
4064				 * We need to make sure we haven't been attached
4065				 * to a new eb.
4066				 */
4067				ClearPagePrivate(page);
4068				set_page_private(page, 0);
4069				/* One for the page private */
4070				page_cache_release(page);
4071			}
4072			spin_unlock(&page->mapping->private_lock);
4073
4074		}
4075		if (page) {
4076			/* One for when we alloced the page */
4077			page_cache_release(page);
4078		}
4079	} while (index != start_idx);
4080}
4081
4082/*
4083 * Helper for releasing the extent buffer.
4084 */
4085static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4086{
4087	btrfs_release_extent_buffer_page(eb, 0);
4088	__free_extent_buffer(eb);
4089}
4090
4091static void check_buffer_tree_ref(struct extent_buffer *eb)
4092{
4093	/* the ref bit is tricky.  We have to make sure it is set
4094	 * if we have the buffer dirty.   Otherwise the
4095	 * code to free a buffer can end up dropping a dirty
4096	 * page
4097	 *
4098	 * Once the ref bit is set, it won't go away while the
4099	 * buffer is dirty or in writeback, and it also won't
4100	 * go away while we have the reference count on the
4101	 * eb bumped.
4102	 *
4103	 * We can't just set the ref bit without bumping the
4104	 * ref on the eb because free_extent_buffer might
4105	 * see the ref bit and try to clear it.  If this happens
4106	 * free_extent_buffer might end up dropping our original
4107	 * ref by mistake and freeing the page before we are able
4108	 * to add one more ref.
4109	 *
4110	 * So bump the ref count first, then set the bit.  If someone
4111	 * beat us to it, drop the ref we added.
4112	 */
4113	if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4114		atomic_inc(&eb->refs);
4115		if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4116			atomic_dec(&eb->refs);
4117	}
4118}
4119
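/*
 * Bump the tree ref if needed (check_buffer_tree_ref()) and mark every
 * backing page accessed so page reclaim treats the buffer's pages as
 * recently used.
 */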
4120static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4121{
4122	unsigned long num_pages, i;
4123
4124	check_buffer_tree_ref(eb);
4125
4126	num_pages = num_extent_pages(eb->start, eb->len);
4127	for (i = 0; i < num_pages; i++) {
4128		struct page *p = extent_buffer_page(eb, i);
4129		mark_page_accessed(p);
4130	}
4131}
4132
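/*
 * Find the extent buffer covering @start in the tree's radix tree and
 * return it with an extra reference, or allocate a new one: grab and
 * lock each backing page, attach the buffer to the pages, and insert it
 * into the radix tree.  If another thread wins the race (-EEXIST from
 * radix_tree_insert, or an eb already attached to a page), that existing
 * buffer is returned instead and the new allocation is torn down via the
 * free_eb path.
 */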
4133struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4134					  u64 start, unsigned long len)
4135{
4136	unsigned long num_pages = num_extent_pages(start, len);
4137	unsigned long i;
4138	unsigned long index = start >> PAGE_CACHE_SHIFT;
4139	struct extent_buffer *eb;
4140	struct extent_buffer *exists = NULL;
4141	struct page *p;
4142	struct address_space *mapping = tree->mapping;
4143	int uptodate = 1;
4144	int ret;
4145
4146	rcu_read_lock();
4147	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4148	if (eb && atomic_inc_not_zero(&eb->refs)) {
4149		rcu_read_unlock();
4150		mark_extent_buffer_accessed(eb);
4151		return eb;
4152	}
4153	rcu_read_unlock();
4154
4155	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4156	if (!eb)
4157		return NULL;
4158
4159	for (i = 0; i < num_pages; i++, index++) {
4160		p = find_or_create_page(mapping, index, GFP_NOFS);
4161		if (!p) {
4162			WARN_ON(1);
4163			goto free_eb;
4164		}
4165
4166		spin_lock(&mapping->private_lock);
4167		if (PagePrivate(p)) {
4168			/*
4169			 * We could have already allocated an eb for this page
4170			 * and attached one, so let's see if we can get a ref on
4171			 * the existing eb.  If we can, we know it's good and we
4172			 * can just return that one; otherwise we know we can
4173			 * just overwrite page->private.
4174			 */
4175			exists = (struct extent_buffer *)p->private;
4176			if (atomic_inc_not_zero(&exists->refs)) {
4177				spin_unlock(&mapping->private_lock);
4178				unlock_page(p);
4179				page_cache_release(p);
4180				mark_extent_buffer_accessed(exists);
4181				goto free_eb;
4182			}
4183
4184			/*
4185			 * Do this so attach doesn't complain and we need to
4186			 * drop the ref the old guy had.
4187			 */
4188			ClearPagePrivate(p);
4189			WARN_ON(PageDirty(p));
4190			page_cache_release(p);
4191		}
4192		attach_extent_buffer_page(eb, p);
4193		spin_unlock(&mapping->private_lock);
4194		WARN_ON(PageDirty(p));
4195		mark_page_accessed(p);
4196		eb->pages[i] = p;
4197		if (!PageUptodate(p))
4198			uptodate = 0;
4199
4200		/*
4201		 * see below about how we avoid a nasty race with release page
4202		 * and why we unlock later
4203		 */
4204	}
4205	if (uptodate)
4206		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4207again:
4208	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4209	if (ret)
4210		goto free_eb;
4211
4212	spin_lock(&tree->buffer_lock);
4213	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4214	if (ret == -EEXIST) {
4215		exists = radix_tree_lookup(&tree->buffer,
4216						start >> PAGE_CACHE_SHIFT);
4217		if (!atomic_inc_not_zero(&exists->refs)) {
4218			spin_unlock(&tree->buffer_lock);
4219			radix_tree_preload_end();
4220			exists = NULL;
4221			goto again;
4222		}
4223		spin_unlock(&tree->buffer_lock);
4224		radix_tree_preload_end();
4225		mark_extent_buffer_accessed(exists);
4226		goto free_eb;
4227	}
4228	/* add one reference for the tree */
4229	spin_lock(&eb->refs_lock);
4230	check_buffer_tree_ref(eb);
4231	spin_unlock(&eb->refs_lock);
4232	spin_unlock(&tree->buffer_lock);
4233	radix_tree_preload_end();
4234
4235	/*
4236	 * there is a race where release page may have
4237	 * tried to find this extent buffer in the radix
4238	 * but failed.  It will tell the VM it is safe to
4239	 * reclaim the page, and it will clear the page private bit.
4240	 * We must make sure to set the page private bit properly
4241	 * after the extent buffer is in the radix tree so
4242	 * it doesn't get lost
4243	 */
4244	SetPageChecked(eb->pages[0]);
4245	for (i = 1; i < num_pages; i++) {
4246		p = extent_buffer_page(eb, i);
4247		ClearPageChecked(p);
4248		unlock_page(p);
4249	}
4250	unlock_page(eb->pages[0]);
4251	return eb;
4252
4253free_eb:
4254	for (i = 0; i < num_pages; i++) {
4255		if (eb->pages[i])
4256			unlock_page(eb->pages[i]);
4257	}
4258
4259	WARN_ON(!atomic_dec_and_test(&eb->refs));
4260	btrfs_release_extent_buffer(eb);
4261	return exists;
4262}
4263
4264struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4265					 u64 start, unsigned long len)
4266{
4267	struct extent_buffer *eb;
4268
4269	rcu_read_lock();
4270	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4271	if (eb && atomic_inc_not_zero(&eb->refs)) {
4272		rcu_read_unlock();
4273		mark_extent_buffer_accessed(eb);
4274		return eb;
4275	}
4276	rcu_read_unlock();
4277
4278	return NULL;
4279}
4280
4281static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4282{
4283	struct extent_buffer *eb =
4284			container_of(head, struct extent_buffer, rcu_head);
4285
4286	__free_extent_buffer(eb);
4287}
4288
4289/* Expects to have eb->refs_lock already held */
4290static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4291{
4292	WARN_ON(atomic_read(&eb->refs) == 0);
4293	if (atomic_dec_and_test(&eb->refs)) {
4294		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4295			spin_unlock(&eb->refs_lock);
4296		} else {
4297			struct extent_io_tree *tree = eb->tree;
4298
4299			spin_unlock(&eb->refs_lock);
4300
4301			spin_lock(&tree->buffer_lock);
4302			radix_tree_delete(&tree->buffer,
4303					  eb->start >> PAGE_CACHE_SHIFT);
4304			spin_unlock(&tree->buffer_lock);
4305		}
4306
4307		/* Should be safe to release our pages at this point */
4308		btrfs_release_extent_buffer_page(eb, 0);
4309
4310		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4311		return;
4312	}
4313	spin_unlock(&eb->refs_lock);
4314}
4315
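/*
 * Drop a reference to @eb.  Dummy buffers, and stale buffers that are
 * idle, give up their extra/tree reference here first, so that the final
 * put in release_extent_buffer() can actually free them.
 */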
4316void free_extent_buffer(struct extent_buffer *eb)
4317{
4318	if (!eb)
4319		return;
4320
4321	spin_lock(&eb->refs_lock);
4322	if (atomic_read(&eb->refs) == 2 &&
4323	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4324		atomic_dec(&eb->refs);
4325
4326	if (atomic_read(&eb->refs) == 2 &&
4327	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4328	    !extent_buffer_under_io(eb) &&
4329	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4330		atomic_dec(&eb->refs);
4331
4332	/*
4333	 * I know this is terrible, but it's temporary until we stop tracking
4334	 * the uptodate bits and such for the extent buffers.
4335	 */
4336	release_extent_buffer(eb, GFP_ATOMIC);
4337}
4338
4339void free_extent_buffer_stale(struct extent_buffer *eb)
4340{
4341	if (!eb)
4342		return;
4343
4344	spin_lock(&eb->refs_lock);
4345	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4346
4347	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4348	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4349		atomic_dec(&eb->refs);
4350	release_extent_buffer(eb, GFP_NOFS);
4351}
4352
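/*
 * Clear the dirty state from every backing page: run
 * clear_page_dirty_for_io(), drop the PAGECACHE_TAG_DIRTY radix tag if
 * the page is no longer dirty so writeback skips it, and clear any
 * error bit.
 */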
4353void clear_extent_buffer_dirty(struct extent_buffer *eb)
4354{
4355	unsigned long i;
4356	unsigned long num_pages;
4357	struct page *page;
4358
4359	num_pages = num_extent_pages(eb->start, eb->len);
4360
4361	for (i = 0; i < num_pages; i++) {
4362		page = extent_buffer_page(eb, i);
4363		if (!PageDirty(page))
4364			continue;
4365
4366		lock_page(page);
4367		WARN_ON(!PagePrivate(page));
4368
4369		clear_page_dirty_for_io(page);
4370		spin_lock_irq(&page->mapping->tree_lock);
4371		if (!PageDirty(page)) {
4372			radix_tree_tag_clear(&page->mapping->page_tree,
4373						page_index(page),
4374						PAGECACHE_TAG_DIRTY);
4375		}
4376		spin_unlock_irq(&page->mapping->tree_lock);
4377		ClearPageError(page);
4378		unlock_page(page);
4379	}
4380	WARN_ON(atomic_read(&eb->refs) == 0);
4381}
4382
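/*
 * Mark the buffer and all of its backing pages dirty.  Returns whether
 * the EXTENT_BUFFER_DIRTY bit was already set before this call.
 */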
4383int set_extent_buffer_dirty(struct extent_buffer *eb)
4384{
4385	unsigned long i;
4386	unsigned long num_pages;
4387	int was_dirty = 0;
4388
4389	check_buffer_tree_ref(eb);
4390
4391	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4392
4393	num_pages = num_extent_pages(eb->start, eb->len);
4394	WARN_ON(atomic_read(&eb->refs) == 0);
4395	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4396
4397	for (i = 0; i < num_pages; i++)
4398		set_page_dirty(extent_buffer_page(eb, i));
4399	return was_dirty;
4400}
4401
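/*
 * Return 1 if [start, start + len) is shorter than a page or is not
 * page aligned at either end, 0 if it covers whole pages exactly.
 */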
4402static int range_straddles_pages(u64 start, u64 len)
4403{
4404	if (len < PAGE_CACHE_SIZE)
4405		return 1;
4406	if (start & (PAGE_CACHE_SIZE - 1))
4407		return 1;
4408	if ((start + len) & (PAGE_CACHE_SIZE - 1))
4409		return 1;
4410	return 0;
4411}
4412
4413int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4414{
4415	unsigned long i;
4416	struct page *page;
4417	unsigned long num_pages;
4418
4419	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4420	num_pages = num_extent_pages(eb->start, eb->len);
4421	for (i = 0; i < num_pages; i++) {
4422		page = extent_buffer_page(eb, i);
4423		if (page)
4424			ClearPageUptodate(page);
4425	}
4426	return 0;
4427}
4428
4429int set_extent_buffer_uptodate(struct extent_buffer *eb)
4430{
4431	unsigned long i;
4432	struct page *page;
4433	unsigned long num_pages;
4434
4435	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4436	num_pages = num_extent_pages(eb->start, eb->len);
4437	for (i = 0; i < num_pages; i++) {
4438		page = extent_buffer_page(eb, i);
4439		SetPageUptodate(page);
4440	}
4441	return 0;
4442}
4443
4444int extent_range_uptodate(struct extent_io_tree *tree,
4445			  u64 start, u64 end)
4446{
4447	struct page *page;
4448	int ret;
4449	int pg_uptodate = 1;
4450	int uptodate;
4451	unsigned long index;
4452
4453	if (range_straddles_pages(start, end - start + 1)) {
4454		ret = test_range_bit(tree, start, end,
4455				     EXTENT_UPTODATE, 1, NULL);
4456		if (ret)
4457			return 1;
4458	}
4459	while (start <= end) {
4460		index = start >> PAGE_CACHE_SHIFT;
4461		page = find_get_page(tree->mapping, index);
4462		if (!page)
4463			return 1;
4464		uptodate = PageUptodate(page);
4465		page_cache_release(page);
4466		if (!uptodate) {
4467			pg_uptodate = 0;
4468			break;
4469		}
4470		start += PAGE_CACHE_SIZE;
4471	}
4472	return pg_uptodate;
4473}
4474
4475int extent_buffer_uptodate(struct extent_buffer *eb)
4476{
4477	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4478}
4479
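/*
 * Read the buffer's backing pages in from disk.  Buffers already marked
 * EXTENT_BUFFER_UPTODATE return immediately.  Pages from @start onwards
 * are locked and reads are submitted for those that are not uptodate;
 * WAIT_NONE only trylocks the pages and never sleeps, while
 * WAIT_COMPLETE waits for every page and returns -EIO if one did not
 * come back uptodate.
 */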
4480int read_extent_buffer_pages(struct extent_io_tree *tree,
4481			     struct extent_buffer *eb, u64 start, int wait,
4482			     get_extent_t *get_extent, int mirror_num)
4483{
4484	unsigned long i;
4485	unsigned long start_i;
4486	struct page *page;
4487	int err;
4488	int ret = 0;
4489	int locked_pages = 0;
4490	int all_uptodate = 1;
4491	unsigned long num_pages;
4492	unsigned long num_reads = 0;
4493	struct bio *bio = NULL;
4494	unsigned long bio_flags = 0;
4495
4496	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4497		return 0;
4498
4499	if (start) {
4500		WARN_ON(start < eb->start);
4501		start_i = (start >> PAGE_CACHE_SHIFT) -
4502			(eb->start >> PAGE_CACHE_SHIFT);
4503	} else {
4504		start_i = 0;
4505	}
4506
4507	num_pages = num_extent_pages(eb->start, eb->len);
4508	for (i = start_i; i < num_pages; i++) {
4509		page = extent_buffer_page(eb, i);
4510		if (wait == WAIT_NONE) {
4511			if (!trylock_page(page))
4512				goto unlock_exit;
4513		} else {
4514			lock_page(page);
4515		}
4516		locked_pages++;
4517		if (!PageUptodate(page)) {
4518			num_reads++;
4519			all_uptodate = 0;
4520		}
4521	}
4522	if (all_uptodate) {
4523		if (start_i == 0)
4524			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4525		goto unlock_exit;
4526	}
4527
4528	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4529	eb->read_mirror = 0;
4530	atomic_set(&eb->io_pages, num_reads);
4531	for (i = start_i; i < num_pages; i++) {
4532		page = extent_buffer_page(eb, i);
4533		if (!PageUptodate(page)) {
4534			ClearPageError(page);
4535			err = __extent_read_full_page(tree, page,
4536						      get_extent, &bio,
4537						      mirror_num, &bio_flags);
4538			if (err)
4539				ret = err;
4540		} else {
4541			unlock_page(page);
4542		}
4543	}
4544
4545	if (bio) {
4546		err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4547		if (err)
4548			return err;
4549	}
4550
4551	if (ret || wait != WAIT_COMPLETE)
4552		return ret;
4553
4554	for (i = start_i; i < num_pages; i++) {
4555		page = extent_buffer_page(eb, i);
4556		wait_on_page_locked(page);
4557		if (!PageUptodate(page))
4558			ret = -EIO;
4559	}
4560
4561	return ret;
4562
4563unlock_exit:
4564	i = start_i;
4565	while (locked_pages > 0) {
4566		page = extent_buffer_page(eb, i);
4567		i++;
4568		unlock_page(page);
4569		locked_pages--;
4570	}
4571	return ret;
4572}
4573
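/*
 * Copy @len bytes starting at logical offset @start inside the extent
 * buffer into @dstv, walking page by page through the backing pages.
 */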
4574void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4575			unsigned long start,
4576			unsigned long len)
4577{
4578	size_t cur;
4579	size_t offset;
4580	struct page *page;
4581	char *kaddr;
4582	char *dst = (char *)dstv;
4583	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4584	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4585
4586	WARN_ON(start > eb->len);
4587	WARN_ON(start + len > eb->start + eb->len);
4588
4589	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4590
4591	while (len > 0) {
4592		page = extent_buffer_page(eb, i);
4593
4594		cur = min(len, (PAGE_CACHE_SIZE - offset));
4595		kaddr = page_address(page);
4596		memcpy(dst, kaddr + offset, cur);
4597
4598		dst += cur;
4599		len -= cur;
4600		offset = 0;
4601		i++;
4602	}
4603}
4604
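/*
 * Map a chunk of the buffer directly.  The requested [start, start + min_len)
 * range must not cross a page boundary; on success *map points into the
 * single backing page, *map_start is the buffer offset where that page
 * begins and *map_len is how many bytes are addressable from *map.
 */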
4605int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4606			       unsigned long min_len, char **map,
4607			       unsigned long *map_start,
4608			       unsigned long *map_len)
4609{
4610	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4611	char *kaddr;
4612	struct page *p;
4613	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4614	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4615	unsigned long end_i = (start_offset + start + min_len - 1) >>
4616		PAGE_CACHE_SHIFT;
4617
4618	if (i != end_i)
4619		return -EINVAL;
4620
4621	if (i == 0) {
4622		offset = start_offset;
4623		*map_start = 0;
4624	} else {
4625		offset = 0;
4626		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4627	}
4628
4629	if (start + min_len > eb->len) {
4630		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4631		       "wanted %lu %lu\n", (unsigned long long)eb->start,
4632		       eb->len, start, min_len);
4633		WARN_ON(1);
4634		return -EINVAL;
4635	}
4636
4637	p = extent_buffer_page(eb, i);
4638	kaddr = page_address(p);
4639	*map = kaddr + offset;
4640	*map_len = PAGE_CACHE_SIZE - offset;
4641	return 0;
4642}
4643
4644int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4645			  unsigned long start,
4646			  unsigned long len)
4647{
4648	size_t cur;
4649	size_t offset;
4650	struct page *page;
4651	char *kaddr;
4652	char *ptr = (char *)ptrv;
4653	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4654	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4655	int ret = 0;
4656
4657	WARN_ON(start > eb->len);
4658	WARN_ON(start + len > eb->start + eb->len);
4659
4660	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4661
4662	while (len > 0) {
4663		page = extent_buffer_page(eb, i);
4664
4665		cur = min(len, (PAGE_CACHE_SIZE - offset));
4666
4667		kaddr = page_address(page);
4668		ret = memcmp(ptr, kaddr + offset, cur);
4669		if (ret)
4670			break;
4671
4672		ptr += cur;
4673		len -= cur;
4674		offset = 0;
4675		i++;
4676	}
4677	return ret;
4678}
4679
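/*
 * Copy @len bytes from @srcv into the extent buffer at offset @start,
 * the mirror image of read_extent_buffer().
 */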
4680void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4681			 unsigned long start, unsigned long len)
4682{
4683	size_t cur;
4684	size_t offset;
4685	struct page *page;
4686	char *kaddr;
4687	char *src = (char *)srcv;
4688	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4689	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4690
4691	WARN_ON(start > eb->len);
4692	WARN_ON(start + len > eb->start + eb->len);
4693
4694	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4695
4696	while (len > 0) {
4697		page = extent_buffer_page(eb, i);
4698		WARN_ON(!PageUptodate(page));
4699
4700		cur = min(len, PAGE_CACHE_SIZE - offset);
4701		kaddr = page_address(page);
4702		memcpy(kaddr + offset, src, cur);
4703
4704		src += cur;
4705		len -= cur;
4706		offset = 0;
4707		i++;
4708	}
4709}
4710
4711void memset_extent_buffer(struct extent_buffer *eb, char c,
4712			  unsigned long start, unsigned long len)
4713{
4714	size_t cur;
4715	size_t offset;
4716	struct page *page;
4717	char *kaddr;
4718	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4719	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4720
4721	WARN_ON(start > eb->len);
4722	WARN_ON(start + len > eb->start + eb->len);
4723
4724	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4725
4726	while (len > 0) {
4727		page = extent_buffer_page(eb, i);
4728		WARN_ON(!PageUptodate(page));
4729
4730		cur = min(len, PAGE_CACHE_SIZE - offset);
4731		kaddr = page_address(page);
4732		memset(kaddr + offset, c, cur);
4733
4734		len -= cur;
4735		offset = 0;
4736		i++;
4737	}
4738}
4739
4740void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4741			unsigned long dst_offset, unsigned long src_offset,
4742			unsigned long len)
4743{
4744	u64 dst_len = dst->len;
4745	size_t cur;
4746	size_t offset;
4747	struct page *page;
4748	char *kaddr;
4749	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4750	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4751
4752	WARN_ON(src->len != dst_len);
4753
4754	offset = (start_offset + dst_offset) &
4755		((unsigned long)PAGE_CACHE_SIZE - 1);
4756
4757	while (len > 0) {
4758		page = extent_buffer_page(dst, i);
4759		WARN_ON(!PageUptodate(page));
4760
4761		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4762
4763		kaddr = page_address(page);
4764		read_extent_buffer(src, kaddr + offset, src_offset, cur);
4765
4766		src_offset += cur;
4767		len -= cur;
4768		offset = 0;
4769		i++;
4770	}
4771}
4772
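/*
 * memmove-style helper for memmove_extent_buffer(): if both offsets land
 * in the same page it simply memmove()s, otherwise it copies the bytes
 * backwards one at a time, which stays correct for overlapping ranges
 * while the caller walks the buffer from the end.
 */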
4773static void move_pages(struct page *dst_page, struct page *src_page,
4774		       unsigned long dst_off, unsigned long src_off,
4775		       unsigned long len)
4776{
4777	char *dst_kaddr = page_address(dst_page);
4778	if (dst_page == src_page) {
4779		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4780	} else {
4781		char *src_kaddr = page_address(src_page);
4782		char *p = dst_kaddr + dst_off + len;
4783		char *s = src_kaddr + src_off + len;
4784
4785		while (len--)
4786			*--p = *--s;
4787	}
4788}
4789
4790static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4791{
4792	unsigned long distance = (src > dst) ? src - dst : dst - src;
4793	return distance < len;
4794}
4795
4796static void copy_pages(struct page *dst_page, struct page *src_page,
4797		       unsigned long dst_off, unsigned long src_off,
4798		       unsigned long len)
4799{
4800	char *dst_kaddr = page_address(dst_page);
4801	char *src_kaddr;
4802	int must_memmove = 0;
4803
4804	if (dst_page != src_page) {
4805		src_kaddr = page_address(src_page);
4806	} else {
4807		src_kaddr = dst_kaddr;
4808		if (areas_overlap(src_off, dst_off, len))
4809			must_memmove = 1;
4810	}
4811
4812	if (must_memmove)
4813		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4814	else
4815		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4816}
4817
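/*
 * Copy @len bytes inside @dst from @src_offset to @dst_offset, one
 * page-bounded chunk at a time; copy_pages() picks memcpy() or memmove()
 * depending on whether the chunk overlaps within a page.
 * memmove_extent_buffer() handles overlapping copies where the
 * destination lies above the source.
 */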
4818void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4819			   unsigned long src_offset, unsigned long len)
4820{
4821	size_t cur;
4822	size_t dst_off_in_page;
4823	size_t src_off_in_page;
4824	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4825	unsigned long dst_i;
4826	unsigned long src_i;
4827
4828	if (src_offset + len > dst->len) {
4829		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4830		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4831		BUG_ON(1);
4832	}
4833	if (dst_offset + len > dst->len) {
4834		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4835		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4836		BUG_ON(1);
4837	}
4838
4839	while (len > 0) {
4840		dst_off_in_page = (start_offset + dst_offset) &
4841			((unsigned long)PAGE_CACHE_SIZE - 1);
4842		src_off_in_page = (start_offset + src_offset) &
4843			((unsigned long)PAGE_CACHE_SIZE - 1);
4844
4845		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4846		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4847
4848		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4849					       src_off_in_page));
4850		cur = min_t(unsigned long, cur,
4851			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4852
4853		copy_pages(extent_buffer_page(dst, dst_i),
4854			   extent_buffer_page(dst, src_i),
4855			   dst_off_in_page, src_off_in_page, cur);
4856
4857		src_offset += cur;
4858		dst_offset += cur;
4859		len -= cur;
4860	}
4861}
4862
4863void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4864			   unsigned long src_offset, unsigned long len)
4865{
4866	size_t cur;
4867	size_t dst_off_in_page;
4868	size_t src_off_in_page;
4869	unsigned long dst_end = dst_offset + len - 1;
4870	unsigned long src_end = src_offset + len - 1;
4871	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4872	unsigned long dst_i;
4873	unsigned long src_i;
4874
4875	if (src_offset + len > dst->len) {
4876		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4877		       "len %lu len %lu\n", src_offset, len, dst->len);
4878		BUG_ON(1);
4879	}
4880	if (dst_offset + len > dst->len) {
4881		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4882		       "len %lu len %lu\n", dst_offset, len, dst->len);
4883		BUG_ON(1);
4884	}
4885	if (dst_offset < src_offset) {
4886		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4887		return;
4888	}
4889	while (len > 0) {
4890		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4891		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4892
4893		dst_off_in_page = (start_offset + dst_end) &
4894			((unsigned long)PAGE_CACHE_SIZE - 1);
4895		src_off_in_page = (start_offset + src_end) &
4896			((unsigned long)PAGE_CACHE_SIZE - 1);
4897
4898		cur = min_t(unsigned long, len, src_off_in_page + 1);
4899		cur = min(cur, dst_off_in_page + 1);
4900		move_pages(extent_buffer_page(dst, dst_i),
4901			   extent_buffer_page(dst, src_i),
4902			   dst_off_in_page - cur + 1,
4903			   src_off_in_page - cur + 1, cur);
4904
4905		dst_end -= cur;
4906		src_end -= cur;
4907		len -= cur;
4908	}
4909}
4910
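/*
 * Called when the VM wants to release @page.  Returns 1 if the page has
 * no extent buffer attached, or if the only remaining reference was the
 * tree's own (in which case the buffer is released here); returns 0 if
 * the buffer is still referenced or under I/O and the page must stay.
 */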
4911int try_release_extent_buffer(struct page *page, gfp_t mask)
4912{
4913	struct extent_buffer *eb;
4914
4915	/*
4916	 * We need to make sure nobody is attaching this page to an eb right
4917	 * now.
4918	 */
4919	spin_lock(&page->mapping->private_lock);
4920	if (!PagePrivate(page)) {
4921		spin_unlock(&page->mapping->private_lock);
4922		return 1;
4923	}
4924
4925	eb = (struct extent_buffer *)page->private;
4926	BUG_ON(!eb);
4927
4928	/*
4929	 * This is a little awful but should be ok, we need to make sure that
4930	 * the eb doesn't disappear out from under us while we're looking at
4931	 * this page.
4932	 */
4933	spin_lock(&eb->refs_lock);
4934	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4935		spin_unlock(&eb->refs_lock);
4936		spin_unlock(&page->mapping->private_lock);
4937		return 0;
4938	}
4939	spin_unlock(&page->mapping->private_lock);
4940
4941	if ((mask & GFP_NOFS) == GFP_NOFS)
4942		mask = GFP_NOFS;
4943
4944	/*
4945	 * If tree ref isn't set then we know the ref on this eb is a real ref,
4946	 * so just return; this page will likely be freed soon anyway.
4947	 */
4948	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4949		spin_unlock(&eb->refs_lock);
4950		return 0;
4951	}
4952	release_extent_buffer(eb, mask);
4953
4954	return 1;
4955}
v3.1
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/module.h>
   8#include <linux/spinlock.h>
   9#include <linux/blkdev.h>
  10#include <linux/swap.h>
  11#include <linux/writeback.h>
  12#include <linux/pagevec.h>
  13#include <linux/prefetch.h>
  14#include <linux/cleancache.h>
  15#include "extent_io.h"
  16#include "extent_map.h"
  17#include "compat.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
 
 
 
 
  20
  21static struct kmem_cache *extent_state_cache;
  22static struct kmem_cache *extent_buffer_cache;
  23
  24static LIST_HEAD(buffers);
  25static LIST_HEAD(states);
  26
  27#define LEAK_DEBUG 0
  28#if LEAK_DEBUG
  29static DEFINE_SPINLOCK(leak_lock);
  30#endif
  31
  32#define BUFFER_LRU_MAX 64
  33
  34struct tree_entry {
  35	u64 start;
  36	u64 end;
  37	struct rb_node rb_node;
  38};
  39
  40struct extent_page_data {
  41	struct bio *bio;
  42	struct extent_io_tree *tree;
  43	get_extent_t *get_extent;
  44
  45	/* tells writepage not to lock the state bits for this range
  46	 * it still does the unlocking
  47	 */
  48	unsigned int extent_locked:1;
  49
  50	/* tells the submit_bio code to use a WRITE_SYNC */
  51	unsigned int sync_io:1;
  52};
  53
 
 
 
 
 
 
 
  54int __init extent_io_init(void)
  55{
  56	extent_state_cache = kmem_cache_create("extent_state",
  57			sizeof(struct extent_state), 0,
  58			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  59	if (!extent_state_cache)
  60		return -ENOMEM;
  61
  62	extent_buffer_cache = kmem_cache_create("extent_buffers",
  63			sizeof(struct extent_buffer), 0,
  64			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  65	if (!extent_buffer_cache)
  66		goto free_state_cache;
  67	return 0;
  68
  69free_state_cache:
  70	kmem_cache_destroy(extent_state_cache);
  71	return -ENOMEM;
  72}
  73
  74void extent_io_exit(void)
  75{
  76	struct extent_state *state;
  77	struct extent_buffer *eb;
  78
  79	while (!list_empty(&states)) {
  80		state = list_entry(states.next, struct extent_state, leak_list);
  81		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
  82		       "state %lu in tree %p refs %d\n",
  83		       (unsigned long long)state->start,
  84		       (unsigned long long)state->end,
  85		       state->state, state->tree, atomic_read(&state->refs));
  86		list_del(&state->leak_list);
  87		kmem_cache_free(extent_state_cache, state);
  88
  89	}
  90
  91	while (!list_empty(&buffers)) {
  92		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
  93		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
  94		       "refs %d\n", (unsigned long long)eb->start,
  95		       eb->len, atomic_read(&eb->refs));
  96		list_del(&eb->leak_list);
  97		kmem_cache_free(extent_buffer_cache, eb);
  98	}
  99	if (extent_state_cache)
 100		kmem_cache_destroy(extent_state_cache);
 101	if (extent_buffer_cache)
 102		kmem_cache_destroy(extent_buffer_cache);
 103}
 104
 105void extent_io_tree_init(struct extent_io_tree *tree,
 106			 struct address_space *mapping)
 107{
 108	tree->state = RB_ROOT;
 109	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 110	tree->ops = NULL;
 111	tree->dirty_bytes = 0;
 112	spin_lock_init(&tree->lock);
 113	spin_lock_init(&tree->buffer_lock);
 114	tree->mapping = mapping;
 115}
 116
 117static struct extent_state *alloc_extent_state(gfp_t mask)
 118{
 119	struct extent_state *state;
 120#if LEAK_DEBUG
 121	unsigned long flags;
 122#endif
 123
 124	state = kmem_cache_alloc(extent_state_cache, mask);
 125	if (!state)
 126		return state;
 127	state->state = 0;
 128	state->private = 0;
 129	state->tree = NULL;
 130#if LEAK_DEBUG
 131	spin_lock_irqsave(&leak_lock, flags);
 132	list_add(&state->leak_list, &states);
 133	spin_unlock_irqrestore(&leak_lock, flags);
 134#endif
 135	atomic_set(&state->refs, 1);
 136	init_waitqueue_head(&state->wq);
 
 137	return state;
 138}
 139
 140void free_extent_state(struct extent_state *state)
 141{
 142	if (!state)
 143		return;
 144	if (atomic_dec_and_test(&state->refs)) {
 145#if LEAK_DEBUG
 146		unsigned long flags;
 147#endif
 148		WARN_ON(state->tree);
 149#if LEAK_DEBUG
 150		spin_lock_irqsave(&leak_lock, flags);
 151		list_del(&state->leak_list);
 152		spin_unlock_irqrestore(&leak_lock, flags);
 153#endif
 
 154		kmem_cache_free(extent_state_cache, state);
 155	}
 156}
 157
 158static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 159				   struct rb_node *node)
 160{
 161	struct rb_node **p = &root->rb_node;
 162	struct rb_node *parent = NULL;
 163	struct tree_entry *entry;
 164
 165	while (*p) {
 166		parent = *p;
 167		entry = rb_entry(parent, struct tree_entry, rb_node);
 168
 169		if (offset < entry->start)
 170			p = &(*p)->rb_left;
 171		else if (offset > entry->end)
 172			p = &(*p)->rb_right;
 173		else
 174			return parent;
 175	}
 176
 177	entry = rb_entry(node, struct tree_entry, rb_node);
 178	rb_link_node(node, parent, p);
 179	rb_insert_color(node, root);
 180	return NULL;
 181}
 182
 183static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 184				     struct rb_node **prev_ret,
 185				     struct rb_node **next_ret)
 186{
 187	struct rb_root *root = &tree->state;
 188	struct rb_node *n = root->rb_node;
 189	struct rb_node *prev = NULL;
 190	struct rb_node *orig_prev = NULL;
 191	struct tree_entry *entry;
 192	struct tree_entry *prev_entry = NULL;
 193
 194	while (n) {
 195		entry = rb_entry(n, struct tree_entry, rb_node);
 196		prev = n;
 197		prev_entry = entry;
 198
 199		if (offset < entry->start)
 200			n = n->rb_left;
 201		else if (offset > entry->end)
 202			n = n->rb_right;
 203		else
 204			return n;
 205	}
 206
 207	if (prev_ret) {
 208		orig_prev = prev;
 209		while (prev && offset > prev_entry->end) {
 210			prev = rb_next(prev);
 211			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 212		}
 213		*prev_ret = prev;
 214		prev = orig_prev;
 215	}
 216
 217	if (next_ret) {
 218		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 219		while (prev && offset < prev_entry->start) {
 220			prev = rb_prev(prev);
 221			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 222		}
 223		*next_ret = prev;
 224	}
 225	return NULL;
 226}
 227
 228static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 229					  u64 offset)
 230{
 231	struct rb_node *prev = NULL;
 232	struct rb_node *ret;
 233
 234	ret = __etree_search(tree, offset, &prev, NULL);
 235	if (!ret)
 236		return prev;
 237	return ret;
 238}
 239
 240static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 241		     struct extent_state *other)
 242{
 243	if (tree->ops && tree->ops->merge_extent_hook)
 244		tree->ops->merge_extent_hook(tree->mapping->host, new,
 245					     other);
 246}
 247
 248/*
 249 * utility function to look for merge candidates inside a given range.
 250 * Any extents with matching state are merged together into a single
 251 * extent in the tree.  Extents with EXTENT_IO in their state field
 252 * are not merged because the end_io handlers need to be able to do
 253 * operations on them without sleeping (or doing allocations/splits).
 254 *
 255 * This should be called with the tree lock held.
 256 */
 257static void merge_state(struct extent_io_tree *tree,
 258		        struct extent_state *state)
 259{
 260	struct extent_state *other;
 261	struct rb_node *other_node;
 262
 263	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 264		return;
 265
 266	other_node = rb_prev(&state->rb_node);
 267	if (other_node) {
 268		other = rb_entry(other_node, struct extent_state, rb_node);
 269		if (other->end == state->start - 1 &&
 270		    other->state == state->state) {
 271			merge_cb(tree, state, other);
 272			state->start = other->start;
 273			other->tree = NULL;
 274			rb_erase(&other->rb_node, &tree->state);
 275			free_extent_state(other);
 276		}
 277	}
 278	other_node = rb_next(&state->rb_node);
 279	if (other_node) {
 280		other = rb_entry(other_node, struct extent_state, rb_node);
 281		if (other->start == state->end + 1 &&
 282		    other->state == state->state) {
 283			merge_cb(tree, state, other);
 284			state->end = other->end;
 285			other->tree = NULL;
 286			rb_erase(&other->rb_node, &tree->state);
 287			free_extent_state(other);
 288		}
 289	}
 290}
 291
 292static void set_state_cb(struct extent_io_tree *tree,
 293			 struct extent_state *state, int *bits)
 294{
 295	if (tree->ops && tree->ops->set_bit_hook)
 296		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 297}
 298
 299static void clear_state_cb(struct extent_io_tree *tree,
 300			   struct extent_state *state, int *bits)
 301{
 302	if (tree->ops && tree->ops->clear_bit_hook)
 303		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 304}
 305
 306static void set_state_bits(struct extent_io_tree *tree,
 307			   struct extent_state *state, int *bits);
 308
 309/*
 310 * insert an extent_state struct into the tree.  'bits' are set on the
 311 * struct before it is inserted.
 312 *
 313 * This may return -EEXIST if the extent is already there, in which case the
 314 * state struct is freed.
 315 *
 316 * The tree lock is not taken internally.  This is a utility function and
 317 * probably isn't what you want to call (see set/clear_extent_bit).
 318 */
 319static int insert_state(struct extent_io_tree *tree,
 320			struct extent_state *state, u64 start, u64 end,
 321			int *bits)
 322{
 323	struct rb_node *node;
 324
 325	if (end < start) {
 326		printk(KERN_ERR "btrfs end < start %llu %llu\n",
 327		       (unsigned long long)end,
 328		       (unsigned long long)start);
 329		WARN_ON(1);
 330	}
 331	state->start = start;
 332	state->end = end;
 333
 334	set_state_bits(tree, state, bits);
 335
 336	node = tree_insert(&tree->state, end, &state->rb_node);
 337	if (node) {
 338		struct extent_state *found;
 339		found = rb_entry(node, struct extent_state, rb_node);
 340		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
 341		       "%llu %llu\n", (unsigned long long)found->start,
 342		       (unsigned long long)found->end,
 343		       (unsigned long long)start, (unsigned long long)end);
 344		return -EEXIST;
 345	}
 346	state->tree = tree;
 347	merge_state(tree, state);
 348	return 0;
 349}
 350
 351static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 352		     u64 split)
 353{
 354	if (tree->ops && tree->ops->split_extent_hook)
 355		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 356}
 357
 358/*
 359 * split a given extent state struct in two, inserting the preallocated
 360 * struct 'prealloc' as the newly created second half.  'split' indicates an
 361 * offset inside 'orig' where it should be split.
 362 *
 363 * Before calling,
 364 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 365 * are two extent state structs in the tree:
 366 * prealloc: [orig->start, split - 1]
 367 * orig: [ split, orig->end ]
 368 *
 369 * The tree locks are not taken by this function. They need to be held
 370 * by the caller.
 371 */
 372static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 373		       struct extent_state *prealloc, u64 split)
 374{
 375	struct rb_node *node;
 376
 377	split_cb(tree, orig, split);
 378
 379	prealloc->start = orig->start;
 380	prealloc->end = split - 1;
 381	prealloc->state = orig->state;
 382	orig->start = split;
 383
 384	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 385	if (node) {
 386		free_extent_state(prealloc);
 387		return -EEXIST;
 388	}
 389	prealloc->tree = tree;
 390	return 0;
 391}
 392
 
 
 
 
 
 
 
 
 
 393/*
 394 * utility function to clear some bits in an extent state struct.
 395 * it will optionally wake up any one waiting on this state (wake == 1), or
 396 * forcibly remove the state from the tree (delete == 1).
 397 *
 398 * If no bits are set on the state struct after clearing things, the
 399 * struct is freed and removed from the tree
 400 */
 401static int clear_state_bit(struct extent_io_tree *tree,
 402			    struct extent_state *state,
 403			    int *bits, int wake)
 404{
 
 405	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 406	int ret = state->state & bits_to_clear;
 407
 408	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 409		u64 range = state->end - state->start + 1;
 410		WARN_ON(range > tree->dirty_bytes);
 411		tree->dirty_bytes -= range;
 412	}
 413	clear_state_cb(tree, state, bits);
 414	state->state &= ~bits_to_clear;
 415	if (wake)
 416		wake_up(&state->wq);
 417	if (state->state == 0) {
 
 418		if (state->tree) {
 419			rb_erase(&state->rb_node, &tree->state);
 420			state->tree = NULL;
 421			free_extent_state(state);
 422		} else {
 423			WARN_ON(1);
 424		}
 425	} else {
 426		merge_state(tree, state);
 
 427	}
 428	return ret;
 429}
 430
 431static struct extent_state *
 432alloc_extent_state_atomic(struct extent_state *prealloc)
 433{
 434	if (!prealloc)
 435		prealloc = alloc_extent_state(GFP_ATOMIC);
 436
 437	return prealloc;
 438}
 439
 
 
 
 
 
 
 
 440/*
 441 * clear some bits on a range in the tree.  This may require splitting
 442 * or inserting elements in the tree, so the gfp mask is used to
 443 * indicate which allocations or sleeping are allowed.
 444 *
 445 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 446 * the given range from the tree regardless of state (ie for truncate).
 447 *
 448 * the range [start, end] is inclusive.
 449 *
 450 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 451 * bits were already set, or zero if none of the bits were already set.
 452 */
 453int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 454		     int bits, int wake, int delete,
 455		     struct extent_state **cached_state,
 456		     gfp_t mask)
 457{
 458	struct extent_state *state;
 459	struct extent_state *cached;
 460	struct extent_state *prealloc = NULL;
 461	struct rb_node *next_node;
 462	struct rb_node *node;
 463	u64 last_end;
 464	int err;
 465	int set = 0;
 466	int clear = 0;
 467
 468	if (delete)
 469		bits |= ~EXTENT_CTLBITS;
 470	bits |= EXTENT_FIRST_DELALLOC;
 471
 472	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 473		clear = 1;
 474again:
 475	if (!prealloc && (mask & __GFP_WAIT)) {
 476		prealloc = alloc_extent_state(mask);
 477		if (!prealloc)
 478			return -ENOMEM;
 479	}
 480
 481	spin_lock(&tree->lock);
 482	if (cached_state) {
 483		cached = *cached_state;
 484
 485		if (clear) {
 486			*cached_state = NULL;
 487			cached_state = NULL;
 488		}
 489
 490		if (cached && cached->tree && cached->start <= start &&
 491		    cached->end > start) {
 492			if (clear)
 493				atomic_dec(&cached->refs);
 494			state = cached;
 495			goto hit_next;
 496		}
 497		if (clear)
 498			free_extent_state(cached);
 499	}
 500	/*
 501	 * this search will find the extents that end after
 502	 * our range starts
 503	 */
 504	node = tree_search(tree, start);
 505	if (!node)
 506		goto out;
 507	state = rb_entry(node, struct extent_state, rb_node);
 508hit_next:
 509	if (state->start > end)
 510		goto out;
 511	WARN_ON(state->end < start);
 512	last_end = state->end;
 513
 
 
 
 
 
 
 514	/*
 515	 *     | ---- desired range ---- |
 516	 *  | state | or
 517	 *  | ------------- state -------------- |
 518	 *
 519	 * We need to split the extent we found, and may flip
 520	 * bits on second half.
 521	 *
 522	 * If the extent we found extends past our range, we
 523	 * just split and search again.  It'll get split again
 524	 * the next time though.
 525	 *
 526	 * If the extent we found is inside our range, we clear
 527	 * the desired bit on it.
 528	 */
 529
 530	if (state->start < start) {
 531		prealloc = alloc_extent_state_atomic(prealloc);
 532		BUG_ON(!prealloc);
 533		err = split_state(tree, state, prealloc, start);
 534		BUG_ON(err == -EEXIST);
 
 
 535		prealloc = NULL;
 536		if (err)
 537			goto out;
 538		if (state->end <= end) {
 539			set |= clear_state_bit(tree, state, &bits, wake);
 540			if (last_end == (u64)-1)
 541				goto out;
 542			start = last_end + 1;
 543		}
 544		goto search_again;
 545	}
 546	/*
 547	 * | ---- desired range ---- |
 548	 *                        | state |
 549	 * We need to split the extent, and clear the bit
 550	 * on the first half
 551	 */
 552	if (state->start <= end && state->end > end) {
 553		prealloc = alloc_extent_state_atomic(prealloc);
 554		BUG_ON(!prealloc);
 555		err = split_state(tree, state, prealloc, end + 1);
 556		BUG_ON(err == -EEXIST);
 
 
 557		if (wake)
 558			wake_up(&state->wq);
 559
 560		set |= clear_state_bit(tree, prealloc, &bits, wake);
 561
 562		prealloc = NULL;
 563		goto out;
 564	}
 565
 566	if (state->end < end && prealloc && !need_resched())
 567		next_node = rb_next(&state->rb_node);
 568	else
 569		next_node = NULL;
 570
 571	set |= clear_state_bit(tree, state, &bits, wake);
 572	if (last_end == (u64)-1)
 573		goto out;
 574	start = last_end + 1;
 575	if (start <= end && next_node) {
 576		state = rb_entry(next_node, struct extent_state,
 577				 rb_node);
 578		if (state->start == start)
 579			goto hit_next;
 580	}
 581	goto search_again;
 582
 583out:
 584	spin_unlock(&tree->lock);
 585	if (prealloc)
 586		free_extent_state(prealloc);
 587
 588	return set;
 589
 590search_again:
 591	if (start > end)
 592		goto out;
 593	spin_unlock(&tree->lock);
 594	if (mask & __GFP_WAIT)
 595		cond_resched();
 596	goto again;
 597}
 598
 599static int wait_on_state(struct extent_io_tree *tree,
 600			 struct extent_state *state)
 601		__releases(tree->lock)
 602		__acquires(tree->lock)
 603{
 604	DEFINE_WAIT(wait);
 605	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 606	spin_unlock(&tree->lock);
 607	schedule();
 608	spin_lock(&tree->lock);
 609	finish_wait(&state->wq, &wait);
 610	return 0;
 611}
 612
 613/*
 614 * waits for one or more bits to clear on a range in the state tree.
 615 * The range [start, end] is inclusive.
 616 * The tree lock is taken by this function
 617 */
 618int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 619{
 620	struct extent_state *state;
 621	struct rb_node *node;
 622
 623	spin_lock(&tree->lock);
 624again:
 625	while (1) {
 626		/*
 627		 * this search will find all the extents that end after
 628		 * our range starts
 629		 */
 630		node = tree_search(tree, start);
 631		if (!node)
 632			break;
 633
 634		state = rb_entry(node, struct extent_state, rb_node);
 635
 636		if (state->start > end)
 637			goto out;
 638
 639		if (state->state & bits) {
 640			start = state->start;
 641			atomic_inc(&state->refs);
 642			wait_on_state(tree, state);
 643			free_extent_state(state);
 644			goto again;
 645		}
 646		start = state->end + 1;
 647
 648		if (start > end)
 649			break;
 650
 651		cond_resched_lock(&tree->lock);
 652	}
 653out:
 654	spin_unlock(&tree->lock);
 655	return 0;
 656}
 657
 658static void set_state_bits(struct extent_io_tree *tree,
 659			   struct extent_state *state,
 660			   int *bits)
 661{
 662	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 663
 664	set_state_cb(tree, state, bits);
 665	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 666		u64 range = state->end - state->start + 1;
 667		tree->dirty_bytes += range;
 668	}
 669	state->state |= bits_to_set;
 670}
 671
 672static void cache_state(struct extent_state *state,
 673			struct extent_state **cached_ptr)
 674{
 675	if (cached_ptr && !(*cached_ptr)) {
 676		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 677			*cached_ptr = state;
 678			atomic_inc(&state->refs);
 679		}
 680	}
 681}
 682
 683static void uncache_state(struct extent_state **cached_ptr)
 684{
 685	if (cached_ptr && (*cached_ptr)) {
 686		struct extent_state *state = *cached_ptr;
 687		*cached_ptr = NULL;
 688		free_extent_state(state);
 689	}
 690}
 691
 692/*
 693 * set some bits on a range in the tree.  This may require allocations or
 694 * sleeping, so the gfp mask is used to indicate what is allowed.
 695 *
 696 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 697 * part of the range already has the desired bits set.  The start of the
 698 * existing range is returned in failed_start in this case.
 699 *
 700 * [start, end] is inclusive This takes the tree lock.
 701 */
 702
 703int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 704		   int bits, int exclusive_bits, u64 *failed_start,
 705		   struct extent_state **cached_state, gfp_t mask)
 
 706{
 707	struct extent_state *state;
 708	struct extent_state *prealloc = NULL;
 709	struct rb_node *node;
 710	int err = 0;
 711	u64 last_start;
 712	u64 last_end;
 713
 714	bits |= EXTENT_FIRST_DELALLOC;
 715again:
 716	if (!prealloc && (mask & __GFP_WAIT)) {
 717		prealloc = alloc_extent_state(mask);
 718		BUG_ON(!prealloc);
 719	}
 720
 721	spin_lock(&tree->lock);
 722	if (cached_state && *cached_state) {
 723		state = *cached_state;
 724		if (state->start <= start && state->end > start &&
 725		    state->tree) {
 726			node = &state->rb_node;
 727			goto hit_next;
 728		}
 729	}
 730	/*
 731	 * this search will find all the extents that end after
 732	 * our range starts.
 733	 */
 734	node = tree_search(tree, start);
 735	if (!node) {
 736		prealloc = alloc_extent_state_atomic(prealloc);
 737		BUG_ON(!prealloc);
 738		err = insert_state(tree, prealloc, start, end, &bits);
 
 
 
 739		prealloc = NULL;
 740		BUG_ON(err == -EEXIST);
 741		goto out;
 742	}
 743	state = rb_entry(node, struct extent_state, rb_node);
 744hit_next:
 745	last_start = state->start;
 746	last_end = state->end;
 747
 748	/*
 749	 * | ---- desired range ---- |
 750	 * | state |
 751	 *
 752	 * Just lock what we found and keep going
 753	 */
 754	if (state->start == start && state->end <= end) {
 755		struct rb_node *next_node;
 756		if (state->state & exclusive_bits) {
 757			*failed_start = state->start;
 758			err = -EEXIST;
 759			goto out;
 760		}
 761
 762		set_state_bits(tree, state, &bits);
 763
 764		cache_state(state, cached_state);
 765		merge_state(tree, state);
 766		if (last_end == (u64)-1)
 767			goto out;
 768
 769		start = last_end + 1;
 770		next_node = rb_next(&state->rb_node);
 771		if (next_node && start < end && prealloc && !need_resched()) {
 772			state = rb_entry(next_node, struct extent_state,
 773					 rb_node);
 774			if (state->start == start)
 775				goto hit_next;
 776		}
 777		goto search_again;
 778	}
 779
 780	/*
 781	 *     | ---- desired range ---- |
 782	 * | state |
 783	 *   or
 784	 * | ------------- state -------------- |
 785	 *
 786	 * We need to split the extent we found, and may flip bits on
 787	 * second half.
 788	 *
 789	 * If the extent we found extends past our
 790	 * range, we just split and search again.  It'll get split
 791	 * again the next time though.
 792	 *
 793	 * If the extent we found is inside our range, we set the
 794	 * desired bit on it.
 795	 */
 796	if (state->start < start) {
 797		if (state->state & exclusive_bits) {
 798			*failed_start = start;
 799			err = -EEXIST;
 800			goto out;
 801		}
 802
 803		prealloc = alloc_extent_state_atomic(prealloc);
 804		BUG_ON(!prealloc);
 805		err = split_state(tree, state, prealloc, start);
 806		BUG_ON(err == -EEXIST);
 
 
 807		prealloc = NULL;
 808		if (err)
 809			goto out;
 810		if (state->end <= end) {
 811			set_state_bits(tree, state, &bits);
 812			cache_state(state, cached_state);
 813			merge_state(tree, state);
 814			if (last_end == (u64)-1)
 815				goto out;
 816			start = last_end + 1;
 
 
 
 
 817		}
 818		goto search_again;
 819	}
 820	/*
 821	 * | ---- desired range ---- |
 822	 *     | state | or               | state |
 823	 *
 824	 * There's a hole, we need to insert something in it and
 825	 * ignore the extent we found.
 826	 */
 827	if (state->start > start) {
 828		u64 this_end;
 829		if (end < last_start)
 830			this_end = end;
 831		else
 832			this_end = last_start - 1;
 833
 834		prealloc = alloc_extent_state_atomic(prealloc);
 835		BUG_ON(!prealloc);
 836
 837		/*
 838		 * Avoid to free 'prealloc' if it can be merged with
 839		 * the later extent.
 840		 */
 841		err = insert_state(tree, prealloc, start, this_end,
 842				   &bits);
 843		BUG_ON(err == -EEXIST);
 844		if (err) {
 845			free_extent_state(prealloc);
 846			prealloc = NULL;
 847			goto out;
 848		}
 849		cache_state(prealloc, cached_state);
 850		prealloc = NULL;
 851		start = this_end + 1;
 852		goto search_again;
 853	}
 854	/*
 855	 * | ---- desired range ---- |
 856	 *                        | state |
 857	 * We need to split the extent, and set the bit
 858	 * on the first half
 859	 */
 860	if (state->start <= end && state->end > end) {
 861		if (state->state & exclusive_bits) {
 862			*failed_start = start;
 863			err = -EEXIST;
 864			goto out;
 865		}
 866
 867		prealloc = alloc_extent_state_atomic(prealloc);
 868		BUG_ON(!prealloc);
 869		err = split_state(tree, state, prealloc, end + 1);
 870		BUG_ON(err == -EEXIST);
 
 871
 872		set_state_bits(tree, prealloc, &bits);
 873		cache_state(prealloc, cached_state);
 874		merge_state(tree, prealloc);
 875		prealloc = NULL;
 876		goto out;
 877	}
 878
 879	goto search_again;
 880
 881out:
 882	spin_unlock(&tree->lock);
 883	if (prealloc)
 884		free_extent_state(prealloc);
 885
 886	return err;
 887
 888search_again:
 889	if (start > end)
 890		goto out;
 891	spin_unlock(&tree->lock);
 892	if (mask & __GFP_WAIT)
 893		cond_resched();
 894	goto again;
 895}
 896
 897/* wrappers around set/clear extent bit */
 898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 899		     gfp_t mask)
 900{
 901	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
 902			      NULL, mask);
 903}
 904
 905int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 906		    int bits, gfp_t mask)
 907{
 908	return set_extent_bit(tree, start, end, bits, 0, NULL,
 909			      NULL, mask);
 910}
 911
 912int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 913		      int bits, gfp_t mask)
 914{
 915	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 916}
 917
 918int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 919			struct extent_state **cached_state, gfp_t mask)
 920{
 921	return set_extent_bit(tree, start, end,
 922			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
 923			      0, NULL, cached_state, mask);
 924}
 925
 926int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 927		       gfp_t mask)
 928{
 929	return clear_extent_bit(tree, start, end,
 930				EXTENT_DIRTY | EXTENT_DELALLOC |
 931				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
 932}
 933
 934int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 935		     gfp_t mask)
 936{
 937	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
 938			      NULL, mask);
 939}
 940
 941int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 942			struct extent_state **cached_state, gfp_t mask)
 943{
 944	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
 945			      NULL, cached_state, mask);
 946}
 947
 948static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
 949				 u64 end, struct extent_state **cached_state,
 950				 gfp_t mask)
 951{
 952	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
 953				cached_state, mask);
 954}
 955
 956/*
 957 * either insert or lock state struct between start and end; use mask to tell
 958 * us if waiting is desired.
 959 */
 960int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 961		     int bits, struct extent_state **cached_state, gfp_t mask)
 962{
 963	int err;
 964	u64 failed_start;
 965	while (1) {
 966		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
 967				     EXTENT_LOCKED, &failed_start,
 968				     cached_state, mask);
 969		if (err == -EEXIST && (mask & __GFP_WAIT)) {
 970			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 971			start = failed_start;
 972		} else {
 973			break;
 974		}
 975		WARN_ON(start > end);
 976	}
 977	return err;
 978}
 979
 980int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 981{
 982	return lock_extent_bits(tree, start, end, 0, NULL, mask);
 983}
 984
 985int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 986		    gfp_t mask)
 987{
 988	int err;
 989	u64 failed_start;
 990
 991	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 992			     &failed_start, NULL, mask);
 993	if (err == -EEXIST) {
 994		if (failed_start > start)
 995			clear_extent_bit(tree, start, failed_start - 1,
 996					 EXTENT_LOCKED, 1, 0, NULL, mask);
 997		return 0;
 998	}
 999	return 1;
1000}
1001
1002int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1003			 struct extent_state **cached, gfp_t mask)
1004{
1005	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1006				mask);
1007}
1008
1009int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1010{
1011	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1012				mask);
1013}
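/*
 * A minimal usage sketch of the locking helpers above (local names are
 * illustrative): callers bracket the byte range they are about to touch,
 * e.g.
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... do the IO for [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * try_lock_extent() is the non-blocking variant: it returns 1 on success
 * and 0 (after undoing any partial lock) if part of the range was already
 * locked by someone else.
 */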
1014
1015/*
1016 * helper function to set both pages and extents in the tree writeback
1017 */
1018static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1019{
1020	unsigned long index = start >> PAGE_CACHE_SHIFT;
1021	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1022	struct page *page;
1023
1024	while (index <= end_index) {
1025		page = find_get_page(tree->mapping, index);
1026		BUG_ON(!page);
1027		set_page_writeback(page);
1028		page_cache_release(page);
1029		index++;
1030	}
1031	return 0;
1032}
1033
1034/* find the first state struct with 'bits' set after 'start', and
1035 * return it.  tree->lock must be held.  NULL will be returned if
1036 * nothing was found after 'start'
1037 */
1038struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1039						 u64 start, int bits)
1040{
1041	struct rb_node *node;
1042	struct extent_state *state;
1043
1044	/*
1045	 * this search will find all the extents that end after
1046	 * our range starts.
1047	 */
1048	node = tree_search(tree, start);
1049	if (!node)
1050		goto out;
1051
1052	while (1) {
1053		state = rb_entry(node, struct extent_state, rb_node);
1054		if (state->end >= start && (state->state & bits))
1055			return state;
1056
1057		node = rb_next(node);
1058		if (!node)
1059			break;
1060	}
1061out:
1062	return NULL;
1063}
1064
1065/*
1066 * find the first offset in the io tree with 'bits' set. zero is
1067 * returned if we find something, and *start_ret and *end_ret are
1068 * set to reflect the state struct that was found.
1069 *
1070 * If nothing was found, 1 is returned, < 0 on error
1071 */
1072int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1073			  u64 *start_ret, u64 *end_ret, int bits)
1074{
1075	struct extent_state *state;
1076	int ret = 1;
1077
1078	spin_lock(&tree->lock);
1079	state = find_first_extent_bit_state(tree, start, bits);
1080	if (state) {
1081		*start_ret = state->start;
1082		*end_ret = state->end;
1083		ret = 0;
1084	}
1085	spin_unlock(&tree->lock);
1086	return ret;
1087}
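/*
 * Sketch of walking every range with a given bit set using the helper
 * above (a hypothetical caller; a return of 0 from find_first_extent_bit
 * means a range was found):
 *
 *	u64 found_start, found_end;
 *	u64 cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start,
 *				      &found_end, EXTENT_DIRTY)) {
 *		... handle [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 */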
1088
1089/*
1090 * find a contiguous range of bytes in the file marked as delalloc, not
1091 * more than 'max_bytes'.  start and end are used to return the range.
1092 *
1093 * 1 is returned if we find something, 0 if nothing was in the tree
1094 */
1095static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1096					u64 *start, u64 *end, u64 max_bytes,
1097					struct extent_state **cached_state)
1098{
1099	struct rb_node *node;
1100	struct extent_state *state;
1101	u64 cur_start = *start;
1102	u64 found = 0;
1103	u64 total_bytes = 0;
1104
1105	spin_lock(&tree->lock);
1106
1107	/*
1108	 * this search will find all the extents that end after
1109	 * our range starts.
1110	 */
1111	node = tree_search(tree, cur_start);
1112	if (!node) {
1113		if (!found)
1114			*end = (u64)-1;
1115		goto out;
1116	}
1117
1118	while (1) {
1119		state = rb_entry(node, struct extent_state, rb_node);
1120		if (found && (state->start != cur_start ||
1121			      (state->state & EXTENT_BOUNDARY))) {
1122			goto out;
1123		}
1124		if (!(state->state & EXTENT_DELALLOC)) {
1125			if (!found)
1126				*end = state->end;
1127			goto out;
1128		}
1129		if (!found) {
1130			*start = state->start;
1131			*cached_state = state;
1132			atomic_inc(&state->refs);
1133		}
1134		found++;
1135		*end = state->end;
1136		cur_start = state->end + 1;
1137		node = rb_next(node);
1138		if (!node)
1139			break;
1140		total_bytes += state->end - state->start + 1;
1141		if (total_bytes >= max_bytes)
1142			break;
1143	}
1144out:
1145	spin_unlock(&tree->lock);
1146	return found;
1147}
1148
1149static noinline int __unlock_for_delalloc(struct inode *inode,
1150					  struct page *locked_page,
1151					  u64 start, u64 end)
1152{
1153	int ret;
1154	struct page *pages[16];
1155	unsigned long index = start >> PAGE_CACHE_SHIFT;
1156	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1157	unsigned long nr_pages = end_index - index + 1;
1158	int i;
1159
1160	if (index == locked_page->index && end_index == index)
1161		return 0;
1162
1163	while (nr_pages > 0) {
1164		ret = find_get_pages_contig(inode->i_mapping, index,
1165				     min_t(unsigned long, nr_pages,
1166				     ARRAY_SIZE(pages)), pages);
1167		for (i = 0; i < ret; i++) {
1168			if (pages[i] != locked_page)
1169				unlock_page(pages[i]);
1170			page_cache_release(pages[i]);
1171		}
1172		nr_pages -= ret;
1173		index += ret;
1174		cond_resched();
1175	}
1176	return 0;
1177}
1178
1179static noinline int lock_delalloc_pages(struct inode *inode,
1180					struct page *locked_page,
1181					u64 delalloc_start,
1182					u64 delalloc_end)
1183{
1184	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1185	unsigned long start_index = index;
1186	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1187	unsigned long pages_locked = 0;
1188	struct page *pages[16];
1189	unsigned long nrpages;
1190	int ret;
1191	int i;
1192
1193	/* the caller is responsible for locking the start index */
1194	if (index == locked_page->index && index == end_index)
1195		return 0;
1196
1197	/* skip the page at the start index */
1198	nrpages = end_index - index + 1;
1199	while (nrpages > 0) {
1200		ret = find_get_pages_contig(inode->i_mapping, index,
1201				     min_t(unsigned long,
1202				     nrpages, ARRAY_SIZE(pages)), pages);
1203		if (ret == 0) {
1204			ret = -EAGAIN;
1205			goto done;
1206		}
1207		/* now we have an array of pages, lock them all */
1208		for (i = 0; i < ret; i++) {
1209			/*
1210			 * the caller is taking responsibility for
1211			 * locked_page
1212			 */
1213			if (pages[i] != locked_page) {
1214				lock_page(pages[i]);
1215				if (!PageDirty(pages[i]) ||
1216				    pages[i]->mapping != inode->i_mapping) {
1217					ret = -EAGAIN;
1218					unlock_page(pages[i]);
1219					page_cache_release(pages[i]);
1220					goto done;
1221				}
1222			}
1223			page_cache_release(pages[i]);
1224			pages_locked++;
1225		}
1226		nrpages -= ret;
1227		index += ret;
1228		cond_resched();
1229	}
1230	ret = 0;
1231done:
1232	if (ret && pages_locked) {
1233		__unlock_for_delalloc(inode, locked_page,
1234			      delalloc_start,
1235			      ((u64)(start_index + pages_locked - 1)) <<
1236			      PAGE_CACHE_SHIFT);
1237	}
1238	return ret;
1239}
1240
1241/*
1242 * find a contiguous range of bytes in the file marked as delalloc, not
1243 * more than 'max_bytes'.  start and end are used to return the range.
1244 *
1245 * 1 is returned if we find something, 0 if nothing was in the tree
1246 */
1247static noinline u64 find_lock_delalloc_range(struct inode *inode,
1248					     struct extent_io_tree *tree,
1249					     struct page *locked_page,
1250					     u64 *start, u64 *end,
1251					     u64 max_bytes)
1252{
1253	u64 delalloc_start;
1254	u64 delalloc_end;
1255	u64 found;
1256	struct extent_state *cached_state = NULL;
1257	int ret;
1258	int loops = 0;
1259
1260again:
1261	/* step one, find a bunch of delalloc bytes starting at start */
1262	delalloc_start = *start;
1263	delalloc_end = 0;
1264	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1265				    max_bytes, &cached_state);
1266	if (!found || delalloc_end <= *start) {
1267		*start = delalloc_start;
1268		*end = delalloc_end;
1269		free_extent_state(cached_state);
1270		return found;
1271	}
1272
1273	/*
1274	 * start comes from the offset of locked_page.  We have to lock
1275	 * pages in order, so we can't process delalloc bytes before
1276	 * locked_page
1277	 */
1278	if (delalloc_start < *start)
1279		delalloc_start = *start;
1280
1281	/*
1282	 * make sure to limit the number of pages we try to lock down
1283	 * if we're looping.
1284	 */
1285	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1286		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1287
1288	/* step two, lock all the pages after the page that has start */
1289	ret = lock_delalloc_pages(inode, locked_page,
1290				  delalloc_start, delalloc_end);
1291	if (ret == -EAGAIN) {
1292		/* some of the pages are gone, lets avoid looping by
1293		 * shortening the size of the delalloc range we're searching
1294		 */
1295		free_extent_state(cached_state);
1296		if (!loops) {
1297			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1298			max_bytes = PAGE_CACHE_SIZE - offset;
1299			loops = 1;
1300			goto again;
1301		} else {
1302			found = 0;
1303			goto out_failed;
1304		}
1305	}
1306	BUG_ON(ret);
1307
1308	/* step three, lock the state bits for the whole range */
1309	lock_extent_bits(tree, delalloc_start, delalloc_end,
1310			 0, &cached_state, GFP_NOFS);
1311
1312	/* then test to make sure it is all still delalloc */
1313	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1314			     EXTENT_DELALLOC, 1, cached_state);
1315	if (!ret) {
1316		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1317				     &cached_state, GFP_NOFS);
1318		__unlock_for_delalloc(inode, locked_page,
1319			      delalloc_start, delalloc_end);
1320		cond_resched();
1321		goto again;
1322	}
1323	free_extent_state(cached_state);
1324	*start = delalloc_start;
1325	*end = delalloc_end;
1326out_failed:
1327	return found;
1328}
1329
1330int extent_clear_unlock_delalloc(struct inode *inode,
1331				struct extent_io_tree *tree,
1332				u64 start, u64 end, struct page *locked_page,
1333				unsigned long op)
1334{
1335	int ret;
1336	struct page *pages[16];
1337	unsigned long index = start >> PAGE_CACHE_SHIFT;
1338	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1339	unsigned long nr_pages = end_index - index + 1;
1340	int i;
1341	int clear_bits = 0;
1342
1343	if (op & EXTENT_CLEAR_UNLOCK)
1344		clear_bits |= EXTENT_LOCKED;
1345	if (op & EXTENT_CLEAR_DIRTY)
1346		clear_bits |= EXTENT_DIRTY;
1347
1348	if (op & EXTENT_CLEAR_DELALLOC)
1349		clear_bits |= EXTENT_DELALLOC;
1350
1351	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1352	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1353		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1354		    EXTENT_SET_PRIVATE2)))
1355		return 0;
1356
1357	while (nr_pages > 0) {
1358		ret = find_get_pages_contig(inode->i_mapping, index,
1359				     min_t(unsigned long,
1360				     nr_pages, ARRAY_SIZE(pages)), pages);
1361		for (i = 0; i < ret; i++) {
1362
1363			if (op & EXTENT_SET_PRIVATE2)
1364				SetPagePrivate2(pages[i]);
1365
1366			if (pages[i] == locked_page) {
1367				page_cache_release(pages[i]);
1368				continue;
1369			}
1370			if (op & EXTENT_CLEAR_DIRTY)
1371				clear_page_dirty_for_io(pages[i]);
1372			if (op & EXTENT_SET_WRITEBACK)
1373				set_page_writeback(pages[i]);
1374			if (op & EXTENT_END_WRITEBACK)
1375				end_page_writeback(pages[i]);
1376			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1377				unlock_page(pages[i]);
1378			page_cache_release(pages[i]);
1379		}
1380		nr_pages -= ret;
1381		index += ret;
1382		cond_resched();
1383	}
1384	return 0;
1385}
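/*
 * Illustrative call (a hypothetical error path): clearing the delalloc
 * state and finishing the pages in one pass could look like
 *
 *	extent_clear_unlock_delalloc(inode, tree, start, end, locked_page,
 *				     EXTENT_CLEAR_UNLOCK_PAGE |
 *				     EXTENT_CLEAR_UNLOCK |
 *				     EXTENT_CLEAR_DELALLOC |
 *				     EXTENT_CLEAR_DIRTY |
 *				     EXTENT_SET_WRITEBACK |
 *				     EXTENT_END_WRITEBACK);
 *
 * locked_page itself is only released, never unlocked, by this helper.
 */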
1386
1387/*
1388 * count the number of bytes in the tree that have a given bit(s)
1389 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1390 * cached.  The total number found is returned.
1391 */
1392u64 count_range_bits(struct extent_io_tree *tree,
1393		     u64 *start, u64 search_end, u64 max_bytes,
1394		     unsigned long bits, int contig)
1395{
1396	struct rb_node *node;
1397	struct extent_state *state;
1398	u64 cur_start = *start;
1399	u64 total_bytes = 0;
1400	u64 last = 0;
1401	int found = 0;
1402
1403	if (search_end <= cur_start) {
1404		WARN_ON(1);
1405		return 0;
1406	}
1407
1408	spin_lock(&tree->lock);
1409	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1410		total_bytes = tree->dirty_bytes;
1411		goto out;
1412	}
1413	/*
1414	 * this search will find all the extents that end after
1415	 * our range starts.
1416	 */
1417	node = tree_search(tree, cur_start);
1418	if (!node)
1419		goto out;
1420
1421	while (1) {
1422		state = rb_entry(node, struct extent_state, rb_node);
1423		if (state->start > search_end)
1424			break;
1425		if (contig && found && state->start > last + 1)
1426			break;
1427		if (state->end >= cur_start && (state->state & bits) == bits) {
1428			total_bytes += min(search_end, state->end) + 1 -
1429				       max(cur_start, state->start);
1430			if (total_bytes >= max_bytes)
1431				break;
1432			if (!found) {
1433				*start = max(cur_start, state->start);
1434				found = 1;
1435			}
1436			last = state->end;
1437		} else if (contig && found) {
1438			break;
1439		}
1440		node = rb_next(node);
1441		if (!node)
1442			break;
1443	}
1444out:
1445	spin_unlock(&tree->lock);
1446	return total_bytes;
1447}
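/*
 * Example (illustrative caller): counting up to 1MB of dirty bytes from
 * the start of the file
 *
 *	u64 range_start = 0;
 *	u64 bytes = count_range_bits(tree, &range_start, (u64)-1,
 *				     1024 * 1024, EXTENT_DIRTY, 0);
 *
 * hits the cached total above: with cur_start == 0 and bits ==
 * EXTENT_DIRTY the function simply returns tree->dirty_bytes.
 */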
1448
1449/*
1450 * set the private field for a given byte offset in the tree.  If there isn't
1451 * an extent_state there already, this does nothing.
1452 */
1453int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1454{
1455	struct rb_node *node;
1456	struct extent_state *state;
1457	int ret = 0;
1458
1459	spin_lock(&tree->lock);
1460	/*
1461	 * this search will find all the extents that end after
1462	 * our range starts.
1463	 */
1464	node = tree_search(tree, start);
1465	if (!node) {
1466		ret = -ENOENT;
1467		goto out;
1468	}
1469	state = rb_entry(node, struct extent_state, rb_node);
1470	if (state->start != start) {
1471		ret = -ENOENT;
1472		goto out;
1473	}
1474	state->private = private;
1475out:
1476	spin_unlock(&tree->lock);
1477	return ret;
1478}
1479
1480int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1481{
1482	struct rb_node *node;
1483	struct extent_state *state;
1484	int ret = 0;
1485
1486	spin_lock(&tree->lock);
1487	/*
1488	 * this search will find all the extents that end after
1489	 * our range starts.
1490	 */
1491	node = tree_search(tree, start);
1492	if (!node) {
1493		ret = -ENOENT;
1494		goto out;
1495	}
1496	state = rb_entry(node, struct extent_state, rb_node);
1497	if (state->start != start) {
1498		ret = -ENOENT;
1499		goto out;
1500	}
1501	*private = state->private;
1502out:
1503	spin_unlock(&tree->lock);
1504	return ret;
1505}
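/*
 * The private field is opaque to this code; btrfs uses it, for example,
 * to stash the expected checksum of a block while a read is in flight so
 * the read end_io hook can fetch it back with get_state_private() and
 * verify the data against it.
 */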
1506
1507/*
1508 * searches a range in the state tree for a given mask.
1509 * If 'filled' == 1, this returns 1 only if every extent in the range
1510 * has the bits set.  Otherwise, 1 is returned if any bit in the
1511 * range is found set.
1512 */
1513int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1514		   int bits, int filled, struct extent_state *cached)
1515{
1516	struct extent_state *state = NULL;
1517	struct rb_node *node;
1518	int bitset = 0;
1519
1520	spin_lock(&tree->lock);
1521	if (cached && cached->tree && cached->start <= start &&
1522	    cached->end > start)
1523		node = &cached->rb_node;
1524	else
1525		node = tree_search(tree, start);
1526	while (node && start <= end) {
1527		state = rb_entry(node, struct extent_state, rb_node);
1528
1529		if (filled && state->start > start) {
1530			bitset = 0;
1531			break;
1532		}
1533
1534		if (state->start > end)
1535			break;
1536
1537		if (state->state & bits) {
1538			bitset = 1;
1539			if (!filled)
1540				break;
1541		} else if (filled) {
1542			bitset = 0;
1543			break;
1544		}
1545
1546		if (state->end == (u64)-1)
1547			break;
1548
1549		start = state->end + 1;
1550		if (start > end)
1551			break;
1552		node = rb_next(node);
1553		if (!node) {
1554			if (filled)
1555				bitset = 0;
1556			break;
1557		}
1558	}
1559	spin_unlock(&tree->lock);
1560	return bitset;
1561}
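/*
 * Illustrative example of the 'filled' semantics: if only the first half
 * of a page-sized range carries EXTENT_UPTODATE, then
 *
 *	test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL) == 0
 *	test_range_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL) == 1
 *
 * i.e. filled == 1 demands that every byte of [start, end] is covered by
 * states carrying the bits, while filled == 0 is satisfied by any overlap.
 */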
1562
1563/*
1564 * helper function to set a given page up to date if all the
1565 * extents in the tree for that page are up to date
1566 */
1567static int check_page_uptodate(struct extent_io_tree *tree,
1568			       struct page *page)
1569{
1570	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1571	u64 end = start + PAGE_CACHE_SIZE - 1;
1572	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1573		SetPageUptodate(page);
1574	return 0;
1575}
1576
1577/*
1578 * helper function to unlock a page if all the extents in the tree
1579 * for that page are unlocked
1580 */
1581static int check_page_locked(struct extent_io_tree *tree,
1582			     struct page *page)
1583{
1584	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1585	u64 end = start + PAGE_CACHE_SIZE - 1;
1586	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1587		unlock_page(page);
1588	return 0;
1589}
1590
1591/*
1592 * helper function to end page writeback if all the extents
1593 * in the tree for that page are done with writeback
1594 */
1595static int check_page_writeback(struct extent_io_tree *tree,
1596			     struct page *page)
1597{
1598	end_page_writeback(page);
1599	return 0;
1600}
1601
1602/* lots and lots of room for performance fixes in the end_bio funcs */
1603
1604/*
1605 * after a writepage IO is done, we need to:
1606 * clear the uptodate bits on error
1607 * clear the writeback bits in the extent tree for this IO
1608 * end_page_writeback if the page has no more pending IO
1609 *
1610 * Scheduling is not allowed, so the extent state tree is expected
1611 * to have one and only one object corresponding to this IO.
1612 */
1613static void end_bio_extent_writepage(struct bio *bio, int err)
1614{
1615	int uptodate = err == 0;
1616	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1617	struct extent_io_tree *tree;
1618	u64 start;
1619	u64 end;
1620	int whole_page;
1621	int ret;
1622
1623	do {
1624		struct page *page = bvec->bv_page;
1625		tree = &BTRFS_I(page->mapping->host)->io_tree;
1626
1627		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1628			 bvec->bv_offset;
1629		end = start + bvec->bv_len - 1;
1630
1631		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1632			whole_page = 1;
1633		else
1634			whole_page = 0;
1635
1636		if (--bvec >= bio->bi_io_vec)
1637			prefetchw(&bvec->bv_page->flags);
1638		if (tree->ops && tree->ops->writepage_end_io_hook) {
1639			ret = tree->ops->writepage_end_io_hook(page, start,
1640						       end, NULL, uptodate);
1641			if (ret)
1642				uptodate = 0;
1643		}
1644
1645		if (!uptodate && tree->ops &&
1646		    tree->ops->writepage_io_failed_hook) {
1647			ret = tree->ops->writepage_io_failed_hook(bio, page,
1648							 start, end, NULL);
1649			if (ret == 0) {
1650				uptodate = (err == 0);
1651				continue;
1652			}
1653		}
1654
1655		if (!uptodate) {
1656			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1657			ClearPageUptodate(page);
1658			SetPageError(page);
1659		}
1660
1661		if (whole_page)
1662			end_page_writeback(page);
1663		else
1664			check_page_writeback(tree, page);
1665	} while (bvec >= bio->bi_io_vec);
1666
1667	bio_put(bio);
1668}
1669
1670/*
1671 * after a readpage IO is done, we need to:
1672 * clear the uptodate bits on error
1673 * set the uptodate bits if things worked
1674 * set the page up to date if all extents in the tree are uptodate
1675 * clear the lock bit in the extent tree
1676 * unlock the page if there are no other extents locked for it
1677 *
1678 * Scheduling is not allowed, so the extent state tree is expected
1679 * to have one and only one object corresponding to this IO.
1680 */
1681static void end_bio_extent_readpage(struct bio *bio, int err)
1682{
1683	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1684	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1685	struct bio_vec *bvec = bio->bi_io_vec;
1686	struct extent_io_tree *tree;
1687	u64 start;
1688	u64 end;
1689	int whole_page;
1690	int ret;
1691
1692	if (err)
1693		uptodate = 0;
1694
1695	do {
1696		struct page *page = bvec->bv_page;
1697		struct extent_state *cached = NULL;
1698		struct extent_state *state;
1699
1700		tree = &BTRFS_I(page->mapping->host)->io_tree;
1701
1702		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1703			bvec->bv_offset;
1704		end = start + bvec->bv_len - 1;
1705
1706		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1707			whole_page = 1;
1708		else
1709			whole_page = 0;
1710
1711		if (++bvec <= bvec_end)
1712			prefetchw(&bvec->bv_page->flags);
1713
1714		spin_lock(&tree->lock);
1715		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1716		if (state && state->start == start) {
1717			/*
1718			 * take a reference on the state, unlock will drop
1719			 * the ref
1720			 */
1721			cache_state(state, &cached);
1722		}
1723		spin_unlock(&tree->lock);
1724
1725		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1726			ret = tree->ops->readpage_end_io_hook(page, start, end,
1727							      state);
1728			if (ret)
1729				uptodate = 0;
1730		}
1731		if (!uptodate && tree->ops &&
1732		    tree->ops->readpage_io_failed_hook) {
1733			ret = tree->ops->readpage_io_failed_hook(bio, page,
1734							 start, end, NULL);
1735			if (ret == 0) {
1736				uptodate =
1737					test_bit(BIO_UPTODATE, &bio->bi_flags);
1738				if (err)
1739					uptodate = 0;
1740				uncache_state(&cached);
1741				continue;
1742			}
1743		}
1744
1745		if (uptodate) {
1746			set_extent_uptodate(tree, start, end, &cached,
1747					    GFP_ATOMIC);
1748		}
1749		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1750
1751		if (whole_page) {
1752			if (uptodate) {
1753				SetPageUptodate(page);
1754			} else {
1755				ClearPageUptodate(page);
1756				SetPageError(page);
1757			}
1758			unlock_page(page);
1759		} else {
1760			if (uptodate) {
1761				check_page_uptodate(tree, page);
1762			} else {
1763				ClearPageUptodate(page);
1764				SetPageError(page);
1765			}
1766			check_page_locked(tree, page);
1767		}
1768	} while (bvec <= bvec_end);
1769
1770	bio_put(bio);
1771}
1772
1773struct bio *
1774btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1775		gfp_t gfp_flags)
1776{
1777	struct bio *bio;
1778
1779	bio = bio_alloc(gfp_flags, nr_vecs);
1780
1781	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1782		while (!bio && (nr_vecs /= 2))
1783			bio = bio_alloc(gfp_flags, nr_vecs);
1784	}
1785
1786	if (bio) {
1787		bio->bi_size = 0;
1788		bio->bi_bdev = bdev;
1789		bio->bi_sector = first_sector;
1790	}
1791	return bio;
1792}
1793
1794static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1795			  unsigned long bio_flags)
1796{
1797	int ret = 0;
1798	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1799	struct page *page = bvec->bv_page;
1800	struct extent_io_tree *tree = bio->bi_private;
1801	u64 start;
1802
1803	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1804
1805	bio->bi_private = NULL;
1806
1807	bio_get(bio);
1808
1809	if (tree->ops && tree->ops->submit_bio_hook)
1810		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1811					   mirror_num, bio_flags, start);
1812	else
1813		submit_bio(rw, bio);
1814	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815		ret = -EOPNOTSUPP;
1816	bio_put(bio);
1817	return ret;
1818}
1819
1820static int submit_extent_page(int rw, struct extent_io_tree *tree,
1821			      struct page *page, sector_t sector,
1822			      size_t size, unsigned long offset,
1823			      struct block_device *bdev,
1824			      struct bio **bio_ret,
1825			      unsigned long max_pages,
1826			      bio_end_io_t end_io_func,
1827			      int mirror_num,
1828			      unsigned long prev_bio_flags,
1829			      unsigned long bio_flags)
1830{
1831	int ret = 0;
1832	struct bio *bio;
1833	int nr;
1834	int contig = 0;
1835	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1836	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1837	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1838
1839	if (bio_ret && *bio_ret) {
1840		bio = *bio_ret;
1841		if (old_compressed)
1842			contig = bio->bi_sector == sector;
1843		else
1844			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1845				sector;
1846
1847		if (prev_bio_flags != bio_flags || !contig ||
1848		    (tree->ops && tree->ops->merge_bio_hook &&
1849		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1850					       bio_flags)) ||
1851		    bio_add_page(bio, page, page_size, offset) < page_size) {
1852			ret = submit_one_bio(rw, bio, mirror_num,
1853					     prev_bio_flags);
1854			bio = NULL;
1855		} else {
1856			return 0;
1857		}
1858	}
1859	if (this_compressed)
1860		nr = BIO_MAX_PAGES;
1861	else
1862		nr = bio_get_nr_vecs(bdev);
1863
1864	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1865	if (!bio)
1866		return -ENOMEM;
1867
1868	bio_add_page(bio, page, page_size, offset);
1869	bio->bi_end_io = end_io_func;
1870	bio->bi_private = tree;
1871
1872	if (bio_ret)
1873		*bio_ret = bio;
1874	else
1875		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1876
1877	return ret;
1878}
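/*
 * Descriptive recap of the merge logic above: a page is appended to the
 * bio cached in *bio_ret only if the bio flags match, the sector is
 * contiguous with the bio's current end (or equal to its start for
 * compressed bios), the optional merge_bio_hook accepts it, and
 * bio_add_page() can still take the whole chunk; otherwise the old bio is
 * submitted and a fresh one is started for this page.
 */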
1879
1880void set_page_extent_mapped(struct page *page)
1881{
1882	if (!PagePrivate(page)) {
1883		SetPagePrivate(page);
1884		page_cache_get(page);
1885		set_page_private(page, EXTENT_PAGE_PRIVATE);
1886	}
1887}
1888
1889static void set_page_extent_head(struct page *page, unsigned long len)
1890{
1891	WARN_ON(!PagePrivate(page));
1892	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1893}
1894
1895/*
1896 * basic readpage implementation.  Locked extent state structs are inserted
1897 * into the tree that are removed when the IO is done (by the end_io
1898 * handlers)
1899 */
1900static int __extent_read_full_page(struct extent_io_tree *tree,
1901				   struct page *page,
1902				   get_extent_t *get_extent,
1903				   struct bio **bio, int mirror_num,
1904				   unsigned long *bio_flags)
1905{
1906	struct inode *inode = page->mapping->host;
1907	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1908	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1909	u64 end;
1910	u64 cur = start;
1911	u64 extent_offset;
1912	u64 last_byte = i_size_read(inode);
1913	u64 block_start;
1914	u64 cur_end;
1915	sector_t sector;
1916	struct extent_map *em;
1917	struct block_device *bdev;
1918	struct btrfs_ordered_extent *ordered;
1919	int ret;
1920	int nr = 0;
1921	size_t pg_offset = 0;
1922	size_t iosize;
1923	size_t disk_io_size;
1924	size_t blocksize = inode->i_sb->s_blocksize;
1925	unsigned long this_bio_flag = 0;
1926
1927	set_page_extent_mapped(page);
1928
1929	if (!PageUptodate(page)) {
1930		if (cleancache_get_page(page) == 0) {
1931			BUG_ON(blocksize != PAGE_SIZE);
1932			goto out;
1933		}
1934	}
1935
1936	end = page_end;
1937	while (1) {
1938		lock_extent(tree, start, end, GFP_NOFS);
1939		ordered = btrfs_lookup_ordered_extent(inode, start);
1940		if (!ordered)
1941			break;
1942		unlock_extent(tree, start, end, GFP_NOFS);
1943		btrfs_start_ordered_extent(inode, ordered, 1);
1944		btrfs_put_ordered_extent(ordered);
1945	}
1946
1947	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1948		char *userpage;
1949		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1950
1951		if (zero_offset) {
1952			iosize = PAGE_CACHE_SIZE - zero_offset;
1953			userpage = kmap_atomic(page, KM_USER0);
1954			memset(userpage + zero_offset, 0, iosize);
1955			flush_dcache_page(page);
1956			kunmap_atomic(userpage, KM_USER0);
1957		}
1958	}
1959	while (cur <= end) {
1960		if (cur >= last_byte) {
1961			char *userpage;
1962			struct extent_state *cached = NULL;
1963
1964			iosize = PAGE_CACHE_SIZE - pg_offset;
1965			userpage = kmap_atomic(page, KM_USER0);
1966			memset(userpage + pg_offset, 0, iosize);
1967			flush_dcache_page(page);
1968			kunmap_atomic(userpage, KM_USER0);
1969			set_extent_uptodate(tree, cur, cur + iosize - 1,
1970					    &cached, GFP_NOFS);
1971			unlock_extent_cached(tree, cur, cur + iosize - 1,
1972					     &cached, GFP_NOFS);
1973			break;
1974		}
1975		em = get_extent(inode, page, pg_offset, cur,
1976				end - cur + 1, 0);
1977		if (IS_ERR_OR_NULL(em)) {
1978			SetPageError(page);
1979			unlock_extent(tree, cur, end, GFP_NOFS);
1980			break;
1981		}
1982		extent_offset = cur - em->start;
1983		BUG_ON(extent_map_end(em) <= cur);
1984		BUG_ON(end < cur);
1985
1986		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1987			this_bio_flag = EXTENT_BIO_COMPRESSED;
1988			extent_set_compress_type(&this_bio_flag,
1989						 em->compress_type);
1990		}
1991
1992		iosize = min(extent_map_end(em) - cur, end - cur + 1);
1993		cur_end = min(extent_map_end(em) - 1, end);
1994		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1995		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
1996			disk_io_size = em->block_len;
1997			sector = em->block_start >> 9;
1998		} else {
1999			sector = (em->block_start + extent_offset) >> 9;
2000			disk_io_size = iosize;
2001		}
2002		bdev = em->bdev;
2003		block_start = em->block_start;
2004		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2005			block_start = EXTENT_MAP_HOLE;
2006		free_extent_map(em);
2007		em = NULL;
2008
2009		/* we've found a hole, just zero and go on */
2010		if (block_start == EXTENT_MAP_HOLE) {
2011			char *userpage;
2012			struct extent_state *cached = NULL;
2013
2014			userpage = kmap_atomic(page, KM_USER0);
2015			memset(userpage + pg_offset, 0, iosize);
2016			flush_dcache_page(page);
2017			kunmap_atomic(userpage, KM_USER0);
2018
2019			set_extent_uptodate(tree, cur, cur + iosize - 1,
2020					    &cached, GFP_NOFS);
2021			unlock_extent_cached(tree, cur, cur + iosize - 1,
2022			                     &cached, GFP_NOFS);
2023			cur = cur + iosize;
2024			pg_offset += iosize;
2025			continue;
2026		}
2027		/* the get_extent function already copied into the page */
2028		if (test_range_bit(tree, cur, cur_end,
2029				   EXTENT_UPTODATE, 1, NULL)) {
2030			check_page_uptodate(tree, page);
2031			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2032			cur = cur + iosize;
2033			pg_offset += iosize;
2034			continue;
2035		}
2036		/* we have an inline extent but it didn't get marked up
2037		 * to date.  Error out
2038		 */
2039		if (block_start == EXTENT_MAP_INLINE) {
2040			SetPageError(page);
2041			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2042			cur = cur + iosize;
2043			pg_offset += iosize;
2044			continue;
2045		}
2046
2047		ret = 0;
2048		if (tree->ops && tree->ops->readpage_io_hook) {
2049			ret = tree->ops->readpage_io_hook(page, cur,
2050							  cur + iosize - 1);
2051		}
2052		if (!ret) {
2053			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2054			pnr -= page->index;
2055			ret = submit_extent_page(READ, tree, page,
2056					 sector, disk_io_size, pg_offset,
2057					 bdev, bio, pnr,
2058					 end_bio_extent_readpage, mirror_num,
2059					 *bio_flags,
2060					 this_bio_flag);
2061			nr++;
2062			*bio_flags = this_bio_flag;
2063		}
2064		if (ret)
2065			SetPageError(page);
2066		cur = cur + iosize;
2067		pg_offset += iosize;
2068	}
2069out:
2070	if (!nr) {
2071		if (!PageError(page))
2072			SetPageUptodate(page);
2073		unlock_page(page);
2074	}
2075	return 0;
2076}
2077
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079			    get_extent_t *get_extent)
2080{
2081	struct bio *bio = NULL;
2082	unsigned long bio_flags = 0;
2083	int ret;
2084
2085	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2086				      &bio_flags);
2087	if (bio)
2088		ret = submit_one_bio(READ, bio, 0, bio_flags);
2089	return ret;
2090}
2091
2092static noinline void update_nr_written(struct page *page,
2093				      struct writeback_control *wbc,
2094				      unsigned long nr_written)
2095{
2096	wbc->nr_to_write -= nr_written;
2097	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2098	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2099		page->mapping->writeback_index = page->index + nr_written;
2100}
2101
2102/*
2103 * the writepage semantics are similar to regular writepage.  extent
2104 * records are inserted to lock ranges in the tree, and as dirty areas
2105 * are found, they are marked writeback.  Then the lock bits are removed
2106 * and the end_io handler clears the writeback ranges
2107 */
2108static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2109			      void *data)
2110{
2111	struct inode *inode = page->mapping->host;
2112	struct extent_page_data *epd = data;
2113	struct extent_io_tree *tree = epd->tree;
2114	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2115	u64 delalloc_start;
2116	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2117	u64 end;
2118	u64 cur = start;
2119	u64 extent_offset;
2120	u64 last_byte = i_size_read(inode);
2121	u64 block_start;
2122	u64 iosize;
2123	sector_t sector;
2124	struct extent_state *cached_state = NULL;
2125	struct extent_map *em;
2126	struct block_device *bdev;
2127	int ret;
2128	int nr = 0;
2129	size_t pg_offset = 0;
2130	size_t blocksize;
2131	loff_t i_size = i_size_read(inode);
2132	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2133	u64 nr_delalloc;
2134	u64 delalloc_end;
2135	int page_started;
2136	int compressed;
2137	int write_flags;
2138	unsigned long nr_written = 0;
2139
2140	if (wbc->sync_mode == WB_SYNC_ALL)
2141		write_flags = WRITE_SYNC;
2142	else
2143		write_flags = WRITE;
2144
2145	trace___extent_writepage(page, inode, wbc);
2146
2147	WARN_ON(!PageLocked(page));
2148	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149	if (page->index > end_index ||
2150	   (page->index == end_index && !pg_offset)) {
2151		page->mapping->a_ops->invalidatepage(page, 0);
2152		unlock_page(page);
2153		return 0;
2154	}
2155
2156	if (page->index == end_index) {
2157		char *userpage;
2158
2159		userpage = kmap_atomic(page, KM_USER0);
2160		memset(userpage + pg_offset, 0,
2161		       PAGE_CACHE_SIZE - pg_offset);
2162		kunmap_atomic(userpage, KM_USER0);
2163		flush_dcache_page(page);
2164	}
2165	pg_offset = 0;
2166
2167	set_page_extent_mapped(page);
2168
2169	delalloc_start = start;
2170	delalloc_end = 0;
2171	page_started = 0;
2172	if (!epd->extent_locked) {
2173		u64 delalloc_to_write = 0;
2174		/*
2175		 * make sure the wbc mapping index is at least updated
2176		 * to this page.
2177		 */
2178		update_nr_written(page, wbc, 0);
2179
2180		while (delalloc_end < page_end) {
2181			nr_delalloc = find_lock_delalloc_range(inode, tree,
2182						       page,
2183						       &delalloc_start,
2184						       &delalloc_end,
2185						       128 * 1024 * 1024);
2186			if (nr_delalloc == 0) {
2187				delalloc_start = delalloc_end + 1;
2188				continue;
2189			}
2190			tree->ops->fill_delalloc(inode, page, delalloc_start,
2191						 delalloc_end, &page_started,
2192						 &nr_written);
2193			/*
2194			 * delalloc_end is already one less than the total
2195			 * length, so we don't subtract one from
2196			 * PAGE_CACHE_SIZE
2197			 */
2198			delalloc_to_write += (delalloc_end - delalloc_start +
2199					      PAGE_CACHE_SIZE) >>
2200					      PAGE_CACHE_SHIFT;
2201			delalloc_start = delalloc_end + 1;
2202		}
2203		if (wbc->nr_to_write < delalloc_to_write) {
2204			int thresh = 8192;
2205
2206			if (delalloc_to_write < thresh * 2)
2207				thresh = delalloc_to_write;
2208			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2209						 thresh);
2210		}
2211
2212		/* did the fill delalloc function already unlock and start
2213		 * the IO?
2214		 */
2215		if (page_started) {
2216			ret = 0;
2217			/*
2218			 * we've unlocked the page, so we can't update
2219			 * the mapping's writeback index, just update
2220			 * nr_to_write.
2221			 */
2222			wbc->nr_to_write -= nr_written;
2223			goto done_unlocked;
2224		}
2225	}
2226	if (tree->ops && tree->ops->writepage_start_hook) {
2227		ret = tree->ops->writepage_start_hook(page, start,
2228						      page_end);
2229		if (ret == -EAGAIN) {
2230			redirty_page_for_writepage(wbc, page);
2231			update_nr_written(page, wbc, nr_written);
2232			unlock_page(page);
2233			ret = 0;
2234			goto done_unlocked;
2235		}
2236	}
2237
2238	/*
2239	 * we don't want to touch the inode after unlocking the page,
2240	 * so we update the mapping writeback index now
2241	 */
2242	update_nr_written(page, wbc, nr_written + 1);
2243
2244	end = page_end;
2245	if (last_byte <= start) {
2246		if (tree->ops && tree->ops->writepage_end_io_hook)
2247			tree->ops->writepage_end_io_hook(page, start,
2248							 page_end, NULL, 1);
2249		goto done;
2250	}
2251
2252	blocksize = inode->i_sb->s_blocksize;
2253
2254	while (cur <= end) {
2255		if (cur >= last_byte) {
2256			if (tree->ops && tree->ops->writepage_end_io_hook)
2257				tree->ops->writepage_end_io_hook(page, cur,
2258							 page_end, NULL, 1);
2259			break;
2260		}
2261		em = epd->get_extent(inode, page, pg_offset, cur,
2262				     end - cur + 1, 1);
2263		if (IS_ERR_OR_NULL(em)) {
2264			SetPageError(page);
2265			break;
2266		}
2267
2268		extent_offset = cur - em->start;
2269		BUG_ON(extent_map_end(em) <= cur);
2270		BUG_ON(end < cur);
2271		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2272		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2273		sector = (em->block_start + extent_offset) >> 9;
2274		bdev = em->bdev;
2275		block_start = em->block_start;
2276		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2277		free_extent_map(em);
2278		em = NULL;
2279
2280		/*
2281		 * compressed and inline extents are written through other
2282		 * paths in the FS
2283		 */
2284		if (compressed || block_start == EXTENT_MAP_HOLE ||
2285		    block_start == EXTENT_MAP_INLINE) {
2286			/*
2287			 * end_io notification does not happen here for
2288			 * compressed extents
2289			 */
2290			if (!compressed && tree->ops &&
2291			    tree->ops->writepage_end_io_hook)
2292				tree->ops->writepage_end_io_hook(page, cur,
2293							 cur + iosize - 1,
2294							 NULL, 1);
2295			else if (compressed) {
2296				/* we don't want to end_page_writeback on
2297				 * a compressed extent.  this happens
2298				 * elsewhere
2299				 */
2300				nr++;
2301			}
2302
2303			cur += iosize;
2304			pg_offset += iosize;
2305			continue;
2306		}
2307		/* leave this out until we have a page_mkwrite call */
2308		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2309				   EXTENT_DIRTY, 0, NULL)) {
2310			cur = cur + iosize;
2311			pg_offset += iosize;
2312			continue;
2313		}
2314
2315		if (tree->ops && tree->ops->writepage_io_hook) {
2316			ret = tree->ops->writepage_io_hook(page, cur,
2317						cur + iosize - 1);
2318		} else {
2319			ret = 0;
2320		}
2321		if (ret) {
2322			SetPageError(page);
2323		} else {
2324			unsigned long max_nr = end_index + 1;
2325
2326			set_range_writeback(tree, cur, cur + iosize - 1);
2327			if (!PageWriteback(page)) {
2328				printk(KERN_ERR "btrfs warning page %lu not "
2329				       "writeback, cur %llu end %llu\n",
2330				       page->index, (unsigned long long)cur,
2331				       (unsigned long long)end);
2332			}
2333
2334			ret = submit_extent_page(write_flags, tree, page,
2335						 sector, iosize, pg_offset,
2336						 bdev, &epd->bio, max_nr,
2337						 end_bio_extent_writepage,
2338						 0, 0, 0);
2339			if (ret)
2340				SetPageError(page);
2341		}
2342		cur = cur + iosize;
2343		pg_offset += iosize;
2344		nr++;
2345	}
2346done:
2347	if (nr == 0) {
2348		/* make sure the mapping tag for page dirty gets cleared */
2349		set_page_writeback(page);
2350		end_page_writeback(page);
2351	}
2352	unlock_page(page);
2353
2354done_unlocked:
2355
2356	/* drop our reference on any cached states */
2357	free_extent_state(cached_state);
2358	return 0;
2359}
2360
2361/**
2362 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2363 * @mapping: address space structure to write
2364 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2365 * @writepage: function called for each page
2366 * @data: data passed to writepage function
2367 *
2368 * If a page is already under I/O, write_cache_pages() skips it, even
2369 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2370 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2371 * and msync() need to guarantee that all the data which was dirty at the time
2372 * the call was made get new I/O started against them.  If wbc->sync_mode is
2373 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2374 * existing IO to complete.
2375 */
2376static int extent_write_cache_pages(struct extent_io_tree *tree,
2377			     struct address_space *mapping,
2378			     struct writeback_control *wbc,
2379			     writepage_t writepage, void *data,
2380			     void (*flush_fn)(void *))
2381{
2382	int ret = 0;
2383	int done = 0;
2384	int nr_to_write_done = 0;
2385	struct pagevec pvec;
2386	int nr_pages;
2387	pgoff_t index;
2388	pgoff_t end;		/* Inclusive */
2389	int scanned = 0;
2390	int tag;
2391
2392	pagevec_init(&pvec, 0);
2393	if (wbc->range_cyclic) {
2394		index = mapping->writeback_index; /* Start from prev offset */
2395		end = -1;
2396	} else {
2397		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2398		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2399		scanned = 1;
2400	}
2401	if (wbc->sync_mode == WB_SYNC_ALL)
2402		tag = PAGECACHE_TAG_TOWRITE;
2403	else
2404		tag = PAGECACHE_TAG_DIRTY;
2405retry:
2406	if (wbc->sync_mode == WB_SYNC_ALL)
2407		tag_pages_for_writeback(mapping, index, end);
2408	while (!done && !nr_to_write_done && (index <= end) &&
2409	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2410			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2411		unsigned i;
2412
2413		scanned = 1;
2414		for (i = 0; i < nr_pages; i++) {
2415			struct page *page = pvec.pages[i];
2416
2417			/*
2418			 * At this point we hold neither mapping->tree_lock nor
2419			 * lock on the page itself: the page may be truncated or
2420			 * invalidated (changing page->mapping to NULL), or even
2421			 * swizzled back from swapper_space to tmpfs file
2422			 * mapping
2423			 */
2424			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2425				tree->ops->write_cache_pages_lock_hook(page);
2426			else
2427				lock_page(page);
2428
2429			if (unlikely(page->mapping != mapping)) {
2430				unlock_page(page);
2431				continue;
2432			}
2433
2434			if (!wbc->range_cyclic && page->index > end) {
2435				done = 1;
2436				unlock_page(page);
2437				continue;
2438			}
2439
2440			if (wbc->sync_mode != WB_SYNC_NONE) {
2441				if (PageWriteback(page))
2442					flush_fn(data);
2443				wait_on_page_writeback(page);
2444			}
2445
2446			if (PageWriteback(page) ||
2447			    !clear_page_dirty_for_io(page)) {
2448				unlock_page(page);
2449				continue;
2450			}
2451
2452			ret = (*writepage)(page, wbc, data);
2453
2454			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2455				unlock_page(page);
2456				ret = 0;
2457			}
2458			if (ret)
2459				done = 1;
2460
2461			/*
2462			 * the filesystem may choose to bump up nr_to_write.
2463			 * We have to make sure to honor the new nr_to_write
2464			 * at any time
2465			 */
2466			nr_to_write_done = wbc->nr_to_write <= 0;
2467		}
2468		pagevec_release(&pvec);
2469		cond_resched();
2470	}
2471	if (!scanned && !done) {
2472		/*
2473		 * We hit the last page and there is more work to be done: wrap
2474		 * back to the start of the file
2475		 */
2476		scanned = 1;
2477		index = 0;
2478		goto retry;
2479	}
2480	return ret;
2481}
2482
2483static void flush_epd_write_bio(struct extent_page_data *epd)
2484{
2485	if (epd->bio) {
2486		if (epd->sync_io)
2487			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2488		else
2489			submit_one_bio(WRITE, epd->bio, 0, 0);
2490		epd->bio = NULL;
2491	}
2492}
2493
2494static noinline void flush_write_bio(void *data)
2495{
2496	struct extent_page_data *epd = data;
2497	flush_epd_write_bio(epd);
2498}
2499
2500int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2501			  get_extent_t *get_extent,
2502			  struct writeback_control *wbc)
2503{
2504	int ret;
2505	struct extent_page_data epd = {
2506		.bio = NULL,
2507		.tree = tree,
2508		.get_extent = get_extent,
2509		.extent_locked = 0,
2510		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2511	};
2512
2513	ret = __extent_writepage(page, wbc, &epd);
2514
2515	flush_epd_write_bio(&epd);
2516	return ret;
2517}
2518
2519int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2520			      u64 start, u64 end, get_extent_t *get_extent,
2521			      int mode)
2522{
2523	int ret = 0;
2524	struct address_space *mapping = inode->i_mapping;
2525	struct page *page;
2526	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2527		PAGE_CACHE_SHIFT;
2528
2529	struct extent_page_data epd = {
2530		.bio = NULL,
2531		.tree = tree,
2532		.get_extent = get_extent,
2533		.extent_locked = 1,
2534		.sync_io = mode == WB_SYNC_ALL,
2535	};
2536	struct writeback_control wbc_writepages = {
2537		.sync_mode	= mode,
2538		.nr_to_write	= nr_pages * 2,
2539		.range_start	= start,
2540		.range_end	= end + 1,
2541	};
2542
2543	while (start <= end) {
2544		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2545		if (clear_page_dirty_for_io(page))
2546			ret = __extent_writepage(page, &wbc_writepages, &epd);
2547		else {
2548			if (tree->ops && tree->ops->writepage_end_io_hook)
2549				tree->ops->writepage_end_io_hook(page, start,
2550						 start + PAGE_CACHE_SIZE - 1,
2551						 NULL, 1);
2552			unlock_page(page);
2553		}
2554		page_cache_release(page);
2555		start += PAGE_CACHE_SIZE;
2556	}
2557
2558	flush_epd_write_bio(&epd);
2559	return ret;
2560}
2561
2562int extent_writepages(struct extent_io_tree *tree,
2563		      struct address_space *mapping,
2564		      get_extent_t *get_extent,
2565		      struct writeback_control *wbc)
2566{
2567	int ret = 0;
2568	struct extent_page_data epd = {
2569		.bio = NULL,
2570		.tree = tree,
2571		.get_extent = get_extent,
2572		.extent_locked = 0,
2573		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2574	};
2575
2576	ret = extent_write_cache_pages(tree, mapping, wbc,
2577				       __extent_writepage, &epd,
2578				       flush_write_bio);
2579	flush_epd_write_bio(&epd);
2580	return ret;
2581}
2582
2583int extent_readpages(struct extent_io_tree *tree,
2584		     struct address_space *mapping,
2585		     struct list_head *pages, unsigned nr_pages,
2586		     get_extent_t get_extent)
2587{
2588	struct bio *bio = NULL;
2589	unsigned page_idx;
2590	unsigned long bio_flags = 0;
2591
2592	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2593		struct page *page = list_entry(pages->prev, struct page, lru);
2594
2595		prefetchw(&page->flags);
2596		list_del(&page->lru);
2597		if (!add_to_page_cache_lru(page, mapping,
2598					page->index, GFP_NOFS)) {
2599			__extent_read_full_page(tree, page, get_extent,
2600						&bio, 0, &bio_flags);
2601		}
2602		page_cache_release(page);
2603	}
2604	BUG_ON(!list_empty(pages));
2605	if (bio)
2606		submit_one_bio(READ, bio, 0, bio_flags);
2607	return 0;
2608}
2609
2610/*
2611 * basic invalidatepage code, this waits on any locked or writeback
2612 * ranges corresponding to the page, and then deletes any extent state
2613 * records from the tree
2614 */
2615int extent_invalidatepage(struct extent_io_tree *tree,
2616			  struct page *page, unsigned long offset)
2617{
2618	struct extent_state *cached_state = NULL;
2619	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2620	u64 end = start + PAGE_CACHE_SIZE - 1;
2621	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2622
2623	start += (offset + blocksize - 1) & ~(blocksize - 1);
2624	if (start > end)
2625		return 0;
2626
2627	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2628	wait_on_page_writeback(page);
2629	clear_extent_bit(tree, start, end,
2630			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2631			 EXTENT_DO_ACCOUNTING,
2632			 1, 1, &cached_state, GFP_NOFS);
2633	return 0;
2634}
2635
2636/*
2637 * a helper for releasepage, this tests for areas of the page that
2638 * are locked or under IO and drops the related state bits if it is safe
2639 * to drop the page.
2640 */
2641int try_release_extent_state(struct extent_map_tree *map,
2642			     struct extent_io_tree *tree, struct page *page,
2643			     gfp_t mask)
2644{
2645	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2646	u64 end = start + PAGE_CACHE_SIZE - 1;
2647	int ret = 1;
2648
2649	if (test_range_bit(tree, start, end,
2650			   EXTENT_IOBITS, 0, NULL))
2651		ret = 0;
2652	else {
2653		if ((mask & GFP_NOFS) == GFP_NOFS)
2654			mask = GFP_NOFS;
2655		/*
2656		 * at this point we can safely clear everything except the
2657		 * locked bit and the nodatasum bit
2658		 */
2659		ret = clear_extent_bit(tree, start, end,
2660				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2661				 0, 0, NULL, mask);
2662
2663		/* if clear_extent_bit failed for enomem reasons,
2664		 * we can't allow the release to continue.
2665		 */
2666		if (ret < 0)
2667			ret = 0;
2668		else
2669			ret = 1;
2670	}
2671	return ret;
2672}
2673
2674/*
2675 * a helper for releasepage.  As long as there are no locked extents
2676 * in the range corresponding to the page, both state records and extent
2677 * map records are removed
2678 */
2679int try_release_extent_mapping(struct extent_map_tree *map,
2680			       struct extent_io_tree *tree, struct page *page,
2681			       gfp_t mask)
2682{
2683	struct extent_map *em;
2684	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2685	u64 end = start + PAGE_CACHE_SIZE - 1;
2686
2687	if ((mask & __GFP_WAIT) &&
2688	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2689		u64 len;
2690		while (start <= end) {
2691			len = end - start + 1;
2692			write_lock(&map->lock);
2693			em = lookup_extent_mapping(map, start, len);
2694			if (IS_ERR_OR_NULL(em)) {
2695				write_unlock(&map->lock);
2696				break;
2697			}
2698			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2699			    em->start != start) {
2700				write_unlock(&map->lock);
2701				free_extent_map(em);
2702				break;
2703			}
2704			if (!test_range_bit(tree, em->start,
2705					    extent_map_end(em) - 1,
2706					    EXTENT_LOCKED | EXTENT_WRITEBACK,
2707					    0, NULL)) {
2708				remove_extent_mapping(map, em);
2709				/* once for the rb tree */
2710				free_extent_map(em);
2711			}
2712			start = extent_map_end(em);
2713			write_unlock(&map->lock);
2714
2715			/* once for us */
2716			free_extent_map(em);
2717		}
2718	}
2719	return try_release_extent_state(map, tree, page, mask);
2720}
2721
2722/*
2723 * helper function for fiemap, which doesn't want to see any holes.
2724 * This maps until we find something past 'last'
2725 */
2726static struct extent_map *get_extent_skip_holes(struct inode *inode,
2727						u64 offset,
2728						u64 last,
2729						get_extent_t *get_extent)
2730{
2731	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2732	struct extent_map *em;
2733	u64 len;
2734
2735	if (offset >= last)
2736		return NULL;
2737
2738	while(1) {
2739		len = last - offset;
2740		if (len == 0)
2741			break;
2742		len = (len + sectorsize - 1) & ~(sectorsize - 1);
2743		em = get_extent(inode, NULL, 0, offset, len, 0);
2744		if (IS_ERR_OR_NULL(em))
2745			return em;
2746
2747		/* if this isn't a hole return it */
2748		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2749		    em->block_start != EXTENT_MAP_HOLE) {
2750			return em;
2751		}
2752
2753		/* this is a hole, advance to the next extent */
2754		offset = extent_map_end(em);
2755		free_extent_map(em);
2756		if (offset >= last)
2757			break;
2758	}
2759	return NULL;
2760}
2761
2762int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2763		__u64 start, __u64 len, get_extent_t *get_extent)
2764{
2765	int ret = 0;
2766	u64 off = start;
2767	u64 max = start + len;
2768	u32 flags = 0;
2769	u32 found_type;
2770	u64 last;
2771	u64 last_for_get_extent = 0;
2772	u64 disko = 0;
2773	u64 isize = i_size_read(inode);
2774	struct btrfs_key found_key;
2775	struct extent_map *em = NULL;
2776	struct extent_state *cached_state = NULL;
2777	struct btrfs_path *path;
2778	struct btrfs_file_extent_item *item;
2779	int end = 0;
2780	u64 em_start = 0;
2781	u64 em_len = 0;
2782	u64 em_end = 0;
2783	unsigned long emflags;
2784
2785	if (len == 0)
2786		return -EINVAL;
2787
2788	path = btrfs_alloc_path();
2789	if (!path)
2790		return -ENOMEM;
2791	path->leave_spinning = 1;
2792
2793	/*
2794	 * lookup the last file extent.  We're not using i_size here
2795	 * because there might be preallocation past i_size
2796	 */
2797	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2798				       path, btrfs_ino(inode), -1, 0);
2799	if (ret < 0) {
2800		btrfs_free_path(path);
2801		return ret;
2802	}
2803	WARN_ON(!ret);
2804	path->slots[0]--;
2805	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2806			      struct btrfs_file_extent_item);
2807	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2808	found_type = btrfs_key_type(&found_key);
2809
2810	/* No extents, but there might be delalloc bits */
2811	if (found_key.objectid != btrfs_ino(inode) ||
2812	    found_type != BTRFS_EXTENT_DATA_KEY) {
2813		/* have to trust i_size as the end */
2814		last = (u64)-1;
2815		last_for_get_extent = isize;
2816	} else {
2817		/*
2818		 * remember the start of the last extent.  There are a
2819		 * bunch of different factors that go into the length of the
2820		 * extent, so it's much less complex to remember where it started
2821		 */
2822		last = found_key.offset;
2823		last_for_get_extent = last + 1;
2824	}
2825	btrfs_free_path(path);
2826
2827	/*
2828	 * we might have some extents allocated but more delalloc past those
2829	 * extents.  so, we trust isize unless the start of the last extent is
2830	 * beyond isize
2831	 */
2832	if (last < isize) {
2833		last = (u64)-1;
2834		last_for_get_extent = isize;
2835	}
2836
2837	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838			 &cached_state, GFP_NOFS);
2839
2840	em = get_extent_skip_holes(inode, off, last_for_get_extent,
2841				   get_extent);
2842	if (!em)
2843		goto out;
2844	if (IS_ERR(em)) {
2845		ret = PTR_ERR(em);
2846		goto out;
2847	}
2848
2849	while (!end) {
2850		u64 offset_in_extent;
2851
2852		/* break if the extent we found is outside the range */
2853		if (em->start >= max || extent_map_end(em) < off)
2854			break;
2855
2856		/*
2857		 * get_extent may return an extent that starts before our
2858		 * requested range.  We have to make sure the ranges
2859		 * we return to fiemap always move forward and don't
2860		 * overlap, so adjust the offsets here
2861		 */
2862		em_start = max(em->start, off);
2863
2864		/*
2865		 * record the offset from the start of the extent
2866		 * for adjusting the disk offset below
2867		 */
2868		offset_in_extent = em_start - em->start;
2869		em_end = extent_map_end(em);
2870		em_len = em_end - em_start;
2871		emflags = em->flags;
2872		disko = 0;
2873		flags = 0;
2874
2875		/*
2876		 * bump off for our next call to get_extent
2877		 */
2878		off = extent_map_end(em);
2879		if (off >= max)
2880			end = 1;
2881
2882		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2883			end = 1;
2884			flags |= FIEMAP_EXTENT_LAST;
2885		} else if (em->block_start == EXTENT_MAP_INLINE) {
2886			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2887				  FIEMAP_EXTENT_NOT_ALIGNED);
2888		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2889			flags |= (FIEMAP_EXTENT_DELALLOC |
2890				  FIEMAP_EXTENT_UNKNOWN);
2891		} else {
2892			disko = em->block_start + offset_in_extent;
2893		}
2894		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2895			flags |= FIEMAP_EXTENT_ENCODED;
2896
2897		free_extent_map(em);
2898		em = NULL;
2899		if ((em_start >= last) || em_len == (u64)-1 ||
2900		   (last == (u64)-1 && isize <= em_end)) {
2901			flags |= FIEMAP_EXTENT_LAST;
2902			end = 1;
2903		}
2904
2905		/* now scan forward to see if this is really the last extent. */
2906		em = get_extent_skip_holes(inode, off, last_for_get_extent,
2907					   get_extent);
2908		if (IS_ERR(em)) {
2909			ret = PTR_ERR(em);
2910			goto out;
2911		}
2912		if (!em) {
2913			flags |= FIEMAP_EXTENT_LAST;
2914			end = 1;
2915		}
2916		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2917					      em_len, flags);
2918		if (ret)
2919			goto out_free;
2920	}
2921out_free:
2922	free_extent_map(em);
2923out:
2924	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
2925			     &cached_state, GFP_NOFS);
2926	return ret;
2927}
2928
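/*
 * Worked example for the offset bookkeeping above (illustrative, not
 * part of the original file).  Suppose userspace asks about off = 8192
 * and get_extent_skip_holes() returns an extent with em->start = 4096,
 * extent_map_end(em) = 16384 and em->block_start = 1048576:
 *
 *	em_start         = max(4096, 8192) = 8192
 *	offset_in_extent = 8192 - 4096     = 4096
 *	em_len           = 16384 - 8192    = 8192
 *	disko            = 1048576 + 4096  = 1052672
 *
 * so the reported extent starts at the requested offset, its disk
 * offset is shifted by the same amount, and successive entries handed
 * to fiemap_fill_next_extent() never overlap and always move forward.
 */
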
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930					      unsigned long i)
2931{
2932	struct page *p;
2933	struct address_space *mapping;
2934
2935	if (i == 0)
2936		return eb->first_page;
2937	i += eb->start >> PAGE_CACHE_SHIFT;
2938	mapping = eb->first_page->mapping;
2939	if (!mapping)
2940		return NULL;
2941
2942	/*
2943	 * extent_buffer_page is only called after pinning the page
2944	 * by increasing the reference count.  So we know the page must
2945	 * be in the radix tree.
2946	 */
2947	rcu_read_lock();
2948	p = radix_tree_lookup(&mapping->page_tree, i);
2949	rcu_read_unlock();
2950
2951	return p;
2952}
2953
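/*
 * Worked example for extent_buffer_page() above (illustrative, not part
 * of the original file), with 4K pages: for a buffer at eb->start =
 * 0x3000 the first page lives at page index 3, so extent_buffer_page(eb, 2)
 * looks up radix index 2 + (0x3000 >> PAGE_CACHE_SHIFT) = 5, i.e. the
 * buffer's third page in the owning mapping.  Only i == 0 short-circuits
 * to eb->first_page.
 */
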
2954static inline unsigned long num_extent_pages(u64 start, u64 len)
2955{
2956	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957		(start >> PAGE_CACHE_SHIFT);
2958}
2959
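/*
 * Worked example for num_extent_pages() above (illustrative, not part
 * of the original file), with 4K pages: a 16K buffer at start = 0x1000
 * spans ((0x1000 + 0x4000 + 0xfff) >> 12) - (0x1000 >> 12) = 5 - 1 = 4
 * pages, while the same 16K buffer at an unaligned start = 0x1800 spans
 * ((0x1800 + 0x4000 + 0xfff) >> 12) - (0x1800 >> 12) = 6 - 1 = 5 pages.
 */
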
2960static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2961						   u64 start,
2962						   unsigned long len,
2963						   gfp_t mask)
2964{
2965	struct extent_buffer *eb = NULL;
2966#if LEAK_DEBUG
2967	unsigned long flags;
2968#endif
2969
2970	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2971	if (eb == NULL)
2972		return NULL;
2973	eb->start = start;
2974	eb->len = len;
2975	rwlock_init(&eb->lock);
2976	atomic_set(&eb->write_locks, 0);
2977	atomic_set(&eb->read_locks, 0);
2978	atomic_set(&eb->blocking_readers, 0);
2979	atomic_set(&eb->blocking_writers, 0);
2980	atomic_set(&eb->spinning_readers, 0);
2981	atomic_set(&eb->spinning_writers, 0);
2982	init_waitqueue_head(&eb->write_lock_wq);
2983	init_waitqueue_head(&eb->read_lock_wq);
2984
2985#if LEAK_DEBUG
2986	spin_lock_irqsave(&leak_lock, flags);
2987	list_add(&eb->leak_list, &buffers);
2988	spin_unlock_irqrestore(&leak_lock, flags);
2989#endif
2990	atomic_set(&eb->refs, 1);
2991
2992	return eb;
2993}
2994
2995static void __free_extent_buffer(struct extent_buffer *eb)
2996{
2997#if LEAK_DEBUG
2998	unsigned long flags;
2999	spin_lock_irqsave(&leak_lock, flags);
3000	list_del(&eb->leak_list);
3001	spin_unlock_irqrestore(&leak_lock, flags);
3002#endif
3003	kmem_cache_free(extent_buffer_cache, eb);
3004}
3005
3006/*
3007 * Helper for releasing the pages that back an extent buffer.
3008 */
3009static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3010						unsigned long start_idx)
3011{
3012	unsigned long index;
3013	struct page *page;
3014
3015	if (!eb->first_page)
3016		return;
3017
3018	index = num_extent_pages(eb->start, eb->len);
3019	if (start_idx >= index)
3020		return;
3021
3022	do {
3023		index--;
3024		page = extent_buffer_page(eb, index);
3025		if (page)
3026			page_cache_release(page);
3027	} while (index != start_idx);
3028}
3029
3030/*
3031 * Helper for releasing the extent buffer.
3032 */
3033static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3034{
3035	btrfs_release_extent_buffer_page(eb, 0);
3036	__free_extent_buffer(eb);
3037}
3038
3039struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3040					  u64 start, unsigned long len,
3041					  struct page *page0)
3042{
3043	unsigned long num_pages = num_extent_pages(start, len);
3044	unsigned long i;
3045	unsigned long index = start >> PAGE_CACHE_SHIFT;
3046	struct extent_buffer *eb;
3047	struct extent_buffer *exists = NULL;
3048	struct page *p;
3049	struct address_space *mapping = tree->mapping;
3050	int uptodate = 1;
3051	int ret;
3052
3053	rcu_read_lock();
3054	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3055	if (eb && atomic_inc_not_zero(&eb->refs)) {
3056		rcu_read_unlock();
3057		mark_page_accessed(eb->first_page);
3058		return eb;
3059	}
3060	rcu_read_unlock();
3061
3062	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3063	if (!eb)
3064		return NULL;
3065
3066	if (page0) {
3067		eb->first_page = page0;
3068		i = 1;
3069		index++;
3070		page_cache_get(page0);
3071		mark_page_accessed(page0);
3072		set_page_extent_mapped(page0);
3073		set_page_extent_head(page0, len);
3074		uptodate = PageUptodate(page0);
3075	} else {
3076		i = 0;
3077	}
3078	for (; i < num_pages; i++, index++) {
3079		p = find_or_create_page(mapping, index, GFP_NOFS);
3080		if (!p) {
3081			WARN_ON(1);
3082			goto free_eb;
3083		}
3084		set_page_extent_mapped(p);
3085		mark_page_accessed(p);
3086		if (i == 0) {
3087			eb->first_page = p;
3088			set_page_extent_head(p, len);
3089		} else {
3090			set_page_private(p, EXTENT_PAGE_PRIVATE);
3091		}
3092		if (!PageUptodate(p))
3093			uptodate = 0;
3094
3095		/*
3096		 * see below about how we avoid a nasty race with release page
3097		 * and why we unlock later
3098		 */
3099		if (i != 0)
3100			unlock_page(p);
3101	}
3102	if (uptodate)
3103		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3104
3105	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3106	if (ret)
3107		goto free_eb;
3108
3109	spin_lock(&tree->buffer_lock);
3110	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3111	if (ret == -EEXIST) {
3112		exists = radix_tree_lookup(&tree->buffer,
3113						start >> PAGE_CACHE_SHIFT);
3114		/* add one reference for the caller */
3115		atomic_inc(&exists->refs);
3116		spin_unlock(&tree->buffer_lock);
3117		radix_tree_preload_end();
3118		goto free_eb;
3119	}
3120	/* add one reference for the tree */
3121	atomic_inc(&eb->refs);
3122	spin_unlock(&tree->buffer_lock);
3123	radix_tree_preload_end();
3124
3125	/*
3126	 * there is a race where releasepage may have
3127	 * tried to find this extent buffer in the radix tree
3128	 * but failed.  It will tell the VM it is safe to
3129	 * reclaim the page, and it will clear the page private bit.
3130	 * We must make sure to set the page private bit properly
3131	 * after the extent buffer is in the radix tree so
3132	 * it doesn't get lost
3133	 */
3134	set_page_extent_mapped(eb->first_page);
3135	set_page_extent_head(eb->first_page, eb->len);
3136	if (!page0)
3137		unlock_page(eb->first_page);
3138	return eb;
3139
3140free_eb:
3141	if (eb->first_page && !page0)
3142		unlock_page(eb->first_page);
3143
3144	if (!atomic_dec_and_test(&eb->refs))
3145		return exists;
3146	btrfs_release_extent_buffer(eb);
3147	return exists;
3148}
3149
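/*
 * Illustrative sketch, not part of the original file: the usual life
 * cycle of a buffer obtained from the allocator above.  Both
 * alloc_extent_buffer() and find_extent_buffer() return with one
 * reference held for the caller, which must be dropped with
 * free_extent_buffer() once the buffer is no longer needed.
 */
static void __maybe_unused example_eb_lifecycle(struct extent_io_tree *tree,
						u64 start, unsigned long len)
{
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(tree, start, len, NULL);
	if (!eb)
		return;
	/* ... read or write the buffer via the helpers further below ... */
	free_extent_buffer(eb);
}
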
3150struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3151					 u64 start, unsigned long len)
3152{
3153	struct extent_buffer *eb;
3154
3155	rcu_read_lock();
3156	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3157	if (eb && atomic_inc_not_zero(&eb->refs)) {
3158		rcu_read_unlock();
3159		mark_page_accessed(eb->first_page);
3160		return eb;
3161	}
3162	rcu_read_unlock();
3163
3164	return NULL;
3165}
3166
3167void free_extent_buffer(struct extent_buffer *eb)
3168{
3169	if (!eb)
3170		return;
3171
3172	if (!atomic_dec_and_test(&eb->refs))
3173		return;
3174
3175	WARN_ON(1);
3176}
3177
3178int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3179			      struct extent_buffer *eb)
3180{
3181	unsigned long i;
3182	unsigned long num_pages;
3183	struct page *page;
3184
3185	num_pages = num_extent_pages(eb->start, eb->len);
3186
3187	for (i = 0; i < num_pages; i++) {
3188		page = extent_buffer_page(eb, i);
3189		if (!PageDirty(page))
3190			continue;
3191
3192		lock_page(page);
3193		WARN_ON(!PagePrivate(page));
3194
3195		set_page_extent_mapped(page);
3196		if (i == 0)
3197			set_page_extent_head(page, eb->len);
3198
3199		clear_page_dirty_for_io(page);
3200		spin_lock_irq(&page->mapping->tree_lock);
3201		if (!PageDirty(page)) {
3202			radix_tree_tag_clear(&page->mapping->page_tree,
3203						page_index(page),
3204						PAGECACHE_TAG_DIRTY);
3205		}
3206		spin_unlock_irq(&page->mapping->tree_lock);
3207		unlock_page(page);
3208	}
3209	return 0;
3210}
3211
3212int set_extent_buffer_dirty(struct extent_io_tree *tree,
3213			     struct extent_buffer *eb)
3214{
3215	unsigned long i;
3216	unsigned long num_pages;
3217	int was_dirty = 0;
3218
3219	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3220	num_pages = num_extent_pages(eb->start, eb->len);
3221	for (i = 0; i < num_pages; i++)
3222		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3223	return was_dirty;
3224}
3225
3226static int __eb_straddles_pages(u64 start, u64 len)
3227{
3228	if (len < PAGE_CACHE_SIZE)
3229		return 1;
3230	if (start & (PAGE_CACHE_SIZE - 1))
3231		return 1;
3232	if ((start + len) & (PAGE_CACHE_SIZE - 1))
3233		return 1;
3234	return 0;
3235}
3236
3237static int eb_straddles_pages(struct extent_buffer *eb)
3238{
3239	return __eb_straddles_pages(eb->start, eb->len);
3240}
3241
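/*
 * Worked example for __eb_straddles_pages() above (illustrative, not
 * part of the original file), with 4K pages: a 4K buffer at start =
 * 0x2000 fills exactly one page, so __eb_straddles_pages(0x2000, 0x1000)
 * returns 0 and the per-extent EXTENT_UPTODATE tracking can be skipped
 * in favour of the page bit.  A 2K buffer, or a 4K buffer at the
 * unaligned start 0x2800, shares pages with its neighbours and returns 1.
 */
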
3242int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3243				struct extent_buffer *eb,
3244				struct extent_state **cached_state)
3245{
3246	unsigned long i;
3247	struct page *page;
3248	unsigned long num_pages;
3249
3250	num_pages = num_extent_pages(eb->start, eb->len);
3251	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3252
3253	if (eb_straddles_pages(eb)) {
3254		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3255				      cached_state, GFP_NOFS);
3256	}
3257	for (i = 0; i < num_pages; i++) {
3258		page = extent_buffer_page(eb, i);
3259		if (page)
3260			ClearPageUptodate(page);
3261	}
3262	return 0;
3263}
3264
3265int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3266				struct extent_buffer *eb)
3267{
3268	unsigned long i;
3269	struct page *page;
3270	unsigned long num_pages;
3271
3272	num_pages = num_extent_pages(eb->start, eb->len);
3273
3274	if (eb_straddles_pages(eb)) {
3275		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3276				    NULL, GFP_NOFS);
3277	}
3278	for (i = 0; i < num_pages; i++) {
3279		page = extent_buffer_page(eb, i);
3280		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3281		    ((i == num_pages - 1) &&
3282		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3283			check_page_uptodate(tree, page);
3284			continue;
3285		}
3286		SetPageUptodate(page);
3287	}
3288	return 0;
3289}
3290
3291int extent_range_uptodate(struct extent_io_tree *tree,
3292			  u64 start, u64 end)
3293{
3294	struct page *page;
3295	int ret;
3296	int pg_uptodate = 1;
3297	int uptodate;
3298	unsigned long index;
3299
3300	if (__eb_straddles_pages(start, end - start + 1)) {
3301		ret = test_range_bit(tree, start, end,
3302				     EXTENT_UPTODATE, 1, NULL);
3303		if (ret)
3304			return 1;
3305	}
3306	while (start <= end) {
3307		index = start >> PAGE_CACHE_SHIFT;
3308		page = find_get_page(tree->mapping, index);
3309		uptodate = PageUptodate(page);
3310		page_cache_release(page);
3311		if (!uptodate) {
3312			pg_uptodate = 0;
3313			break;
3314		}
3315		start += PAGE_CACHE_SIZE;
3316	}
3317	return pg_uptodate;
3318}
3319
3320int extent_buffer_uptodate(struct extent_io_tree *tree,
3321			   struct extent_buffer *eb,
3322			   struct extent_state *cached_state)
3323{
3324	int ret = 0;
3325	unsigned long num_pages;
3326	unsigned long i;
3327	struct page *page;
3328	int pg_uptodate = 1;
3329
3330	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3331		return 1;
3332
3333	if (eb_straddles_pages(eb)) {
3334		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3335				   EXTENT_UPTODATE, 1, cached_state);
3336		if (ret)
3337			return ret;
3338	}
3339
3340	num_pages = num_extent_pages(eb->start, eb->len);
3341	for (i = 0; i < num_pages; i++) {
3342		page = extent_buffer_page(eb, i);
3343		if (!PageUptodate(page)) {
3344			pg_uptodate = 0;
3345			break;
3346		}
3347	}
3348	return pg_uptodate;
3349}
3350
3351int read_extent_buffer_pages(struct extent_io_tree *tree,
3352			     struct extent_buffer *eb,
3353			     u64 start, int wait,
3354			     get_extent_t *get_extent, int mirror_num)
3355{
3356	unsigned long i;
3357	unsigned long start_i;
3358	struct page *page;
3359	int err;
3360	int ret = 0;
3361	int locked_pages = 0;
3362	int all_uptodate = 1;
3363	int inc_all_pages = 0;
3364	unsigned long num_pages;
3365	struct bio *bio = NULL;
3366	unsigned long bio_flags = 0;
3367
3368	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3369		return 0;
3370
3371	if (eb_straddles_pages(eb)) {
3372		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3373				   EXTENT_UPTODATE, 1, NULL)) {
3374			return 0;
3375		}
3376	}
3377
3378	if (start) {
3379		WARN_ON(start < eb->start);
3380		start_i = (start >> PAGE_CACHE_SHIFT) -
3381			(eb->start >> PAGE_CACHE_SHIFT);
3382	} else {
3383		start_i = 0;
3384	}
3385
3386	num_pages = num_extent_pages(eb->start, eb->len);
3387	for (i = start_i; i < num_pages; i++) {
3388		page = extent_buffer_page(eb, i);
3389		if (!wait) {
3390			if (!trylock_page(page))
3391				goto unlock_exit;
3392		} else {
3393			lock_page(page);
3394		}
3395		locked_pages++;
3396		if (!PageUptodate(page))
3397			all_uptodate = 0;
3398	}
3399	if (all_uptodate) {
3400		if (start_i == 0)
3401			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3402		goto unlock_exit;
3403	}
3404
3405	for (i = start_i; i < num_pages; i++) {
3406		page = extent_buffer_page(eb, i);
3407
3408		WARN_ON(!PagePrivate(page));
3409
3410		set_page_extent_mapped(page);
3411		if (i == 0)
3412			set_page_extent_head(page, eb->len);
3413
3414		if (inc_all_pages)
3415			page_cache_get(page);
3416		if (!PageUptodate(page)) {
3417			if (start_i == 0)
3418				inc_all_pages = 1;
3419			ClearPageError(page);
3420			err = __extent_read_full_page(tree, page,
3421						      get_extent, &bio,
3422						      mirror_num, &bio_flags);
3423			if (err)
3424				ret = err;
3425		} else {
3426			unlock_page(page);
3427		}
3428	}
3429
3430	if (bio)
3431		submit_one_bio(READ, bio, mirror_num, bio_flags);
3432
3433	if (ret || !wait)
3434		return ret;
3435
3436	for (i = start_i; i < num_pages; i++) {
3437		page = extent_buffer_page(eb, i);
3438		wait_on_page_locked(page);
3439		if (!PageUptodate(page))
3440			ret = -EIO;
3441	}
3442
3443	if (!ret)
3444		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3445	return ret;
3446
3447unlock_exit:
3448	i = start_i;
3449	while (locked_pages > 0) {
3450		page = extent_buffer_page(eb, i);
3451		i++;
3452		unlock_page(page);
3453		locked_pages--;
3454	}
3455	return ret;
3456}
3457
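/*
 * Illustrative sketch, not part of the original file: a blocking read
 * of a whole buffer with the helper above.  Passing start = 0 reads
 * every page, wait = 1 makes the call wait for the I/O (returning -EIO
 * on failure), and mirror_num = 0 lets the lower layers pick a mirror.
 * 'get_extent' is whichever get_extent_t callback the caller uses.
 */
static int __maybe_unused example_read_eb(struct extent_io_tree *tree,
					  struct extent_buffer *eb,
					  get_extent_t *get_extent)
{
	return read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0);
}
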
3458void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3459			unsigned long start,
3460			unsigned long len)
3461{
3462	size_t cur;
3463	size_t offset;
3464	struct page *page;
3465	char *kaddr;
3466	char *dst = (char *)dstv;
3467	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3468	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3469
3470	WARN_ON(start > eb->len);
3471	WARN_ON(start + len > eb->start + eb->len);
3472
3473	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3474
3475	while (len > 0) {
3476		page = extent_buffer_page(eb, i);
3477
3478		cur = min(len, (PAGE_CACHE_SIZE - offset));
3479		kaddr = page_address(page);
3480		memcpy(dst, kaddr + offset, cur);
3481
3482		dst += cur;
3483		len -= cur;
3484		offset = 0;
3485		i++;
3486	}
3487}
3488
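/*
 * Illustrative sketch, not part of the original file: copying a small
 * on-disk structure out of a metadata buffer with read_extent_buffer()
 * above.  'struct btrfs_disk_key' comes from ctree.h; any fixed-size
 * region that fits inside eb->len works the same way.
 */
static void __maybe_unused example_read_key(struct extent_buffer *eb,
					    unsigned long offset)
{
	struct btrfs_disk_key key;

	read_extent_buffer(eb, &key, offset, sizeof(key));
	/* key now holds a byte-for-byte copy of the on-disk key */
}
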
3489int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3490			       unsigned long min_len, char **map,
3491			       unsigned long *map_start,
3492			       unsigned long *map_len)
3493{
3494	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3495	char *kaddr;
3496	struct page *p;
3497	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3498	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3499	unsigned long end_i = (start_offset + start + min_len - 1) >>
3500		PAGE_CACHE_SHIFT;
3501
3502	if (i != end_i)
3503		return -EINVAL;
3504
3505	if (i == 0) {
3506		offset = start_offset;
3507		*map_start = 0;
3508	} else {
3509		offset = 0;
3510		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3511	}
3512
3513	if (start + min_len > eb->len) {
3514		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3515		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3516		       eb->len, start, min_len);
3517		WARN_ON(1);
3518		return -EINVAL;
3519	}
3520
3521	p = extent_buffer_page(eb, i);
3522	kaddr = page_address(p);
3523	*map = kaddr + offset;
3524	*map_len = PAGE_CACHE_SIZE - offset;
3525	return 0;
3526}
3527
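/*
 * Illustrative sketch, not part of the original file: mapping a region
 * that is known not to cross a page boundary and accessing it in place
 * instead of copying.  The helper above fails with -EINVAL when the
 * requested range straddles two pages, so callers must be prepared to
 * fall back to read_extent_buffer().
 */
static int __maybe_unused example_map_region(struct extent_buffer *eb,
					     unsigned long start,
					     unsigned long len)
{
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;

	err = map_private_extent_buffer(eb, start, len, &kaddr,
					&map_start, &map_len);
	if (err)
		return err;	/* e.g. the range crosses a page boundary */
	/* bytes [start, start + len) live at kaddr + (start - map_start) */
	return 0;
}
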
3528int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3529			  unsigned long start,
3530			  unsigned long len)
3531{
3532	size_t cur;
3533	size_t offset;
3534	struct page *page;
3535	char *kaddr;
3536	char *ptr = (char *)ptrv;
3537	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3538	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3539	int ret = 0;
3540
3541	WARN_ON(start > eb->len);
3542	WARN_ON(start + len > eb->start + eb->len);
3543
3544	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3545
3546	while (len > 0) {
3547		page = extent_buffer_page(eb, i);
3548
3549		cur = min(len, (PAGE_CACHE_SIZE - offset));
3550
3551		kaddr = page_address(page);
3552		ret = memcmp(ptr, kaddr + offset, cur);
3553		if (ret)
3554			break;
3555
3556		ptr += cur;
3557		len -= cur;
3558		offset = 0;
3559		i++;
3560	}
3561	return ret;
3562}
3563
3564void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3565			 unsigned long start, unsigned long len)
3566{
3567	size_t cur;
3568	size_t offset;
3569	struct page *page;
3570	char *kaddr;
3571	char *src = (char *)srcv;
3572	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3573	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3574
3575	WARN_ON(start > eb->len);
3576	WARN_ON(start + len > eb->start + eb->len);
3577
3578	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3579
3580	while (len > 0) {
3581		page = extent_buffer_page(eb, i);
3582		WARN_ON(!PageUptodate(page));
3583
3584		cur = min(len, PAGE_CACHE_SIZE - offset);
3585		kaddr = page_address(page);
3586		memcpy(kaddr + offset, src, cur);
3587
3588		src += cur;
3589		len -= cur;
3590		offset = 0;
3591		i++;
3592	}
3593}
3594
3595void memset_extent_buffer(struct extent_buffer *eb, char c,
3596			  unsigned long start, unsigned long len)
3597{
3598	size_t cur;
3599	size_t offset;
3600	struct page *page;
3601	char *kaddr;
3602	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3603	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3604
3605	WARN_ON(start > eb->len);
3606	WARN_ON(start + len > eb->start + eb->len);
3607
3608	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3609
3610	while (len > 0) {
3611		page = extent_buffer_page(eb, i);
3612		WARN_ON(!PageUptodate(page));
3613
3614		cur = min(len, PAGE_CACHE_SIZE - offset);
3615		kaddr = page_address(page);
3616		memset(kaddr + offset, c, cur);
3617
3618		len -= cur;
3619		offset = 0;
3620		i++;
3621	}
3622}
3623
3624void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3625			unsigned long dst_offset, unsigned long src_offset,
3626			unsigned long len)
3627{
3628	u64 dst_len = dst->len;
3629	size_t cur;
3630	size_t offset;
3631	struct page *page;
3632	char *kaddr;
3633	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3634	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3635
3636	WARN_ON(src->len != dst_len);
3637
3638	offset = (start_offset + dst_offset) &
3639		((unsigned long)PAGE_CACHE_SIZE - 1);
3640
3641	while (len > 0) {
3642		page = extent_buffer_page(dst, i);
3643		WARN_ON(!PageUptodate(page));
3644
3645		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3646
3647		kaddr = page_address(page);
3648		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3649
3650		src_offset += cur;
3651		len -= cur;
3652		offset = 0;
3653		i++;
3654	}
3655}
3656
3657static void move_pages(struct page *dst_page, struct page *src_page,
3658		       unsigned long dst_off, unsigned long src_off,
3659		       unsigned long len)
3660{
3661	char *dst_kaddr = page_address(dst_page);
3662	if (dst_page == src_page) {
3663		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3664	} else {
3665		char *src_kaddr = page_address(src_page);
3666		char *p = dst_kaddr + dst_off + len;
3667		char *s = src_kaddr + src_off + len;
3668
3669		while (len--)
3670			*--p = *--s;
3671	}
3672}
3673
3674static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3675{
3676	unsigned long distance = (src > dst) ? src - dst : dst - src;
3677	return distance < len;
3678}
3679
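/*
 * Worked example for areas_overlap() above (illustrative, not part of
 * the original file): copying 100 bytes from offset 40 to offset 100
 * gives a distance of 60 < 100, so the ranges [40,139] and [100,199]
 * overlap and a plain memcpy() over the shared bytes would corrupt
 * data.  memmove_extent_buffer() below detects this and falls back to
 * the backwards copy done by move_pages().
 */
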
3680static void copy_pages(struct page *dst_page, struct page *src_page,
3681		       unsigned long dst_off, unsigned long src_off,
3682		       unsigned long len)
3683{
3684	char *dst_kaddr = page_address(dst_page);
3685	char *src_kaddr;
3686
3687	if (dst_page != src_page) {
3688		src_kaddr = page_address(src_page);
3689	} else {
3690		src_kaddr = dst_kaddr;
3691		BUG_ON(areas_overlap(src_off, dst_off, len));
3692	}
3693
3694	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3695}
3696
3697void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3698			   unsigned long src_offset, unsigned long len)
3699{
3700	size_t cur;
3701	size_t dst_off_in_page;
3702	size_t src_off_in_page;
3703	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3704	unsigned long dst_i;
3705	unsigned long src_i;
3706
3707	if (src_offset + len > dst->len) {
3708		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3709		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3710		BUG_ON(1);
3711	}
3712	if (dst_offset + len > dst->len) {
3713		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3714		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3715		BUG_ON(1);
3716	}
3717
3718	while (len > 0) {
3719		dst_off_in_page = (start_offset + dst_offset) &
3720			((unsigned long)PAGE_CACHE_SIZE - 1);
3721		src_off_in_page = (start_offset + src_offset) &
3722			((unsigned long)PAGE_CACHE_SIZE - 1);
3723
3724		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3725		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3726
3727		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3728					       src_off_in_page));
3729		cur = min_t(unsigned long, cur,
3730			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3731
3732		copy_pages(extent_buffer_page(dst, dst_i),
3733			   extent_buffer_page(dst, src_i),
3734			   dst_off_in_page, src_off_in_page, cur);
3735
3736		src_offset += cur;
3737		dst_offset += cur;
3738		len -= cur;
3739	}
3740}
3741
3742void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3743			   unsigned long src_offset, unsigned long len)
3744{
3745	size_t cur;
3746	size_t dst_off_in_page;
3747	size_t src_off_in_page;
3748	unsigned long dst_end = dst_offset + len - 1;
3749	unsigned long src_end = src_offset + len - 1;
3750	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3751	unsigned long dst_i;
3752	unsigned long src_i;
3753
3754	if (src_offset + len > dst->len) {
3755		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3756		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3757		BUG_ON(1);
3758	}
3759	if (dst_offset + len > dst->len) {
3760		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3761		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3762		BUG_ON(1);
3763	}
3764	if (!areas_overlap(src_offset, dst_offset, len)) {
3765		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3766		return;
3767	}
3768	while (len > 0) {
3769		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3770		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3771
3772		dst_off_in_page = (start_offset + dst_end) &
3773			((unsigned long)PAGE_CACHE_SIZE - 1);
3774		src_off_in_page = (start_offset + src_end) &
3775			((unsigned long)PAGE_CACHE_SIZE - 1);
3776
3777		cur = min_t(unsigned long, len, src_off_in_page + 1);
3778		cur = min(cur, dst_off_in_page + 1);
3779		move_pages(extent_buffer_page(dst, dst_i),
3780			   extent_buffer_page(dst, src_i),
3781			   dst_off_in_page - cur + 1,
3782			   src_off_in_page - cur + 1, cur);
3783
3784		dst_end -= cur;
3785		src_end -= cur;
3786		len -= cur;
3787	}
3788}
3789
3790static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3791{
3792	struct extent_buffer *eb =
3793			container_of(head, struct extent_buffer, rcu_head);
3794
3795	btrfs_release_extent_buffer(eb);
3796}
3797
3798int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3799{
3800	u64 start = page_offset(page);
3801	struct extent_buffer *eb;
3802	int ret = 1;
3803
3804	spin_lock(&tree->buffer_lock);
3805	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3806	if (!eb) {
3807		spin_unlock(&tree->buffer_lock);
3808		return ret;
3809	}
3810
3811	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3812		ret = 0;
3813		goto out;
3814	}
3815
3816	/*
3817	 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3818	 * Otherwise someone else still holds a reference, so bail out.
3819	 */
3820	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3821		ret = 0;
3822		goto out;
3823	}
3824
3825	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3826out:
3827	spin_unlock(&tree->buffer_lock);
3828
3829	/* at this point we can safely release the extent buffer */
3830	if (atomic_read(&eb->refs) == 0)
3831		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3832	return ret;
3833}