/*
 * fs/btrfs/extent_io.c (Linux v3.5.6) — extent I/O state tree.
 * (Web-viewer boilerplate removed; converted to a comment so the
 * translation unit remains valid C.)
 */
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/module.h>
   8#include <linux/spinlock.h>
   9#include <linux/blkdev.h>
  10#include <linux/swap.h>
  11#include <linux/writeback.h>
  12#include <linux/pagevec.h>
  13#include <linux/prefetch.h>
  14#include <linux/cleancache.h>
  15#include "extent_io.h"
  16#include "extent_map.h"
  17#include "compat.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
 
  24
/* Slab caches for the two objects this file allocates constantly. */
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

/*
 * Global leak-tracking lists.  They are only ever populated from the
 * LEAK_DEBUG sections below (alloc_extent_state/free_extent_state);
 * extent_io_exit() drains and reports anything left on them.
 */
static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
/* Protects the two leak lists above. */
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

/* Node layout shared by the rbtree helpers: an inclusive [start, end]
 * byte range keyed by its end offset. */
struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

/* Per-call context threaded through the writepage/writepages paths,
 * carrying the bio being built across page boundaries. */
struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};
  57
static noinline void flush_write_bio(void *data);

/* Map an io tree back to its fs_info via the inode the tree belongs to. */
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	return btrfs_sb(tree->mapping->host->i_sb);
}
  64
  65int __init extent_io_init(void)
  66{
  67	extent_state_cache = kmem_cache_create("extent_state",
  68			sizeof(struct extent_state), 0,
  69			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  70	if (!extent_state_cache)
  71		return -ENOMEM;
  72
  73	extent_buffer_cache = kmem_cache_create("extent_buffers",
  74			sizeof(struct extent_buffer), 0,
  75			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  76	if (!extent_buffer_cache)
  77		goto free_state_cache;
 
 
 
 
 
 
 
 
 
  78	return 0;
  79
 
 
 
 
 
 
 
 
  80free_state_cache:
  81	kmem_cache_destroy(extent_state_cache);
 
  82	return -ENOMEM;
  83}
  84
/*
 * Tear down the slab caches.  Before destroying them, drain the global
 * leak lists (populated only when LEAK_DEBUG tracking is compiled in)
 * and report every extent_state / extent_buffer that was never freed.
 */
void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}
 115
 116void extent_io_tree_init(struct extent_io_tree *tree,
 117			 struct address_space *mapping)
 118{
 119	tree->state = RB_ROOT;
 120	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 121	tree->ops = NULL;
 122	tree->dirty_bytes = 0;
 123	spin_lock_init(&tree->lock);
 124	spin_lock_init(&tree->buffer_lock);
 125	tree->mapping = mapping;
 126}
 127
/*
 * Allocate and initialize an extent_state from the slab cache.
 * Returns NULL if the allocation fails under @mask.  The new state
 * starts with one reference and no tree membership.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	/* Track the allocation on the global leak list. */
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}
 151
/*
 * Drop one reference on @state (NULL is a no-op) and free it back to
 * the slab cache once the last reference is gone.  A state being freed
 * must already have been removed from its tree (WARN otherwise).
 */
void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		/* Remove from the global leak-tracking list. */
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}
 170
 171static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 172				   struct rb_node *node)
 
 
 
 
 173{
 174	struct rb_node **p = &root->rb_node;
 175	struct rb_node *parent = NULL;
 176	struct tree_entry *entry;
 177
 
 
 
 
 
 
 
 178	while (*p) {
 179		parent = *p;
 180		entry = rb_entry(parent, struct tree_entry, rb_node);
 181
 182		if (offset < entry->start)
 183			p = &(*p)->rb_left;
 184		else if (offset > entry->end)
 185			p = &(*p)->rb_right;
 186		else
 187			return parent;
 188	}
 189
 
 190	rb_link_node(node, parent, p);
 191	rb_insert_color(node, root);
 192	return NULL;
 193}
 194
/*
 * Search the state tree for the entry covering @offset.  On an exact
 * range hit the matching rb_node is returned.  On a miss NULL is
 * returned and, when requested, *prev_ret is set to the first in-order
 * entry whose range ends at or after @offset (walking forward from the
 * last node visited), and *next_ret is set by walking backward to the
 * nearest entry whose range starts at or before @offset.  Either may
 * end up NULL when no such neighbor exists.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				     struct rb_node **prev_ret,
				     struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		/* Walk forward until we reach a range that ends at or
		 * after @offset; prev_entry may be a bogus pointer when
		 * prev is NULL, but the loop checks prev first. */
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		/* Walk backward until the range starts at or before @offset. */
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}
 239
 240static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 241					  u64 offset)
 
 
 
 242{
 243	struct rb_node *prev = NULL;
 244	struct rb_node *ret;
 245
 246	ret = __etree_search(tree, offset, &prev, NULL);
 247	if (!ret)
 248		return prev;
 249	return ret;
 250}
 251
 
 
 
 
 
 
 252static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 253		     struct extent_state *other)
 254{
 255	if (tree->ops && tree->ops->merge_extent_hook)
 256		tree->ops->merge_extent_hook(tree->mapping->host, new,
 257					     other);
 258}
 259
/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	/* Locked / boundary states are never merge candidates. */
	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	/* Try to absorb the in-order predecessor if it is byte-adjacent
	 * and carries identical state bits. */
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	/* Same for the in-order successor. */
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}
 303
 304static void set_state_cb(struct extent_io_tree *tree,
 305			 struct extent_state *state, int *bits)
 306{
 307	if (tree->ops && tree->ops->set_bit_hook)
 308		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 309}
 310
 311static void clear_state_cb(struct extent_io_tree *tree,
 312			   struct extent_state *state, int *bits)
 313{
 314	if (tree->ops && tree->ops->clear_bit_hook)
 315		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 316}
 317
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;

	/* An inverted range is a caller bug; warn loudly but continue. */
	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	/* Bits (and dirty accounting) are applied before linking so the
	 * state is fully formed once visible in the tree. */
	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}
 362
 363static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 364		     u64 split)
 365{
 366	if (tree->ops && tree->ops->split_extent_hook)
 367		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 368}
 369
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	/* prealloc becomes the front half; orig is shrunk in place to
	 * the back half, keeping its tree linkage valid. */
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		/* Collision: a range covering prealloc->end already
		 * exists; drop prealloc and report the conflict. */
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}
 404
 405static struct extent_state *next_state(struct extent_state *state)
 406{
 407	struct rb_node *next = rb_next(&state->rb_node);
 408	if (next)
 409		return rb_entry(next, struct extent_state, rb_node);
 410	else
 411		return NULL;
 412}
 413
/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 *
 * Returns the in-order successor of @state (computed before any free),
 * or NULL when there is none.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    int *bits, int wake)
{
	struct extent_state *next;
	/* Control bits are never stored on states, so never cleared. */
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		/* Grab the successor before unlinking/freeing state. */
		next = next_state(state);
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}
 452
 453static struct extent_state *
 454alloc_extent_state_atomic(struct extent_state *prealloc)
 455{
 456	if (!prealloc)
 457		prealloc = alloc_extent_state(GFP_ATOMIC);
 458
 459	return prealloc;
 460}
 461
 462void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 463{
 464	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
 465		    "Extent tree was modified by another "
 466		    "thread while locked.");
 467}
 468
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	/* 'delete' means clear everything (except the control bits). */
	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	/* Clearing lock/boundary bits invalidates any cached state. */
	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	/* Preallocate while sleeping is still allowed; splits inside the
	 * lock must fall back to GFP_ATOMIC otherwise. */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		/* Fast path: the cached state still covers 'start'. */
		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	/* Continue inline with the successor while we hold the lock. */
	if (start <= end && state && !need_resched())
		goto hit_next;
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

search_again:
	/* Drop the lock (and maybe reschedule) before rescanning. */
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
 624
/*
 * Sleep until @state->wq is woken.  The tree lock is dropped for the
 * duration of the sleep and retaken before returning, as the sparse
 * annotations document; the caller must hold a reference on @state.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
 637
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			/* Pin the state with a reference: wait_on_state
			 * drops the tree lock, so the state could be
			 * freed underneath us otherwise.  Restart the
			 * search from its start afterwards. */
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		cond_resched_lock(&tree->lock);
	}
out:
	spin_unlock(&tree->lock);
}
 681
 682static void set_state_bits(struct extent_io_tree *tree,
 683			   struct extent_state *state,
 684			   int *bits)
 685{
 686	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 687
 688	set_state_cb(tree, state, bits);
 689	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 690		u64 range = state->end - state->start + 1;
 691		tree->dirty_bytes += range;
 692	}
 693	state->state |= bits_to_set;
 694}
 695
 696static void cache_state(struct extent_state *state,
 697			struct extent_state **cached_ptr)
 698{
 699	if (cached_ptr && !(*cached_ptr)) {
 700		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 701			*cached_ptr = state;
 702			atomic_inc(&state->refs);
 703		}
 704	}
 705}
 706
 707static void uncache_state(struct extent_state **cached_ptr)
 708{
 709	if (cached_ptr && (*cached_ptr)) {
 710		struct extent_state *state = *cached_ptr;
 711		*cached_ptr = NULL;
 712		free_extent_state(state);
 713	}
 714}
 715
/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 int bits, int exclusive_bits, u64 *failed_start,
		 struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	/* Preallocate while sleeping is allowed; under the lock we can
	 * only fall back to GFP_ATOMIC. */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	/* Fast path: a cached state that still covers 'start'. */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		/* Empty tail of the tree: insert the whole range. */
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		/* Continue inline if the successor is contiguous. */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	/* Drop the lock (and maybe reschedule) before rescanning. */
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
 921
 922int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 923		   u64 *failed_start, struct extent_state **cached_state,
 924		   gfp_t mask)
 925{
 926	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 927				cached_state, mask);
 928}
 929
 930
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       int bits, int clear_bits, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	/* Preallocate while sleeping is allowed; under the lock only
	 * GFP_ATOMIC is possible and may fail with -ENOMEM. */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		/* Nothing at or after 'start': insert the whole range
		 * with the new bits. */
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		if (err)
			extent_io_tree_panic(tree, err);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits);
		state = clear_state_bit(tree, state, &clear_bits, 0);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		/* Continue inline if the successor is contiguous. */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			state = clear_state_bit(tree, state, &clear_bits, 0);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		clear_state_bit(tree, prealloc, &clear_bits, 0);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	/* Drop the lock (and maybe reschedule) before rescanning. */
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
1116
1117/* wrappers around set/clear extent bit */
1118int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1119		     gfp_t mask)
1120{
1121	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1122			      NULL, mask);
1123}
1124
1125int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1126		    int bits, gfp_t mask)
1127{
1128	return set_extent_bit(tree, start, end, bits, NULL,
1129			      NULL, mask);
1130}
1131
1132int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1133		      int bits, gfp_t mask)
1134{
1135	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1136}
1137
1138int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1139			struct extent_state **cached_state, gfp_t mask)
1140{
1141	return set_extent_bit(tree, start, end,
1142			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1143			      NULL, cached_state, mask);
1144}
1145
 
 
 
 
 
 
 
 
1146int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1147		       gfp_t mask)
1148{
1149	return clear_extent_bit(tree, start, end,
1150				EXTENT_DIRTY | EXTENT_DELALLOC |
1151				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1152}
1153
1154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1155		     gfp_t mask)
1156{
1157	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1158			      NULL, mask);
1159}
1160
1161int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1162			struct extent_state **cached_state, gfp_t mask)
1163{
1164	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1165			      cached_state, mask);
1166}
1167
1168int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1169			  struct extent_state **cached_state, gfp_t mask)
1170{
1171	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1172				cached_state, mask);
1173}
1174
1175/*
1176 * either insert or lock state struct between start and end use mask to tell
1177 * us if waiting is desired.
1178 */
1179int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1180		     int bits, struct extent_state **cached_state)
1181{
1182	int err;
1183	u64 failed_start;
1184	while (1) {
1185		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1186				       EXTENT_LOCKED, &failed_start,
1187				       cached_state, GFP_NOFS);
1188		if (err == -EEXIST) {
1189			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1190			start = failed_start;
1191		} else
1192			break;
1193		WARN_ON(start > end);
1194	}
1195	return err;
1196}
1197
1198int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1199{
1200	return lock_extent_bits(tree, start, end, 0, NULL);
1201}
1202
1203int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1204{
1205	int err;
1206	u64 failed_start;
1207
1208	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1209			       &failed_start, NULL, GFP_NOFS);
1210	if (err == -EEXIST) {
1211		if (failed_start > start)
1212			clear_extent_bit(tree, start, failed_start - 1,
1213					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1214		return 0;
1215	}
1216	return 1;
1217}
1218
1219int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1220			 struct extent_state **cached, gfp_t mask)
1221{
1222	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1223				mask);
1224}
1225
1226int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1227{
1228	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1229				GFP_NOFS);
1230}
1231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1232/*
1233 * helper function to set both pages and extents in the tree writeback
1234 */
1235static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1236{
1237	unsigned long index = start >> PAGE_CACHE_SHIFT;
1238	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1239	struct page *page;
1240
1241	while (index <= end_index) {
1242		page = find_get_page(tree->mapping, index);
1243		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1244		set_page_writeback(page);
1245		page_cache_release(page);
1246		index++;
1247	}
1248	return 0;
1249}
1250
1251/* find the first state struct with 'bits' set after 'start', and
1252 * return it.  tree->lock must be held.  NULL will returned if
1253 * nothing was found after 'start'
1254 */
1255struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1256						 u64 start, int bits)
 
1257{
1258	struct rb_node *node;
1259	struct extent_state *state;
1260
1261	/*
1262	 * this search will find all the extents that end after
1263	 * our range starts.
1264	 */
1265	node = tree_search(tree, start);
1266	if (!node)
1267		goto out;
1268
1269	while (1) {
1270		state = rb_entry(node, struct extent_state, rb_node);
1271		if (state->end >= start && (state->state & bits))
1272			return state;
1273
1274		node = rb_next(node);
1275		if (!node)
1276			break;
1277	}
1278out:
1279	return NULL;
1280}
1281
1282/*
1283 * find the first offset in the io tree with 'bits' set. zero is
1284 * returned if we find something, and *start_ret and *end_ret are
1285 * set to reflect the state struct that was found.
1286 *
1287 * If nothing was found, 1 is returned. If found something, return 0.
1288 */
1289int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1290			  u64 *start_ret, u64 *end_ret, int bits)
 
1291{
1292	struct extent_state *state;
 
1293	int ret = 1;
1294
1295	spin_lock(&tree->lock);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1296	state = find_first_extent_bit_state(tree, start, bits);
 
1297	if (state) {
 
1298		*start_ret = state->start;
1299		*end_ret = state->end;
1300		ret = 0;
1301	}
 
1302	spin_unlock(&tree->lock);
1303	return ret;
1304}
1305
1306/*
1307 * find a contiguous range of bytes in the file marked as delalloc, not
1308 * more than 'max_bytes'.  start and end are used to return the range,
1309 *
1310 * 1 is returned if we find something, 0 if nothing was in the tree
1311 */
/*
 * Walk the tree under tree->lock, accumulating a contiguous run of
 * EXTENT_DELALLOC state records starting at *start.  On success, *start
 * and *end describe the run, *cached_state holds a referenced pointer to
 * the first state record (caller must free_extent_state() it), and the
 * number of records found is returned.  Returns 0 if no delalloc record
 * was found at *start.
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		/* stop at a gap in the run or at an explicit boundary */
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			/* hand a reference on the first record to the caller */
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}
1365
/*
 * unlock every page covering [start, end] except locked_page, which the
 * caller keeps locked.  Pages are looked up in batches of 16.
 */
static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	/* single-page range covering only locked_page: nothing to unlock */
	if (index == locked_page->index && end_index == index)
		return;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long, nr_pages,
				     ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
}
1394
/*
 * lock every page covering [delalloc_start, delalloc_end] except
 * locked_page, which the caller already holds locked.  Returns 0 on
 * success or -EAGAIN if a page disappeared or was no longer dirty, in
 * which case any pages locked so far are unlocked again before return.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/* pages vanished under us; caller will retry/shrink */
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					/* page was truncated or cleaned */
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		/* roll back the pages we did manage to lock */
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}
1456
1457/*
1458 * find a contiguous range of bytes in the file marked as delalloc, not
1459 * more than 'max_bytes'.  start and end are used to return the range,
1460 *
1461 * 1 is returned if we find something, 0 if nothing was in the tree
1462 */
/*
 * find a delalloc range starting at *start, lock all its pages and its
 * extent state, and return it via *start/*end.  Retries (shrinking the
 * search to one page after the first failure) when pages go away or the
 * range stops being delalloc under us.  Returns nonzero if a range was
 * found and locked, 0 otherwise.
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		/* range changed under us: unlock everything and start over */
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}
1544
/*
 * clear extent state bits on [start, end] and apply the page-level
 * operations selected by 'op' (clear dirty, set/end writeback, set
 * Private2, unlock) to every page in the range except locked_page.
 * Always returns 0.
 */
int extent_clear_unlock_delalloc(struct inode *inode,
				struct extent_io_tree *tree,
				u64 start, u64 end, struct page *locked_page,
				unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	/* translate the op flags into extent state bits to clear */
	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	/* done unless a page-level operation was requested */
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}
1601
1602/*
1603 * count the number of bytes in the tree that have a given bit(s)
1604 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1605 * cached.  The total number found is returned.
1606 */
/*
 * count bytes in [*start, search_end] whose state has all of 'bits' set,
 * up to max_bytes.  With 'contig', counting stops at the first gap.
 * *start is updated to the first matching offset.  The whole-tree
 * EXTENT_DIRTY query is answered from tree->dirty_bytes without walking.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	/* an empty or inverted range is a caller bug */
	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		/* fast path: cached running total of dirty bytes */
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			/* clip the record to the query window */
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}
1663
1664/*
1665 * set the private field for a given byte offset in the tree.  If there isn't
1666 * an extent_state there already, this does nothing.
1667 */
1668int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1669{
1670	struct rb_node *node;
1671	struct extent_state *state;
1672	int ret = 0;
1673
1674	spin_lock(&tree->lock);
1675	/*
1676	 * this search will find all the extents that end after
1677	 * our range starts.
1678	 */
1679	node = tree_search(tree, start);
1680	if (!node) {
1681		ret = -ENOENT;
1682		goto out;
1683	}
1684	state = rb_entry(node, struct extent_state, rb_node);
1685	if (state->start != start) {
1686		ret = -ENOENT;
1687		goto out;
1688	}
1689	state->private = private;
1690out:
1691	spin_unlock(&tree->lock);
1692	return ret;
1693}
1694
1695int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1696{
1697	struct rb_node *node;
1698	struct extent_state *state;
1699	int ret = 0;
1700
1701	spin_lock(&tree->lock);
1702	/*
1703	 * this search will find all the extents that end after
1704	 * our range starts.
1705	 */
1706	node = tree_search(tree, start);
1707	if (!node) {
1708		ret = -ENOENT;
1709		goto out;
1710	}
1711	state = rb_entry(node, struct extent_state, rb_node);
1712	if (state->start != start) {
1713		ret = -ENOENT;
1714		goto out;
1715	}
1716	*private = state->private;
1717out:
1718	spin_unlock(&tree->lock);
1719	return ret;
1720}
1721
1722/*
1723 * searches a range in the state tree for a given mask.
1724 * If 'filled' == 1, this returns 1 only if every extent in the tree
1725 * has the bits set.  Otherwise, 1 is returned if any bit in the
1726 * range is found set.
1727 */
/*
 * searches [start, end] in the state tree for the given bits.
 * If 'filled' == 1, returns 1 only if every byte of the range is covered
 * by states with the bits set (no gaps allowed); otherwise returns 1 if
 * any state in the range has any of the bits set.  A cached state that
 * covers 'start' is used to skip the tree search.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	/* only trust the cache while the state is still in the tree */
	if (cached && cached->tree && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		/* a gap before this state fails a 'filled' query */
		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		/* avoid overflowing 'start' past the last possible offset */
		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			/* ran out of states before covering the range */
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}
1777
1778/*
1779 * helper function to set a given page up to date if all the
1780 * extents in the tree for that page are up to date
1781 */
1782static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1783{
1784	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1785	u64 end = start + PAGE_CACHE_SIZE - 1;
1786	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1787		SetPageUptodate(page);
1788}
1789
1790/*
1791 * helper function to unlock a page if all the extents in the tree
1792 * for that page are unlocked
1793 */
1794static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1795{
1796	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1797	u64 end = start + PAGE_CACHE_SIZE - 1;
1798	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1799		unlock_page(page);
1800}
1801
1802/*
1803 * helper function to end page writeback if all the extents
1804 * in the tree for that page are done with writeback
1805 */
1806static void check_page_writeback(struct extent_io_tree *tree,
1807				 struct page *page)
1808{
1809	end_page_writeback(page);
1810}
1811
1812/*
1813 * When IO fails, either with EIO or csum verification fails, we
1814 * try other mirrors that might have a good copy of the data.  This
1815 * io_failure_record is used to record state as we go through all the
1816 * mirrors.  If another mirror has good data, the page is set up to date
1817 * and things continue.  If a good mirror can't be found, the original
1818 * bio end_io callback is called to indicate things have failed.
1819 */
struct io_failure_record {
	struct page *page;		/* page the failed read targeted */
	u64 start;			/* file offset of the failed range */
	u64 len;			/* length of the failed range */
	u64 logical;			/* logical disk address of the range */
	unsigned long bio_flags;	/* e.g. EXTENT_BIO_COMPRESSED */
	int this_mirror;		/* mirror currently being retried */
	int failed_mirror;		/* mirror the original error came from */
	int in_validation;		/* re-read in flight to validate a repair */
};
1830
/*
 * tear down an io_failure_record: clear its bookkeeping bits from the
 * inode's failure tree (and, if the sectors were rewritten, the
 * EXTENT_DAMAGED bit from the io tree) and free the record.  Returns the
 * first error from clearing the bits, 0 otherwise.
 */
static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
				int did_repair)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;

	/* drop the record pointer stashed in the state's private field */
	set_state_private(failure_tree, rec->start, 0);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
	if (ret)
		err = ret;

	if (did_repair) {
		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
					rec->start + rec->len - 1,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret && !err)
			err = ret;
	}

	kfree(rec);
	return err;
}
1856
/* bio end_io callback: wake the waiter in repair_io_failure() */
static void repair_io_failure_callback(struct bio *bio, int err)
{
	complete(bio->bi_private);
}
1861
1862/*
1863 * this bypasses the standard btrfs submit functions deliberately, as
1864 * the standard behavior is to write all copies in a raid setup. here we only
1865 * want to write the one bad copy. so we do the mapping for ourselves and issue
1866 * submit_bio directly.
1867 * to avoid any synchonization issues, wait for the data after writing, which
1868 * actually prevents the read that triggered the error from finishing.
1869 * currently, there can be no more than two copies of every data bit. thus,
1870 * exactly one rewrite is required.
1871 */
/*
 * synchronously rewrite one copy of [start, start+length) at the given
 * mirror with the data in 'page'.  Returns 0 on success, -EIO on any
 * mapping, device, or write failure.  See the comment above for why this
 * bypasses the normal submit path.
 */
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
			u64 length, u64 logical, struct page *page,
			int mirror_num)
{
	struct bio *bio;
	struct btrfs_device *dev;
	DECLARE_COMPLETION_ONSTACK(compl);
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	int ret;

	BUG_ON(!mirror_num);

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_private = &compl;
	bio->bi_end_io = repair_io_failure_callback;
	bio->bi_size = 0;
	map_length = length;

	/* resolve the logical address to the one target mirror */
	ret = btrfs_map_block(map_tree, WRITE, logical,
			      &map_length, &bbio, mirror_num);
	if (ret) {
		bio_put(bio);
		return -EIO;
	}
	BUG_ON(mirror_num != bbio->mirror_num);
	sector = bbio->stripes[mirror_num-1].physical >> 9;
	bio->bi_sector = sector;
	dev = bbio->stripes[mirror_num-1].dev;
	kfree(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, length, start-page_offset(page));
	/* submit and wait; the wait serializes against the failed read */
	btrfsic_submit_bio(WRITE_SYNC, bio);
	wait_for_completion(&compl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		/* try to remap that extent elsewhere? */
		bio_put(bio);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		return -EIO;
	}

	printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
		      start, rcu_str_deref(dev->name), sector);

	bio_put(bio);
	return 0;
}
1928
/*
 * rewrite every page of an extent buffer at the given mirror via
 * repair_io_failure().  Stops and returns the first error.
 */
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
			 int mirror_num)
{
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	u64 start = eb->start;
	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
	int ret = 0;

	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);
		/* for metadata, logical == start for each page */
		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
					start, p, mirror_num);
		if (ret)
			break;
		start += PAGE_CACHE_SIZE;
	}

	return ret;
}
1948
1949/*
1950 * each time an IO finishes, we do a fast check in the IO failure tree
1951 * to see if we need to process or clean up an io_failure_record
1952 */
/*
 * after a successful read, look up a pending io_failure_record for
 * 'start' on this inode.  If one exists, rewrite the bad mirror with the
 * good data when possible, then free the record.  Returns 0 when there
 * was nothing to do or cleanup succeeded.
 */
static int clean_io_failure(u64 start, struct page *page)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failrec;
	struct btrfs_mapping_tree *map_tree;
	struct extent_state *state;
	int num_copies;
	int did_repair = 0;
	int ret;
	struct inode *inode = page->mapping->host;

	/* cheap check: any failure records at all on this inode? */
	private = 0;
	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
				(u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
				&private_failure);
	if (ret)
		return 0;

	failrec = (struct io_failure_record *)(unsigned long) private_failure;
	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
			 failrec->start);
		did_repair = 1;
		goto out;
	}

	/* only repair while the range is still locked by the failed read */
	spin_lock(&BTRFS_I(inode)->io_tree.lock);
	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
					    failrec->start,
					    EXTENT_LOCKED);
	spin_unlock(&BTRFS_I(inode)->io_tree.lock);

	if (state && state->start == failrec->start) {
		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
		num_copies = btrfs_num_copies(map_tree, failrec->logical,
						failrec->len);
		if (num_copies > 1)  {
			ret = repair_io_failure(map_tree, start, failrec->len,
						failrec->logical, page,
						failrec->failed_mirror);
			did_repair = !ret;
		}
	}

out:
	if (!ret)
		ret = free_io_failure(inode, failrec, did_repair);

	return ret;
}
2011
2012/*
2013 * this is a generic handler for readpage errors (default
2014 * readpage_io_failed_hook). if other copies exist, read those and write back
2015 * good data to the failed position. does not investigate in remapping the
2016 * failed extent elsewhere, hoping the device will be smart enough to do this as
2017 * needed
2018 */
2019
/*
 * handle a failed read of [start, end] on 'page': create (or look up) an
 * io_failure_record, pick the next mirror to try, and resubmit a one-page
 * read bio through the tree's submit_bio_hook.  Returns the submit
 * result, or a negative error when no retry is possible (single copy,
 * mirrors exhausted, allocation failure, or no locked state found).
 */
static int bio_readpage_error(struct bio *failed_bio, struct page *page,
				u64 start, u64 end, int failed_mirror,
				struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int read_mode;
	u64 logical;

	BUG_ON(failed_bio->bi_rw & REQ_WRITE);

	/* first failure for this range? then build a new record */
	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->this_mirror = 0;
		failrec->bio_flags = 0;
		failrec->in_validation = 0;

		/* map the file offset back to its logical disk address */
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (!em) {
			read_unlock(&em_tree->lock);
			kfree(failrec);
			return -EIO;
		}

		if (em->start > start || em->start + em->len < start) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
			 "len=%llu\n", logical, start, failrec->len);
		failrec->logical = logical;
		free_extent_map(em);

		/* set the bits in the private failure tree */
		ret = set_extent_bits(failure_tree, start, end,
					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		if (ret >= 0)
			ret = set_state_private(failure_tree, start,
						(u64)(unsigned long)failrec);
		/* set the bits in the inode's tree */
		if (ret >= 0)
			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
						GFP_NOFS);
		if (ret < 0) {
			kfree(failrec);
			return ret;
		}
	} else {
		/* an earlier failure already recorded this range */
		failrec = (struct io_failure_record *)(unsigned long)private;
		pr_debug("bio_readpage_error: (found) logical=%llu, "
			 "start=%llu, len=%llu, validation=%d\n",
			 failrec->logical, failrec->start, failrec->len,
			 failrec->in_validation);
		/*
		 * when data can be on disk more than twice, add to failrec here
		 * (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
		 */
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
			 "state=%p, num_copies=%d, next_mirror %d, "
			 "failed_mirror %d\n", state, num_copies,
			 failrec->this_mirror, failed_mirror);
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}

	if (!state) {
		spin_lock(&tree->lock);
		state = find_first_extent_bit_state(tree, failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&tree->lock);
	}

	/*
	 * there are two premises:
	 *	a) deliver good data to the caller
	 *	b) correct the bad sectors on disk
	 */
	if (failed_bio->bi_vcnt > 1) {
		/*
		 * to fulfill b), we need to know the exact failing sectors, as
		 * we don't want to rewrite any more than the failed ones. thus,
		 * we need separate read requests for the failed bio
		 *
		 * if the following BUG_ON triggers, our validation request got
		 * merged. we need separate requests for our algorithm to work.
		 */
		BUG_ON(failrec->in_validation);
		failrec->in_validation = 1;
		failrec->this_mirror = failed_mirror;
		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
	} else {
		/*
		 * we're ready to fulfill a) and b) alongside. get a good copy
		 * of the failed sector and if we succeed, we have setup
		 * everything for repair_io_failure to do the rest for us.
		 */
		if (failrec->in_validation) {
			BUG_ON(failrec->this_mirror != failed_mirror);
			failrec->in_validation = 0;
			failrec->this_mirror = 0;
		}
		failrec->failed_mirror = failed_mirror;
		failrec->this_mirror++;
		if (failrec->this_mirror == failed_mirror)
			failrec->this_mirror++;
		read_mode = READ_SYNC;
	}

	if (!state || failrec->this_mirror > num_copies) {
		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
			 "next_mirror %d, failed_mirror %d\n", state,
			 num_copies, failrec->this_mirror, failed_mirror);
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio) {
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}
	bio->bi_private = state;
	bio->bi_end_io = failed_bio->bi_end_io;
	bio->bi_sector = failrec->logical >> 9;
	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
	bio->bi_size = 0;

	bio_add_page(bio, page, failrec->len, start - page_offset(page));

	pr_debug("bio_readpage_error: submitting new read[%#x] to "
		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
		 failrec->this_mirror, num_copies, failrec->in_validation);

	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
					 failrec->this_mirror,
					 failrec->bio_flags, 0);
	return ret;
}
2199
2200/* lots and lots of room for performance fixes in the end_bio funcs */
2201
2202int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2203{
2204	int uptodate = (err == 0);
2205	struct extent_io_tree *tree;
2206	int ret;
2207
2208	tree = &BTRFS_I(page->mapping->host)->io_tree;
2209
2210	if (tree->ops && tree->ops->writepage_end_io_hook) {
2211		ret = tree->ops->writepage_end_io_hook(page, start,
2212					       end, NULL, uptodate);
2213		if (ret)
2214			uptodate = 0;
2215	}
2216
2217	if (!uptodate) {
2218		ClearPageUptodate(page);
2219		SetPageError(page);
2220	}
2221	return 0;
2222}
2223
2224/*
2225 * after a writepage IO is done, we need to:
2226 * clear the uptodate bits on error
2227 * clear the writeback bits in the extent tree for this IO
2228 * end_page_writeback if the page has no more pending IO
2229 *
2230 * Scheduling is not allowed, so the extent state tree is expected
2231 * to have one and only one object corresponding to this IO.
2232 */
2233static void end_bio_extent_writepage(struct bio *bio, int err)
2234{
2235	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2236	struct extent_io_tree *tree;
2237	u64 start;
2238	u64 end;
2239	int whole_page;
2240
2241	do {
2242		struct page *page = bvec->bv_page;
2243		tree = &BTRFS_I(page->mapping->host)->io_tree;
2244
2245		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2246			 bvec->bv_offset;
2247		end = start + bvec->bv_len - 1;
2248
2249		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2250			whole_page = 1;
2251		else
2252			whole_page = 0;
 
 
 
 
 
 
 
 
 
 
 
 
2253
2254		if (--bvec >= bio->bi_io_vec)
2255			prefetchw(&bvec->bv_page->flags);
2256
2257		if (end_extent_writepage(page, err, start, end))
2258			continue;
2259
2260		if (whole_page)
2261			end_page_writeback(page);
2262		else
2263			check_page_writeback(tree, page);
2264	} while (bvec >= bio->bi_io_vec);
2265
2266	bio_put(bio);
2267}
2268
 
 
 
 
 
 
 
 
 
 
 
 
2269/*
2270 * after a readpage IO is done, we need to:
2271 * clear the uptodate bits on error
2272 * set the uptodate bits if things worked
2273 * set the page up to date if all extents in the tree are uptodate
2274 * clear the lock bit in the extent tree
2275 * unlock the page if there are no other extents locked for it
2276 *
2277 * Scheduling is not allowed, so the extent state tree is expected
2278 * to have one and only one object corresponding to this IO.
2279 */
2280static void end_bio_extent_readpage(struct bio *bio, int err)
2281{
 
2282	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2283	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2284	struct bio_vec *bvec = bio->bi_io_vec;
2285	struct extent_io_tree *tree;
 
2286	u64 start;
2287	u64 end;
2288	int whole_page;
 
 
2289	int mirror;
2290	int ret;
 
2291
2292	if (err)
2293		uptodate = 0;
2294
2295	do {
2296		struct page *page = bvec->bv_page;
2297		struct extent_state *cached = NULL;
2298		struct extent_state *state;
2299
2300		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2301			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2302			 (long int)bio->bi_bdev);
2303		tree = &BTRFS_I(page->mapping->host)->io_tree;
2304
2305		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2306			bvec->bv_offset;
2307		end = start + bvec->bv_len - 1;
2308
2309		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2310			whole_page = 1;
2311		else
2312			whole_page = 0;
2313
2314		if (++bvec <= bvec_end)
2315			prefetchw(&bvec->bv_page->flags);
2316
2317		spin_lock(&tree->lock);
2318		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2319		if (state && state->start == start) {
2320			/*
2321			 * take a reference on the state, unlock will drop
2322			 * the ref
2323			 */
2324			cache_state(state, &cached);
2325		}
2326		spin_unlock(&tree->lock);
2327
2328		mirror = (int)(unsigned long)bio->bi_bdev;
2329		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2330			ret = tree->ops->readpage_end_io_hook(page, start, end,
2331							      state, mirror);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2332			if (ret)
2333				uptodate = 0;
2334			else
2335				clean_io_failure(start, page);
2336		}
2337
2338		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
 
 
 
2339			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2340			if (!ret && !err &&
2341			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2342				uptodate = 1;
2343		} else if (!uptodate) {
2344			/*
2345			 * The generic bio_readpage_error handles errors the
2346			 * following way: If possible, new read requests are
2347			 * created and submitted and will end up in
2348			 * end_bio_extent_readpage as well (if we're lucky, not
2349			 * in the !uptodate case). In that case it returns 0 and
2350			 * we just go on with the next page in our bio. If it
2351			 * can't handle the error it will return -EIO and we
2352			 * remain responsible for that page.
2353			 */
2354			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
 
2355			if (ret == 0) {
2356				uptodate =
2357					test_bit(BIO_UPTODATE, &bio->bi_flags);
2358				if (err)
2359					uptodate = 0;
2360				uncache_state(&cached);
2361				continue;
2362			}
2363		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2364
2365		if (uptodate && tree->track_uptodate) {
2366			set_extent_uptodate(tree, start, end, &cached,
2367					    GFP_ATOMIC);
2368		}
2369		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2370
2371		if (whole_page) {
2372			if (uptodate) {
2373				SetPageUptodate(page);
2374			} else {
2375				ClearPageUptodate(page);
2376				SetPageError(page);
2377			}
2378			unlock_page(page);
 
 
 
 
 
 
2379		} else {
2380			if (uptodate) {
2381				check_page_uptodate(tree, page);
2382			} else {
2383				ClearPageUptodate(page);
2384				SetPageError(page);
2385			}
2386			check_page_locked(tree, page);
2387		}
2388	} while (bvec <= bvec_end);
2389
 
 
 
 
 
2390	bio_put(bio);
2391}
2392
 
 
 
 
2393struct bio *
2394btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2395		gfp_t gfp_flags)
2396{
 
2397	struct bio *bio;
2398
2399	bio = bio_alloc(gfp_flags, nr_vecs);
2400
2401	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2402		while (!bio && (nr_vecs /= 2))
2403			bio = bio_alloc(gfp_flags, nr_vecs);
 
 
2404	}
2405
2406	if (bio) {
2407		bio->bi_size = 0;
2408		bio->bi_bdev = bdev;
2409		bio->bi_sector = first_sector;
 
 
 
 
2410	}
2411	return bio;
2412}
2413
2414/*
2415 * Since writes are async, they will only return -ENOMEM.
2416 * Reads can return the full range of I/O error conditions.
2417 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2418static int __must_check submit_one_bio(int rw, struct bio *bio,
2419				       int mirror_num, unsigned long bio_flags)
2420{
2421	int ret = 0;
2422	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2423	struct page *page = bvec->bv_page;
2424	struct extent_io_tree *tree = bio->bi_private;
2425	u64 start;
2426
2427	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2428
2429	bio->bi_private = NULL;
2430
2431	bio_get(bio);
2432
2433	if (tree->ops && tree->ops->submit_bio_hook)
2434		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2435					   mirror_num, bio_flags, start);
2436	else
2437		btrfsic_submit_bio(rw, bio);
2438
2439	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2440		ret = -EOPNOTSUPP;
2441	bio_put(bio);
2442	return ret;
2443}
2444
2445static int merge_bio(struct extent_io_tree *tree, struct page *page,
2446		     unsigned long offset, size_t size, struct bio *bio,
2447		     unsigned long bio_flags)
2448{
2449	int ret = 0;
2450	if (tree->ops && tree->ops->merge_bio_hook)
2451		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2452						bio_flags);
2453	BUG_ON(ret < 0);
2454	return ret;
2455
2456}
2457
/*
 * Queue one page's worth of IO, merging into the caller's in-flight bio
 * (*bio_ret) when the new range is physically contiguous and the flags
 * match.  If merging is impossible the old bio is submitted and a fresh
 * one is started.  When bio_ret is NULL the new bio is submitted
 * immediately instead of being handed back.
 */
static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      unsigned long max_pages,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
{
	int ret = 0;
	struct bio *bio;
	int nr;
	int contig = 0;
	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		/*
		 * compressed bios all target the same starting sector;
		 * otherwise contiguity means "directly after the last byte"
		 */
		if (old_compressed)
			contig = bio->bi_sector == sector;
		else
			contig = bio->bi_sector + (bio->bi_size >> 9) ==
				sector;

		/*
		 * NOTE: the order matters — merge_bio and bio_add_page are
		 * only evaluated (and only have side effects) when the
		 * earlier conditions allow a merge at all.
		 */
		if (prev_bio_flags != bio_flags || !contig ||
		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			ret = submit_one_bio(rw, bio, mirror_num,
					     prev_bio_flags);
			if (ret < 0)
				return ret;
			bio = NULL;
		} else {
			/* merged into the existing bio, nothing to submit */
			return 0;
		}
	}
	if (this_compressed)
		nr = BIO_MAX_PAGES;
	else
		nr = bio_get_nr_vecs(bdev);

	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
	if (!bio)
		return -ENOMEM;

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

	return ret;
}
2517
2518void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
 
2519{
2520	if (!PagePrivate(page)) {
2521		SetPagePrivate(page);
2522		page_cache_get(page);
2523		set_page_private(page, (unsigned long)eb);
2524	} else {
2525		WARN_ON(page->private != (unsigned long)eb);
2526	}
2527}
2528
2529void set_page_extent_mapped(struct page *page)
2530{
2531	if (!PagePrivate(page)) {
2532		SetPagePrivate(page);
2533		page_cache_get(page);
2534		set_page_private(page, EXTENT_PAGE_PRIVATE);
2535	}
2536}
2537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2538/*
2539 * basic readpage implementation.  Locked extent state structs are inserted
2540 * into the tree that are removed when the IO is done (by the end_io
2541 * handlers)
2542 * XXX JDM: This needs looking at to ensure proper page locking
2543 */
2544static int __extent_read_full_page(struct extent_io_tree *tree,
2545				   struct page *page,
2546				   get_extent_t *get_extent,
2547				   struct bio **bio, int mirror_num,
2548				   unsigned long *bio_flags)
 
2549{
2550	struct inode *inode = page->mapping->host;
2551	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2552	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2553	u64 end;
2554	u64 cur = start;
2555	u64 extent_offset;
2556	u64 last_byte = i_size_read(inode);
2557	u64 block_start;
2558	u64 cur_end;
2559	sector_t sector;
2560	struct extent_map *em;
2561	struct block_device *bdev;
2562	struct btrfs_ordered_extent *ordered;
2563	int ret;
2564	int nr = 0;
 
2565	size_t pg_offset = 0;
2566	size_t iosize;
2567	size_t disk_io_size;
2568	size_t blocksize = inode->i_sb->s_blocksize;
2569	unsigned long this_bio_flag = 0;
2570
2571	set_page_extent_mapped(page);
2572
 
2573	if (!PageUptodate(page)) {
2574		if (cleancache_get_page(page) == 0) {
2575			BUG_ON(blocksize != PAGE_SIZE);
 
2576			goto out;
2577		}
2578	}
2579
2580	end = page_end;
2581	while (1) {
2582		lock_extent(tree, start, end);
2583		ordered = btrfs_lookup_ordered_extent(inode, start);
2584		if (!ordered)
2585			break;
2586		unlock_extent(tree, start, end);
2587		btrfs_start_ordered_extent(inode, ordered, 1);
2588		btrfs_put_ordered_extent(ordered);
2589	}
2590
2591	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2592		char *userpage;
2593		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2594
2595		if (zero_offset) {
2596			iosize = PAGE_CACHE_SIZE - zero_offset;
2597			userpage = kmap_atomic(page);
2598			memset(userpage + zero_offset, 0, iosize);
2599			flush_dcache_page(page);
2600			kunmap_atomic(userpage);
2601		}
2602	}
2603	while (cur <= end) {
 
 
2604		if (cur >= last_byte) {
2605			char *userpage;
2606			struct extent_state *cached = NULL;
2607
2608			iosize = PAGE_CACHE_SIZE - pg_offset;
2609			userpage = kmap_atomic(page);
2610			memset(userpage + pg_offset, 0, iosize);
2611			flush_dcache_page(page);
2612			kunmap_atomic(userpage);
2613			set_extent_uptodate(tree, cur, cur + iosize - 1,
2614					    &cached, GFP_NOFS);
2615			unlock_extent_cached(tree, cur, cur + iosize - 1,
2616					     &cached, GFP_NOFS);
 
 
2617			break;
2618		}
2619		em = get_extent(inode, page, pg_offset, cur,
2620				end - cur + 1, 0);
2621		if (IS_ERR_OR_NULL(em)) {
2622			SetPageError(page);
2623			unlock_extent(tree, cur, end);
 
2624			break;
2625		}
2626		extent_offset = cur - em->start;
2627		BUG_ON(extent_map_end(em) <= cur);
2628		BUG_ON(end < cur);
2629
2630		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2631			this_bio_flag = EXTENT_BIO_COMPRESSED;
2632			extent_set_compress_type(&this_bio_flag,
2633						 em->compress_type);
2634		}
2635
2636		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2637		cur_end = min(extent_map_end(em) - 1, end);
2638		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2639		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2640			disk_io_size = em->block_len;
2641			sector = em->block_start >> 9;
2642		} else {
2643			sector = (em->block_start + extent_offset) >> 9;
2644			disk_io_size = iosize;
2645		}
2646		bdev = em->bdev;
2647		block_start = em->block_start;
2648		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2649			block_start = EXTENT_MAP_HOLE;
2650		free_extent_map(em);
2651		em = NULL;
2652
2653		/* we've found a hole, just zero and go on */
2654		if (block_start == EXTENT_MAP_HOLE) {
2655			char *userpage;
2656			struct extent_state *cached = NULL;
2657
2658			userpage = kmap_atomic(page);
2659			memset(userpage + pg_offset, 0, iosize);
2660			flush_dcache_page(page);
2661			kunmap_atomic(userpage);
2662
2663			set_extent_uptodate(tree, cur, cur + iosize - 1,
2664					    &cached, GFP_NOFS);
2665			unlock_extent_cached(tree, cur, cur + iosize - 1,
2666			                     &cached, GFP_NOFS);
2667			cur = cur + iosize;
2668			pg_offset += iosize;
2669			continue;
2670		}
2671		/* the get_extent function already copied into the page */
2672		if (test_range_bit(tree, cur, cur_end,
2673				   EXTENT_UPTODATE, 1, NULL)) {
2674			check_page_uptodate(tree, page);
2675			unlock_extent(tree, cur, cur + iosize - 1);
 
2676			cur = cur + iosize;
2677			pg_offset += iosize;
2678			continue;
2679		}
2680		/* we have an inline extent but it didn't get marked up
2681		 * to date.  Error out
2682		 */
2683		if (block_start == EXTENT_MAP_INLINE) {
2684			SetPageError(page);
2685			unlock_extent(tree, cur, cur + iosize - 1);
 
2686			cur = cur + iosize;
2687			pg_offset += iosize;
2688			continue;
2689		}
2690
2691		ret = 0;
2692		if (tree->ops && tree->ops->readpage_io_hook) {
2693			ret = tree->ops->readpage_io_hook(page, cur,
2694							  cur + iosize - 1);
2695		}
2696		if (!ret) {
2697			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2698			pnr -= page->index;
2699			ret = submit_extent_page(READ, tree, page,
2700					 sector, disk_io_size, pg_offset,
2701					 bdev, bio, pnr,
2702					 end_bio_extent_readpage, mirror_num,
2703					 *bio_flags,
2704					 this_bio_flag);
2705			BUG_ON(ret == -ENOMEM);
2706			nr++;
2707			*bio_flags = this_bio_flag;
2708		}
2709		if (ret)
2710			SetPageError(page);
 
 
 
2711		cur = cur + iosize;
2712		pg_offset += iosize;
2713	}
2714out:
2715	if (!nr) {
2716		if (!PageError(page))
2717			SetPageUptodate(page);
2718		unlock_page(page);
2719	}
2720	return 0;
2721}
2722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Read a single page: wrapper around __extent_read_full_page that also
 * submits any bio the helper left pending, since no further pages will
 * be merged into it.
 */
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
			    get_extent_t *get_extent, int mirror_num)
{
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;
	int ret;

	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
				      &bio_flags);
	if (bio)
		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
	return ret;
}
2736
2737static noinline void update_nr_written(struct page *page,
2738				      struct writeback_control *wbc,
2739				      unsigned long nr_written)
2740{
2741	wbc->nr_to_write -= nr_written;
2742	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2743	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2744		page->mapping->writeback_index = page->index + nr_written;
2745}
2746
2747/*
2748 * the writepage semantics are similar to regular writepage.  extent
2749 * records are inserted to lock ranges in the tree, and as dirty areas
2750 * are found, they are marked writeback.  Then the lock bits are removed
2751 * and the end_io handler clears the writeback ranges
2752 */
2753static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2754			      void *data)
2755{
2756	struct inode *inode = page->mapping->host;
2757	struct extent_page_data *epd = data;
2758	struct extent_io_tree *tree = epd->tree;
2759	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2760	u64 delalloc_start;
2761	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2762	u64 end;
2763	u64 cur = start;
2764	u64 extent_offset;
2765	u64 last_byte = i_size_read(inode);
2766	u64 block_start;
2767	u64 iosize;
2768	sector_t sector;
2769	struct extent_state *cached_state = NULL;
2770	struct extent_map *em;
2771	struct block_device *bdev;
2772	int ret;
2773	int nr = 0;
2774	size_t pg_offset = 0;
2775	size_t blocksize;
2776	loff_t i_size = i_size_read(inode);
2777	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2778	u64 nr_delalloc;
2779	u64 delalloc_end;
2780	int page_started;
2781	int compressed;
2782	int write_flags;
2783	unsigned long nr_written = 0;
2784	bool fill_delalloc = true;
2785
2786	if (wbc->sync_mode == WB_SYNC_ALL)
2787		write_flags = WRITE_SYNC;
2788	else
2789		write_flags = WRITE;
2790
2791	trace___extent_writepage(page, inode, wbc);
2792
2793	WARN_ON(!PageLocked(page));
2794
2795	ClearPageError(page);
2796
2797	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2798	if (page->index > end_index ||
2799	   (page->index == end_index && !pg_offset)) {
2800		page->mapping->a_ops->invalidatepage(page, 0);
2801		unlock_page(page);
2802		return 0;
2803	}
2804
2805	if (page->index == end_index) {
2806		char *userpage;
2807
2808		userpage = kmap_atomic(page);
2809		memset(userpage + pg_offset, 0,
2810		       PAGE_CACHE_SIZE - pg_offset);
2811		kunmap_atomic(userpage);
2812		flush_dcache_page(page);
2813	}
2814	pg_offset = 0;
2815
2816	set_page_extent_mapped(page);
2817
2818	if (!tree->ops || !tree->ops->fill_delalloc)
2819		fill_delalloc = false;
2820
2821	delalloc_start = start;
2822	delalloc_end = 0;
2823	page_started = 0;
2824	if (!epd->extent_locked && fill_delalloc) {
2825		u64 delalloc_to_write = 0;
2826		/*
2827		 * make sure the wbc mapping index is at least updated
2828		 * to this page.
2829		 */
2830		update_nr_written(page, wbc, 0);
2831
2832		while (delalloc_end < page_end) {
2833			nr_delalloc = find_lock_delalloc_range(inode, tree,
2834						       page,
2835						       &delalloc_start,
2836						       &delalloc_end,
2837						       128 * 1024 * 1024);
2838			if (nr_delalloc == 0) {
2839				delalloc_start = delalloc_end + 1;
2840				continue;
2841			}
2842			ret = tree->ops->fill_delalloc(inode, page,
2843						       delalloc_start,
2844						       delalloc_end,
2845						       &page_started,
2846						       &nr_written);
2847			/* File system has been set read-only */
2848			if (ret) {
2849				SetPageError(page);
2850				goto done;
2851			}
2852			/*
2853			 * delalloc_end is already one less than the total
2854			 * length, so we don't subtract one from
2855			 * PAGE_CACHE_SIZE
2856			 */
2857			delalloc_to_write += (delalloc_end - delalloc_start +
2858					      PAGE_CACHE_SIZE) >>
2859					      PAGE_CACHE_SHIFT;
2860			delalloc_start = delalloc_end + 1;
2861		}
2862		if (wbc->nr_to_write < delalloc_to_write) {
2863			int thresh = 8192;
2864
2865			if (delalloc_to_write < thresh * 2)
2866				thresh = delalloc_to_write;
2867			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2868						 thresh);
2869		}
2870
2871		/* did the fill delalloc function already unlock and start
2872		 * the IO?
2873		 */
2874		if (page_started) {
2875			ret = 0;
2876			/*
2877			 * we've unlocked the page, so we can't update
2878			 * the mapping's writeback index, just update
2879			 * nr_to_write.
2880			 */
2881			wbc->nr_to_write -= nr_written;
2882			goto done_unlocked;
2883		}
2884	}
2885	if (tree->ops && tree->ops->writepage_start_hook) {
2886		ret = tree->ops->writepage_start_hook(page, start,
2887						      page_end);
2888		if (ret) {
2889			/* Fixup worker will requeue */
2890			if (ret == -EBUSY)
2891				wbc->pages_skipped++;
2892			else
2893				redirty_page_for_writepage(wbc, page);
2894			update_nr_written(page, wbc, nr_written);
2895			unlock_page(page);
2896			ret = 0;
2897			goto done_unlocked;
2898		}
2899	}
2900
2901	/*
2902	 * we don't want to touch the inode after unlocking the page,
2903	 * so we update the mapping writeback index now
2904	 */
2905	update_nr_written(page, wbc, nr_written + 1);
2906
2907	end = page_end;
2908	if (last_byte <= start) {
2909		if (tree->ops && tree->ops->writepage_end_io_hook)
2910			tree->ops->writepage_end_io_hook(page, start,
2911							 page_end, NULL, 1);
2912		goto done;
2913	}
2914
2915	blocksize = inode->i_sb->s_blocksize;
2916
2917	while (cur <= end) {
2918		if (cur >= last_byte) {
2919			if (tree->ops && tree->ops->writepage_end_io_hook)
2920				tree->ops->writepage_end_io_hook(page, cur,
2921							 page_end, NULL, 1);
2922			break;
2923		}
2924		em = epd->get_extent(inode, page, pg_offset, cur,
2925				     end - cur + 1, 1);
2926		if (IS_ERR_OR_NULL(em)) {
2927			SetPageError(page);
2928			break;
2929		}
2930
2931		extent_offset = cur - em->start;
2932		BUG_ON(extent_map_end(em) <= cur);
2933		BUG_ON(end < cur);
2934		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2935		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2936		sector = (em->block_start + extent_offset) >> 9;
2937		bdev = em->bdev;
2938		block_start = em->block_start;
2939		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2940		free_extent_map(em);
2941		em = NULL;
2942
2943		/*
2944		 * compressed and inline extents are written through other
2945		 * paths in the FS
2946		 */
2947		if (compressed || block_start == EXTENT_MAP_HOLE ||
2948		    block_start == EXTENT_MAP_INLINE) {
2949			/*
2950			 * end_io notification does not happen here for
2951			 * compressed extents
2952			 */
2953			if (!compressed && tree->ops &&
2954			    tree->ops->writepage_end_io_hook)
2955				tree->ops->writepage_end_io_hook(page, cur,
2956							 cur + iosize - 1,
2957							 NULL, 1);
2958			else if (compressed) {
2959				/* we don't want to end_page_writeback on
2960				 * a compressed extent.  this happens
2961				 * elsewhere
2962				 */
2963				nr++;
2964			}
2965
2966			cur += iosize;
2967			pg_offset += iosize;
2968			continue;
2969		}
2970		/* leave this out until we have a page_mkwrite call */
2971		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2972				   EXTENT_DIRTY, 0, NULL)) {
2973			cur = cur + iosize;
2974			pg_offset += iosize;
2975			continue;
2976		}
2977
2978		if (tree->ops && tree->ops->writepage_io_hook) {
2979			ret = tree->ops->writepage_io_hook(page, cur,
2980						cur + iosize - 1);
2981		} else {
2982			ret = 0;
2983		}
2984		if (ret) {
2985			SetPageError(page);
2986		} else {
2987			unsigned long max_nr = end_index + 1;
2988
2989			set_range_writeback(tree, cur, cur + iosize - 1);
2990			if (!PageWriteback(page)) {
2991				printk(KERN_ERR "btrfs warning page %lu not "
2992				       "writeback, cur %llu end %llu\n",
2993				       page->index, (unsigned long long)cur,
2994				       (unsigned long long)end);
2995			}
2996
2997			ret = submit_extent_page(write_flags, tree, page,
2998						 sector, iosize, pg_offset,
2999						 bdev, &epd->bio, max_nr,
3000						 end_bio_extent_writepage,
3001						 0, 0, 0);
3002			if (ret)
3003				SetPageError(page);
3004		}
3005		cur = cur + iosize;
3006		pg_offset += iosize;
3007		nr++;
3008	}
3009done:
3010	if (nr == 0) {
3011		/* make sure the mapping tag for page dirty gets cleared */
3012		set_page_writeback(page);
3013		end_page_writeback(page);
3014	}
3015	unlock_page(page);
3016
3017done_unlocked:
3018
3019	/* drop our reference on any cached states */
3020	free_extent_state(cached_state);
3021	return 0;
3022}
3023
/* wait_on_bit action: sleep until the waited-on bit clears */
static int eb_wait(void *word)
{
	io_schedule();
	return 0;
}
3029
/* block (uninterruptibly) until eb's WRITEBACK flag is cleared */
static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
		    TASK_UNINTERRUPTIBLE);
}
3035
/*
 * Prepare @eb for writeback: take the tree write lock, wait out (or skip,
 * for non-sync writeback) any writeback already in flight, transition
 * DIRTY -> WRITEBACK, adjust dirty-metadata accounting and lock all of the
 * eb's pages.  Returns 1 if the eb was dirty and the caller should write
 * it, 0 if there is nothing to do.  May flush epd's pending bio to avoid
 * deadlocking against IO we are waiting on.
 */
static int lock_extent_buffer_for_io(struct extent_buffer *eb,
				     struct btrfs_fs_info *fs_info,
				     struct extent_page_data *epd)
{
	unsigned long i, num_pages;
	int flush = 0;
	int ret = 0;

	if (!btrfs_try_tree_write_lock(eb)) {
		/* flush our bio first, the lock holder may be waiting on it */
		flush = 1;
		flush_write_bio(epd);
		btrfs_tree_lock(eb);
	}

	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
		btrfs_tree_unlock(eb);
		if (!epd->sync_io)
			return 0;
		if (!flush) {
			flush_write_bio(epd);
			flush = 1;
		}
		/* re-check under the lock; writeback may restart meanwhile */
		while (1) {
			wait_on_extent_buffer_writeback(eb);
			btrfs_tree_lock(eb);
			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				break;
			btrfs_tree_unlock(eb);
		}
	}

	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
		spin_lock(&fs_info->delalloc_lock);
		if (fs_info->dirty_metadata_bytes >= eb->len)
			fs_info->dirty_metadata_bytes -= eb->len;
		else
			WARN_ON(1);	/* accounting went negative */
		spin_unlock(&fs_info->delalloc_lock);
		ret = 1;
	}

	btrfs_tree_unlock(eb);

	if (!ret)
		return ret;

	/* lock every page backing the eb; flush first if we'd block */
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);

		if (!trylock_page(p)) {
			if (!flush) {
				flush_write_bio(epd);
				flush = 1;
			}
			lock_page(p);
		}
	}

	return ret;
}
3099
/* clear eb's WRITEBACK flag and wake anyone waiting on it; the barrier
 * orders the clear before the wakeup check */
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	smp_mb__after_clear_bit();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
3106
/*
 * bio completion for extent-buffer (metadata) writeback.  Walks the
 * io_vec array back to front, ends page writeback for each page and,
 * once the eb's last outstanding page finishes, clears the eb's
 * WRITEBACK flag.  On error the pages and the eb are flagged so readers
 * see the failure.
 */
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
	int uptodate = err == 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_buffer *eb;
	int done;

	do {
		struct page *page = bvec->bv_page;

		bvec--;
		eb = (struct extent_buffer *)page->private;
		BUG_ON(!eb);
		done = atomic_dec_and_test(&eb->io_pages);

		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		end_page_writeback(page);

		if (!done)
			continue;

		/* last page of this eb finished: wake waiters */
		end_extent_buffer_writeback(eb);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);

}
3139
/*
 * Submit writeback for every page of one extent buffer.  io_pages is
 * pre-set to the full page count; on a submission failure the count is
 * reduced by the pages that will never be submitted so the end_io
 * handler still reaches zero, and the remaining (still-locked) pages
 * are unlocked.  Returns 0 on success, -EIO on submission failure.
 */
static int write_one_eb(struct extent_buffer *eb,
			struct btrfs_fs_info *fs_info,
			struct writeback_control *wbc,
			struct extent_page_data *epd)
{
	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
	u64 offset = eb->start;
	unsigned long i, num_pages;
	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
	int ret = 0;

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	atomic_set(&eb->io_pages, num_pages);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);

		clear_page_dirty_for_io(p);
		set_page_writeback(p);
		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
					 -1, end_bio_extent_buffer_writepage,
					 0, 0, 0);
		if (ret) {
			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
			SetPageError(p);
			/* drop the never-to-be-submitted pages' IO counts */
			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				end_extent_buffer_writeback(eb);
			ret = -EIO;
			break;
		}
		offset += PAGE_CACHE_SIZE;
		update_nr_written(p, wbc, 1);
		unlock_page(p);
	}

	/* error path: pages i..num_pages-1 were never unlocked above */
	if (unlikely(ret)) {
		for (; i < num_pages; i++) {
			struct page *p = extent_buffer_page(eb, i);
			unlock_page(p);
		}
	}

	return ret;
}
3185
/*
 * Writeback entry point for the btree inode: scan the mapping for dirty
 * metadata pages, resolve each page to its extent buffer and write whole
 * extent buffers out via write_one_eb.  Mirrors the structure of the
 * generic write_cache_pages loop (tagged pagevec lookup, cyclic wrap).
 */
int btree_write_cache_pages(struct address_space *mapping,
				   struct writeback_control *wbc)
{
	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
	struct extent_buffer *eb, *prev_eb = NULL;
	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		scanned = 1;
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	/* tag first so pages dirtied during the scan don't make us loop */
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			if (!PagePrivate(page))
				continue;

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				break;
			}

			/*
			 * NOTE(review): page->private is read without the
			 * mapping's private lock here; later kernels added
			 * locking around this lookup.
			 */
			eb = (struct extent_buffer *)page->private;
			if (!eb) {
				WARN_ON(1);
				continue;
			}

			/* multiple pages map to one eb; write it only once */
			if (eb == prev_eb)
				continue;

			if (!atomic_inc_not_zero(&eb->refs)) {
				WARN_ON(1);
				continue;
			}

			prev_eb = eb;
			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
			if (!ret) {
				/* eb wasn't dirty, nothing to write */
				free_extent_buffer(eb);
				continue;
			}

			ret = write_one_eb(eb, fs_info, wbc, &epd);
			if (ret) {
				done = 1;
				free_extent_buffer(eb);
				break;
			}
			free_extent_buffer(eb);

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	/* push out anything still batched in epd.bio */
	flush_write_bio(&epd);
	return ret;
}
3292
3293/**
3294 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3295 * @mapping: address space structure to write
3296 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3297 * @writepage: function called for each page
3298 * @data: data passed to writepage function
3299 *
3300 * If a page is already under I/O, write_cache_pages() skips it, even
3301 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3302 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3303 * and msync() need to guarantee that all the data which was dirty at the time
3304 * the call was made get new I/O started against them.  If wbc->sync_mode is
3305 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3306 * existing IO to complete.
3307 */
3308static int extent_write_cache_pages(struct extent_io_tree *tree,
3309			     struct address_space *mapping,
3310			     struct writeback_control *wbc,
3311			     writepage_t writepage, void *data,
3312			     void (*flush_fn)(void *))
3313{
3314	struct inode *inode = mapping->host;
3315	int ret = 0;
3316	int done = 0;
3317	int nr_to_write_done = 0;
3318	struct pagevec pvec;
3319	int nr_pages;
3320	pgoff_t index;
3321	pgoff_t end;		/* Inclusive */
3322	int scanned = 0;
3323	int tag;
3324
3325	/*
3326	 * We have to hold onto the inode so that ordered extents can do their
3327	 * work when the IO finishes.  The alternative to this is failing to add
3328	 * an ordered extent if the igrab() fails there and that is a huge pain
3329	 * to deal with, so instead just hold onto the inode throughout the
3330	 * writepages operation.  If it fails here we are freeing up the inode
3331	 * anyway and we'd rather not waste our time writing out stuff that is
3332	 * going to be truncated anyway.
3333	 */
3334	if (!igrab(inode))
3335		return 0;
3336
3337	pagevec_init(&pvec, 0);
3338	if (wbc->range_cyclic) {
3339		index = mapping->writeback_index; /* Start from prev offset */
3340		end = -1;
3341	} else {
3342		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3343		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3344		scanned = 1;
3345	}
3346	if (wbc->sync_mode == WB_SYNC_ALL)
3347		tag = PAGECACHE_TAG_TOWRITE;
3348	else
3349		tag = PAGECACHE_TAG_DIRTY;
3350retry:
3351	if (wbc->sync_mode == WB_SYNC_ALL)
3352		tag_pages_for_writeback(mapping, index, end);
3353	while (!done && !nr_to_write_done && (index <= end) &&
3354	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3355			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3356		unsigned i;
3357
3358		scanned = 1;
3359		for (i = 0; i < nr_pages; i++) {
3360			struct page *page = pvec.pages[i];
3361
3362			/*
3363			 * At this point we hold neither mapping->tree_lock nor
3364			 * lock on the page itself: the page may be truncated or
3365			 * invalidated (changing page->mapping to NULL), or even
3366			 * swizzled back from swapper_space to tmpfs file
3367			 * mapping
3368			 */
3369			if (tree->ops &&
3370			    tree->ops->write_cache_pages_lock_hook) {
3371				tree->ops->write_cache_pages_lock_hook(page,
3372							       data, flush_fn);
3373			} else {
3374				if (!trylock_page(page)) {
3375					flush_fn(data);
3376					lock_page(page);
3377				}
3378			}
3379
3380			if (unlikely(page->mapping != mapping)) {
3381				unlock_page(page);
3382				continue;
3383			}
3384
3385			if (!wbc->range_cyclic && page->index > end) {
3386				done = 1;
3387				unlock_page(page);
3388				continue;
3389			}
3390
3391			if (wbc->sync_mode != WB_SYNC_NONE) {
3392				if (PageWriteback(page))
3393					flush_fn(data);
3394				wait_on_page_writeback(page);
3395			}
3396
3397			if (PageWriteback(page) ||
3398			    !clear_page_dirty_for_io(page)) {
3399				unlock_page(page);
3400				continue;
3401			}
3402
3403			ret = (*writepage)(page, wbc, data);
3404
3405			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3406				unlock_page(page);
3407				ret = 0;
3408			}
3409			if (ret)
3410				done = 1;
3411
3412			/*
3413			 * the filesystem may choose to bump up nr_to_write.
3414			 * We have to make sure to honor the new nr_to_write
3415			 * at any time
3416			 */
3417			nr_to_write_done = wbc->nr_to_write <= 0;
3418		}
3419		pagevec_release(&pvec);
3420		cond_resched();
3421	}
3422	if (!scanned && !done) {
3423		/*
3424		 * We hit the last page and there is more work to be done: wrap
3425		 * back to the start of the file
3426		 */
3427		scanned = 1;
3428		index = 0;
3429		goto retry;
3430	}
3431	btrfs_add_delayed_iput(inode);
3432	return ret;
3433}
3434
3435static void flush_epd_write_bio(struct extent_page_data *epd)
3436{
3437	if (epd->bio) {
3438		int rw = WRITE;
3439		int ret;
3440
3441		if (epd->sync_io)
3442			rw = WRITE_SYNC;
3443
3444		ret = submit_one_bio(rw, epd->bio, 0, 0);
3445		BUG_ON(ret < 0); /* -ENOMEM */
3446		epd->bio = NULL;
3447	}
3448}
3449
3450static noinline void flush_write_bio(void *data)
3451{
3452	struct extent_page_data *epd = data;
3453	flush_epd_write_bio(epd);
3454}
3455
3456int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3457			  get_extent_t *get_extent,
3458			  struct writeback_control *wbc)
3459{
3460	int ret;
3461	struct extent_page_data epd = {
3462		.bio = NULL,
3463		.tree = tree,
3464		.get_extent = get_extent,
3465		.extent_locked = 0,
3466		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 
3467	};
3468
3469	ret = __extent_writepage(page, wbc, &epd);
3470
3471	flush_epd_write_bio(&epd);
3472	return ret;
3473}
3474
/*
 * Write back the already-locked range [start, end] of @inode.
 *
 * extent_locked = 1 tells __extent_writepage that the caller holds the
 * extent state locks for this range; clean pages are reported straight to
 * the end_io hook as if they had been written.
 *
 * NOTE(review): find_get_page() is not checked for NULL below; this relies
 * on the caller keeping every page in the range present in the page cache —
 * confirm against the callers of this function.
 */
int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
			      u64 start, u64 end, get_extent_t *get_extent,
			      int mode)
{
	int ret = 0;
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	/* number of whole pages covered by [start, end] */
	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
		.extent_locked = 1,
		.sync_io = mode == WB_SYNC_ALL,
	};
	struct writeback_control wbc_writepages = {
		.sync_mode	= mode,
		.nr_to_write	= nr_pages * 2,
		.range_start	= start,
		.range_end	= end + 1,
	};

	while (start <= end) {
		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
		if (clear_page_dirty_for_io(page))
			ret = __extent_writepage(page, &wbc_writepages, &epd);
		else {
			/* page is clean: report it done to the end_io hook */
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, start,
						 start + PAGE_CACHE_SIZE - 1,
						 NULL, 1);
			unlock_page(page);
		}
		page_cache_release(page);
		start += PAGE_CACHE_SIZE;
	}

	flush_epd_write_bio(&epd);
	return ret;
}
3517
3518int extent_writepages(struct extent_io_tree *tree,
3519		      struct address_space *mapping,
3520		      get_extent_t *get_extent,
3521		      struct writeback_control *wbc)
3522{
3523	int ret = 0;
3524	struct extent_page_data epd = {
3525		.bio = NULL,
3526		.tree = tree,
3527		.get_extent = get_extent,
3528		.extent_locked = 0,
3529		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 
3530	};
3531
3532	ret = extent_write_cache_pages(tree, mapping, wbc,
3533				       __extent_writepage, &epd,
3534				       flush_write_bio);
3535	flush_epd_write_bio(&epd);
3536	return ret;
3537}
3538
3539int extent_readpages(struct extent_io_tree *tree,
3540		     struct address_space *mapping,
3541		     struct list_head *pages, unsigned nr_pages,
3542		     get_extent_t get_extent)
3543{
3544	struct bio *bio = NULL;
3545	unsigned page_idx;
3546	unsigned long bio_flags = 0;
 
 
 
 
3547
3548	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3549		struct page *page = list_entry(pages->prev, struct page, lru);
3550
3551		prefetchw(&page->flags);
3552		list_del(&page->lru);
3553		if (!add_to_page_cache_lru(page, mapping,
3554					page->index, GFP_NOFS)) {
3555			__extent_read_full_page(tree, page, get_extent,
3556						&bio, 0, &bio_flags);
3557		}
3558		page_cache_release(page);
3559	}
 
 
 
 
 
 
 
 
 
 
 
 
 
3560	BUG_ON(!list_empty(pages));
3561	if (bio)
3562		return submit_one_bio(READ, bio, 0, bio_flags);
3563	return 0;
3564}
3565
3566/*
3567 * basic invalidatepage code, this waits on any locked or writeback
3568 * ranges corresponding to the page, and then deletes any extent state
3569 * records from the tree
3570 */
3571int extent_invalidatepage(struct extent_io_tree *tree,
3572			  struct page *page, unsigned long offset)
3573{
3574	struct extent_state *cached_state = NULL;
3575	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3576	u64 end = start + PAGE_CACHE_SIZE - 1;
3577	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3578
3579	start += (offset + blocksize - 1) & ~(blocksize - 1);
3580	if (start > end)
3581		return 0;
3582
3583	lock_extent_bits(tree, start, end, 0, &cached_state);
3584	wait_on_page_writeback(page);
3585	clear_extent_bit(tree, start, end,
3586			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3587			 EXTENT_DO_ACCOUNTING,
3588			 1, 1, &cached_state, GFP_NOFS);
3589	return 0;
3590}
3591
3592/*
3593 * a helper for releasepage, this tests for areas of the page that
3594 * are locked or under IO and drops the related state bits if it is safe
3595 * to drop the page.
3596 */
3597int try_release_extent_state(struct extent_map_tree *map,
3598			     struct extent_io_tree *tree, struct page *page,
3599			     gfp_t mask)
3600{
3601	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3602	u64 end = start + PAGE_CACHE_SIZE - 1;
3603	int ret = 1;
3604
3605	if (test_range_bit(tree, start, end,
3606			   EXTENT_IOBITS, 0, NULL))
3607		ret = 0;
3608	else {
3609		if ((mask & GFP_NOFS) == GFP_NOFS)
3610			mask = GFP_NOFS;
3611		/*
3612		 * at this point we can safely clear everything except the
3613		 * locked bit and the nodatasum bit
3614		 */
3615		ret = clear_extent_bit(tree, start, end,
3616				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3617				 0, 0, NULL, mask);
3618
3619		/* if clear_extent_bit failed for enomem reasons,
3620		 * we can't allow the release to continue.
3621		 */
3622		if (ret < 0)
3623			ret = 0;
3624		else
3625			ret = 1;
3626	}
3627	return ret;
3628}
3629
3630/*
3631 * a helper for releasepage.  As long as there are no locked extents
3632 * in the range corresponding to the page, both state records and extent
3633 * map records are removed
3634 */
3635int try_release_extent_mapping(struct extent_map_tree *map,
3636			       struct extent_io_tree *tree, struct page *page,
3637			       gfp_t mask)
3638{
3639	struct extent_map *em;
3640	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3641	u64 end = start + PAGE_CACHE_SIZE - 1;
3642
3643	if ((mask & __GFP_WAIT) &&
3644	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3645		u64 len;
3646		while (start <= end) {
3647			len = end - start + 1;
3648			write_lock(&map->lock);
3649			em = lookup_extent_mapping(map, start, len);
3650			if (!em) {
3651				write_unlock(&map->lock);
3652				break;
3653			}
3654			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3655			    em->start != start) {
3656				write_unlock(&map->lock);
3657				free_extent_map(em);
3658				break;
3659			}
3660			if (!test_range_bit(tree, em->start,
3661					    extent_map_end(em) - 1,
3662					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3663					    0, NULL)) {
3664				remove_extent_mapping(map, em);
3665				/* once for the rb tree */
3666				free_extent_map(em);
3667			}
3668			start = extent_map_end(em);
3669			write_unlock(&map->lock);
3670
3671			/* once for us */
3672			free_extent_map(em);
3673		}
3674	}
3675	return try_release_extent_state(map, tree, page, mask);
3676}
3677
3678/*
3679 * helper function for fiemap, which doesn't want to see any holes.
3680 * This maps until we find something past 'last'
3681 */
3682static struct extent_map *get_extent_skip_holes(struct inode *inode,
3683						u64 offset,
3684						u64 last,
3685						get_extent_t *get_extent)
3686{
3687	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3688	struct extent_map *em;
3689	u64 len;
3690
3691	if (offset >= last)
3692		return NULL;
3693
3694	while(1) {
3695		len = last - offset;
3696		if (len == 0)
3697			break;
3698		len = (len + sectorsize - 1) & ~(sectorsize - 1);
3699		em = get_extent(inode, NULL, 0, offset, len, 0);
3700		if (IS_ERR_OR_NULL(em))
3701			return em;
3702
3703		/* if this isn't a hole return it */
3704		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3705		    em->block_start != EXTENT_MAP_HOLE) {
3706			return em;
3707		}
3708
3709		/* this is a hole, advance to the next extent */
3710		offset = extent_map_end(em);
3711		free_extent_map(em);
3712		if (offset >= last)
3713			break;
3714	}
3715	return NULL;
3716}
3717
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * FIEMAP ioctl implementation: report the extents backing [start, start+len)
 * of @inode to @fieinfo via fiemap_fill_next_extent().
 *
 * The last file extent item is looked up first so the FIEMAP_EXTENT_LAST
 * flag can be set correctly even when preallocated extents exist past
 * i_size.  The io_tree is locked over the whole walk.
 */
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len, get_extent_t *get_extent)
{
	int ret = 0;
	u64 off = start;
	u64 max = start + len;
	u32 flags = 0;
	u32 found_type;
	u64 last;
	u64 last_for_get_extent = 0;
	u64 disko = 0;
	u64 isize = i_size_read(inode);
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *item;
	int end = 0;
	u64 em_start = 0;
	u64 em_len = 0;
	u64 em_end = 0;
	unsigned long emflags;

	if (len == 0)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;

	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);

	/*
	 * lookup the last file extent.  We're not using i_size here
	 * because there might be preallocation past i_size
	 */
	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
				       path, btrfs_ino(inode), -1, 0);
	if (ret < 0) {
		btrfs_free_path(path);
		return ret;
	}
	/*
	 * offset -1 can never match a key exactly, so ret should be > 0 and
	 * the previous slot holds the last extent item (if any).
	 */
	WARN_ON(!ret);
	path->slots[0]--;
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = btrfs_key_type(&found_key);

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so its much less complex to remember where it started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_free_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	/*
	 * NOTE(review): lock/unlock cover [start, start + len] inclusive,
	 * which is one byte past the aligned request — confirm against
	 * lock_extent_bits()'s inclusive-end convention.
	 */
	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				   get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below
		 */
		offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		emflags = em->flags;
		disko = 0;
		flags = 0;

		/*
		 * bump off for our next call to get_extent
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else {
			/* regular extent: report the on-disk byte address */
			disko = em->block_start + offset_in_extent;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;

		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		   (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* now scan forward to see if this is really the last extent. */
		em = get_extent_skip_holes(inode, off, last_for_get_extent,
					   get_extent);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		if (!em) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}
		/* emit the extent gathered in the previous iteration */
		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
					      em_len, flags);
		if (ret)
			goto out_free;
	}
out_free:
	free_extent_map(em);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
			     &cached_state, GFP_NOFS);
	return ret;
}
3887
3888inline struct page *extent_buffer_page(struct extent_buffer *eb,
3889					      unsigned long i)
3890{
3891	return eb->pages[i];
 
3892}
3893
3894inline unsigned long num_extent_pages(u64 start, u64 len)
3895{
3896	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3897		(start >> PAGE_CACHE_SHIFT);
 
3898}
3899
/*
 * Free the extent_buffer structure itself (and its out-of-line page pointer
 * array, if one was allocated).  Does NOT release the backing pages; see
 * btrfs_release_extent_buffer_page() for that.
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
	unsigned long flags;
	spin_lock_irqsave(&leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	/* only kfree the array if it was kzalloc'ed, not the inline one */
	if (eb->pages && eb->pages != eb->inline_pages)
		kfree(eb->pages);
	kmem_cache_free(extent_buffer_cache, eb);
}
3912
3913static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3914						   u64 start,
3915						   unsigned long len,
3916						   gfp_t mask)
 
 
 
 
 
 
 
 
3917{
3918	struct extent_buffer *eb = NULL;
3919#if LEAK_DEBUG
3920	unsigned long flags;
3921#endif
3922
3923	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3924	if (eb == NULL)
3925		return NULL;
3926	eb->start = start;
3927	eb->len = len;
3928	eb->tree = tree;
3929	eb->bflags = 0;
3930	rwlock_init(&eb->lock);
3931	atomic_set(&eb->write_locks, 0);
3932	atomic_set(&eb->read_locks, 0);
3933	atomic_set(&eb->blocking_readers, 0);
3934	atomic_set(&eb->blocking_writers, 0);
3935	atomic_set(&eb->spinning_readers, 0);
3936	atomic_set(&eb->spinning_writers, 0);
3937	eb->lock_nested = 0;
3938	init_waitqueue_head(&eb->write_lock_wq);
3939	init_waitqueue_head(&eb->read_lock_wq);
3940
3941#if LEAK_DEBUG
3942	spin_lock_irqsave(&leak_lock, flags);
3943	list_add(&eb->leak_list, &buffers);
3944	spin_unlock_irqrestore(&leak_lock, flags);
3945#endif
3946	spin_lock_init(&eb->refs_lock);
3947	atomic_set(&eb->refs, 1);
3948	atomic_set(&eb->io_pages, 0);
3949
3950	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
3951		struct page **pages;
3952		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
3953			PAGE_CACHE_SHIFT;
3954		pages = kzalloc(num_pages, mask);
3955		if (!pages) {
3956			__free_extent_buffer(eb);
3957			return NULL;
3958		}
3959		eb->pages = pages;
3960	} else {
3961		eb->pages = eb->inline_pages;
3962	}
3963
3964	return eb;
3965}
3966
/*
 * Make a private, unmapped (DUMMY) copy of @src: fresh pages are allocated,
 * the contents copied, and the clone marked uptodate.  Returns NULL if the
 * extent_buffer itself can't be allocated.
 *
 * NOTE(review): alloc_page(GFP_ATOMIC) failure hits BUG_ON() rather than
 * unwinding — GFP_ATOMIC can fail under pressure, so this deserves a real
 * error path; deferred here because the cleanup helpers are defined later
 * in this file.
 */
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	unsigned long i;
	struct page *p;
	struct extent_buffer *new;
	unsigned long num_pages = num_extent_pages(src->start, src->len);

	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
	if (new == NULL)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		p = alloc_page(GFP_ATOMIC);
		BUG_ON(!p);
		attach_extent_buffer_page(new, p);
		WARN_ON(PageDirty(p));
		SetPageUptodate(p);
		new->pages[i] = p;
	}

	copy_extent_buffer(new, src, 0, 0, src->len);
	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
	/* DUMMY: not backed by the page cache, never written back */
	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);

	return new;
}
3993
3994struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3995{
3996	struct extent_buffer *eb;
3997	unsigned long num_pages = num_extent_pages(0, len);
3998	unsigned long i;
3999
4000	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4001	if (!eb)
4002		return NULL;
4003
4004	for (i = 0; i < num_pages; i++) {
4005		eb->pages[i] = alloc_page(GFP_ATOMIC);
4006		if (!eb->pages[i])
4007			goto err;
4008	}
4009	set_extent_buffer_uptodate(eb);
4010	btrfs_set_header_nritems(eb, 0);
4011	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4012
4013	return eb;
4014err:
4015	for (i--; i > 0; i--)
4016		__free_page(eb->pages[i]);
4017	__free_extent_buffer(eb);
4018	return NULL;
4019}
4020
4021static int extent_buffer_under_io(struct extent_buffer *eb)
4022{
4023	return (atomic_read(&eb->io_pages) ||
4024		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4025		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4026}
4027
4028/*
4029 * Helper for releasing extent buffer page.
4030 */
4031static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4032						unsigned long start_idx)
4033{
4034	unsigned long index;
4035	unsigned long num_pages;
4036	struct page *page;
4037	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4038
4039	BUG_ON(extent_buffer_under_io(eb));
4040
4041	num_pages = num_extent_pages(eb->start, eb->len);
4042	index = start_idx + num_pages;
4043	if (start_idx >= index)
4044		return;
4045
4046	do {
4047		index--;
4048		page = extent_buffer_page(eb, index);
4049		if (page && mapped) {
4050			spin_lock(&page->mapping->private_lock);
4051			/*
4052			 * We do this since we'll remove the pages after we've
4053			 * removed the eb from the radix tree, so we could race
4054			 * and have this page now attached to the new eb.  So
4055			 * only clear page_private if it's still connected to
4056			 * this eb.
4057			 */
4058			if (PagePrivate(page) &&
4059			    page->private == (unsigned long)eb) {
4060				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4061				BUG_ON(PageDirty(page));
4062				BUG_ON(PageWriteback(page));
4063				/*
4064				 * We need to make sure we haven't be attached
4065				 * to a new eb.
4066				 */
4067				ClearPagePrivate(page);
4068				set_page_private(page, 0);
4069				/* One for the page private */
4070				page_cache_release(page);
4071			}
4072			spin_unlock(&page->mapping->private_lock);
4073
4074		}
4075		if (page) {
4076			/* One for when we alloced the page */
4077			page_cache_release(page);
4078		}
4079	} while (index != start_idx);
4080}
4081
4082/*
4083 * Helper for releasing the extent buffer.
4084 */
4085static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4086{
4087	btrfs_release_extent_buffer_page(eb, 0);
4088	__free_extent_buffer(eb);
4089}
4090
/* Ensure @eb holds the TREE_REF reference (see the ordering note below). */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.   Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.  If someone
	 * beat us to it, drop the ref we added.
	 */
	if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		atomic_inc(&eb->refs);
		if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
			atomic_dec(&eb->refs);
	}
}
4119
4120static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4121{
4122	unsigned long num_pages, i;
4123
4124	check_buffer_tree_ref(eb);
4125
4126	num_pages = num_extent_pages(eb->start, eb->len);
4127	for (i = 0; i < num_pages; i++) {
4128		struct page *p = extent_buffer_page(eb, i);
4129		mark_page_accessed(p);
4130	}
4131}
4132
/*
 * Find or create the extent_buffer covering [start, start + len) in @tree.
 *
 * Fast path: RCU lookup in the radix tree.  Slow path: allocate a new eb,
 * populate (or adopt) its page-cache pages, then race to insert it into
 * the radix tree; the loser drops its copy and returns the winner's.
 * Returns the buffer with an elevated refcount, or NULL on failure.
 */
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p) {
			WARN_ON(1);
			goto free_eb;
		}

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so lets see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can just
			 * overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				page_cache_release(p);
				mark_extent_buffer_accessed(exists);
				goto free_eb;
			}

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			page_cache_release(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		mark_page_accessed(p);
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		/* someone inserted a buffer for this range first; use theirs */
		exists = radix_tree_lookup(&tree->buffer,
						start >> PAGE_CACHE_SHIFT);
		if (!atomic_inc_not_zero(&exists->refs)) {
			/* the existing eb is on its way out; retry the insert */
			spin_unlock(&tree->buffer_lock);
			radix_tree_preload_end();
			exists = NULL;
			goto again;
		}
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		mark_extent_buffer_accessed(exists);
		goto free_eb;
	}
	/* add one reference for the tree */
	spin_lock(&eb->refs_lock);
	check_buffer_tree_ref(eb);
	spin_unlock(&eb->refs_lock);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = extent_buffer_page(eb, i);
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	WARN_ON(!atomic_dec_and_test(&eb->refs));
	btrfs_release_extent_buffer(eb);
	return exists;
}
4263
4264struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4265					 u64 start, unsigned long len)
4266{
4267	struct extent_buffer *eb;
4268
4269	rcu_read_lock();
4270	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4271	if (eb && atomic_inc_not_zero(&eb->refs)) {
4272		rcu_read_unlock();
4273		mark_extent_buffer_accessed(eb);
4274		return eb;
4275	}
4276	rcu_read_unlock();
4277
4278	return NULL;
4279}
4280
/* RCU callback: free the eb structure after the grace period expires. */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}
4288
/*
 * Drop one reference on @eb; on the last ref, remove it from the radix
 * tree (unless it is a DUMMY buffer), release its pages and free the
 * structure via RCU.  Expects to have eb->refs_lock already held; the
 * lock is always released before returning.
 */
static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
			/* dummy buffers were never inserted into the tree */
			spin_unlock(&eb->refs_lock);
		} else {
			struct extent_io_tree *tree = eb->tree;

			/* drop refs_lock before taking buffer_lock */
			spin_unlock(&eb->refs_lock);

			spin_lock(&tree->buffer_lock);
			radix_tree_delete(&tree->buffer,
					  eb->start >> PAGE_CACHE_SHIFT);
			spin_unlock(&tree->buffer_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb, 0);

		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return;
	}
	spin_unlock(&eb->refs_lock);
}
4315
/*
 * Drop the caller's reference on @eb.
 *
 * Before the final put, this may also fold away the implicit "tree ref":
 * for dummy buffers a refcount of 2 means only the tree ref remains
 * besides ours; for stale, idle buffers the TREE_REF bit is cleared and
 * its reference dropped so the buffer can actually die.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	/* dummy buffers: collapse the extra ref taken at creation */
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	/* stale and idle: give back the tree's reference as well */
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb, GFP_ATOMIC);
}
4338
/*
 * Like free_extent_buffer(), but first marks @eb stale so that, if only
 * the tree reference remains and no IO is in flight, the tree ref is
 * dropped too and the buffer is torn down.
 */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	/* ours + tree ref only, and idle: retire the tree ref now */
	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb, GFP_NOFS);
}
4352
/*
 * Clear the dirty state of every page backing @eb.
 *
 * Besides clearing the per-page dirty flag, the PAGECACHE_TAG_DIRTY radix
 * tree tag must be dropped under mapping->tree_lock so writeback does not
 * still find the page via tag lookups.
 */
void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		/* tree_lock guards the mapping's radix tree tags */
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}
4382
/*
 * Mark @eb and all of its pages dirty.
 *
 * Returns nonzero if the buffer was already dirty (the DIRTY bit was
 * already set), zero if this call made it dirty.
 */
int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	int was_dirty = 0;

	/* make sure the tree holds a ref before the buffer goes dirty */
	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb->start, eb->len);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	for (i = 0; i < num_pages; i++)
		set_page_dirty(extent_buffer_page(eb, i));
	return was_dirty;
}
4401
4402static int range_straddles_pages(u64 start, u64 len)
4403{
4404	if (len < PAGE_CACHE_SIZE)
4405		return 1;
4406	if (start & (PAGE_CACHE_SIZE - 1))
4407		return 1;
4408	if ((start + len) & (PAGE_CACHE_SIZE - 1))
4409		return 1;
4410	return 0;
4411}
4412
4413int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4414{
4415	unsigned long i;
4416	struct page *page;
4417	unsigned long num_pages;
4418
4419	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4420	num_pages = num_extent_pages(eb->start, eb->len);
4421	for (i = 0; i < num_pages; i++) {
4422		page = extent_buffer_page(eb, i);
4423		if (page)
4424			ClearPageUptodate(page);
4425	}
4426	return 0;
4427}
4428
4429int set_extent_buffer_uptodate(struct extent_buffer *eb)
4430{
4431	unsigned long i;
4432	struct page *page;
4433	unsigned long num_pages;
4434
4435	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4436	num_pages = num_extent_pages(eb->start, eb->len);
4437	for (i = 0; i < num_pages; i++) {
4438		page = extent_buffer_page(eb, i);
4439		SetPageUptodate(page);
4440	}
4441	return 0;
4442}
4443
/*
 * Return 1 if [start, end] is fully uptodate, 0 otherwise.
 *
 * First consults the extent state tree when the range is not purely
 * whole pages; otherwise walks the page cache page by page.
 */
int extent_range_uptodate(struct extent_io_tree *tree,
			  u64 start, u64 end)
{
	struct page *page;
	int ret;
	int pg_uptodate = 1;
	int uptodate;
	unsigned long index;

	if (range_straddles_pages(start, end - start + 1)) {
		ret = test_range_bit(tree, start, end,
				     EXTENT_UPTODATE, 1, NULL);
		if (ret)
			return 1;
	}
	while (start <= end) {
		index = start >> PAGE_CACHE_SHIFT;
		page = find_get_page(tree->mapping, index);
		/*
		 * NOTE(review): a missing page is treated as uptodate
		 * (return 1) — presumably a hole; confirm against callers.
		 */
		if (!page)
			return 1;
		uptodate = PageUptodate(page);
		page_cache_release(page);
		if (!uptodate) {
			pg_uptodate = 0;
			break;
		}
		start += PAGE_CACHE_SIZE;
	}
	return pg_uptodate;
}
4474
/* Nonzero if the buffer-wide UPTODATE bit is set (raw test_bit value). */
int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
4479
/*
 * Read the pages of @eb from disk, starting at byte offset @start
 * (0 means the whole buffer).
 *
 * Phases:
 *   1. lock every page from start_i on (trylock only for WAIT_NONE);
 *   2. if everything was already uptodate, just unlock and return;
 *   3. otherwise submit reads for the non-uptodate pages (uptodate ones
 *      are unlocked immediately; the read completion unlocks the rest);
 *   4. for WAIT_COMPLETE, wait for each page and report -EIO on failure.
 *
 * Returns 0 on success, a negative errno on submission/read failure.
 */
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, u64 start, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	unsigned long start_i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	/* translate the byte offset into a page index within the buffer */
	if (start) {
		WARN_ON(start < eb->start);
		start_i = (start >> PAGE_CACHE_SHIFT) -
			(eb->start >> PAGE_CACHE_SHIFT);
	} else {
		start_i = 0;
	}

	/* phase 1: lock pages and count how many need reading */
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		if (start_i == 0)
			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	/* phase 3: submit reads; completion handler unlocks those pages */
	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags);
			if (err)
				ret = err;
		} else {
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(READ, bio, mirror_num, bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* phase 4: synchronous wait for each page to come back uptodate */
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	i = start_i;
	while (locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}
4573
4574void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4575			unsigned long start,
4576			unsigned long len)
4577{
4578	size_t cur;
4579	size_t offset;
4580	struct page *page;
4581	char *kaddr;
4582	char *dst = (char *)dstv;
4583	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4584	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4585
4586	WARN_ON(start > eb->len);
4587	WARN_ON(start + len > eb->start + eb->len);
4588
4589	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4590
4591	while (len > 0) {
4592		page = extent_buffer_page(eb, i);
4593
4594		cur = min(len, (PAGE_CACHE_SIZE - offset));
4595		kaddr = page_address(page);
4596		memcpy(dst, kaddr + offset, cur);
4597
4598		dst += cur;
4599		len -= cur;
4600		offset = 0;
4601		i++;
4602	}
4603}
4604
/*
 * Map a contiguous region of @eb directly, without copying.
 *
 * Succeeds only when [start, start + min_len) lies entirely within one
 * page; on success *map points at the data, *map_start is the buffer
 * offset the mapping begins at and *map_len how far it extends.
 *
 * Returns 0 on success, -EINVAL if the region crosses a page boundary
 * or runs past the end of the buffer.
 */
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
			       unsigned long min_len, char **map,
			       unsigned long *map_start,
			       unsigned long *map_len)
{
	size_t offset = start & (PAGE_CACHE_SIZE - 1);
	char *kaddr;
	struct page *p;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	unsigned long end_i = (start_offset + start + min_len - 1) >>
		PAGE_CACHE_SHIFT;

	/* region spans two pages: cannot hand out a flat pointer */
	if (i != end_i)
		return -EINVAL;

	if (i == 0) {
		/* first page also holds the (possibly unaligned) eb start */
		offset = start_offset;
		*map_start = 0;
	} else {
		offset = 0;
		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
	}

	if (start + min_len > eb->len) {
		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
		       "wanted %lu %lu\n", (unsigned long long)eb->start,
		       eb->len, start, min_len);
		WARN_ON(1);
		return -EINVAL;
	}

	p = extent_buffer_page(eb, i);
	kaddr = page_address(p);
	*map = kaddr + offset;
	*map_len = PAGE_CACHE_SIZE - offset;
	return 0;
}
4643
4644int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4645			  unsigned long start,
4646			  unsigned long len)
4647{
4648	size_t cur;
4649	size_t offset;
4650	struct page *page;
4651	char *kaddr;
4652	char *ptr = (char *)ptrv;
4653	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4654	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4655	int ret = 0;
4656
4657	WARN_ON(start > eb->len);
4658	WARN_ON(start + len > eb->start + eb->len);
4659
4660	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4661
4662	while (len > 0) {
4663		page = extent_buffer_page(eb, i);
4664
4665		cur = min(len, (PAGE_CACHE_SIZE - offset));
4666
4667		kaddr = page_address(page);
4668		ret = memcmp(ptr, kaddr + offset, cur);
4669		if (ret)
4670			break;
4671
4672		ptr += cur;
4673		len -= cur;
4674		offset = 0;
4675		i++;
4676	}
4677	return ret;
4678}
4679
4680void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4681			 unsigned long start, unsigned long len)
4682{
4683	size_t cur;
4684	size_t offset;
4685	struct page *page;
4686	char *kaddr;
4687	char *src = (char *)srcv;
4688	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4689	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4690
4691	WARN_ON(start > eb->len);
4692	WARN_ON(start + len > eb->start + eb->len);
4693
4694	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4695
4696	while (len > 0) {
4697		page = extent_buffer_page(eb, i);
4698		WARN_ON(!PageUptodate(page));
4699
4700		cur = min(len, PAGE_CACHE_SIZE - offset);
4701		kaddr = page_address(page);
4702		memcpy(kaddr + offset, src, cur);
4703
4704		src += cur;
4705		len -= cur;
4706		offset = 0;
4707		i++;
4708	}
4709}
4710
4711void memset_extent_buffer(struct extent_buffer *eb, char c,
4712			  unsigned long start, unsigned long len)
4713{
4714	size_t cur;
4715	size_t offset;
4716	struct page *page;
4717	char *kaddr;
4718	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4719	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4720
4721	WARN_ON(start > eb->len);
4722	WARN_ON(start + len > eb->start + eb->len);
4723
4724	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4725
4726	while (len > 0) {
4727		page = extent_buffer_page(eb, i);
4728		WARN_ON(!PageUptodate(page));
4729
4730		cur = min(len, PAGE_CACHE_SIZE - offset);
4731		kaddr = page_address(page);
4732		memset(kaddr + offset, c, cur);
4733
4734		len -= cur;
4735		offset = 0;
4736		i++;
4737	}
4738}
4739
/*
 * Copy @len bytes from @src at @src_offset into @dst at @dst_offset,
 * page by page on the destination side (read_extent_buffer handles the
 * source's page walk).  The two buffers must have the same length.
 */
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(dst, i);
		WARN_ON(!PageUptodate(page));

		/* never cross a destination page boundary in one chunk */
		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}
4772
/*
 * Copy @len bytes from @src_page/@src_off to @dst_page/@dst_off on
 * behalf of memmove_extent_buffer().
 *
 * When source and destination live in the same page the ranges may
 * overlap, so memmove() is required.  Two distinct struct pages are
 * mapped at distinct, non-overlapping addresses, so plain memcpy() is
 * safe there — this replaces the previous byte-at-a-time backwards
 * copy loop, which was only ever needed for the overlapping case.
 */
static void move_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);

	if (dst_page == src_page) {
		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
	} else {
		char *src_kaddr = page_address(src_page);

		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
	}
}
4789
/*
 * True when the two byte ranges [src, src + len) and [dst, dst + len)
 * share at least one byte, i.e. their starts are closer than @len.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long gap;

	if (src > dst)
		gap = src - dst;
	else
		gap = dst - src;
	return gap < len;
}
4795
/*
 * Copy @len bytes between two page-relative ranges for
 * memcpy_extent_buffer().  Falls back to memmove() only when source and
 * destination are in the same page and actually overlap.
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;
	int overlapping = 0;

	if (dst_page == src_page) {
		src_kaddr = dst_kaddr;
		if (areas_overlap(src_off, dst_off, len))
			overlapping = 1;
	} else {
		src_kaddr = page_address(src_page);
	}

	if (overlapping)
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
4817
/*
 * Copy @len bytes within a single extent buffer, from @src_offset to
 * @dst_offset, walking forward.  Both ranges must lie inside the
 * buffer; copy_pages() handles any per-page overlap.
 */
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		dst_off_in_page = (start_offset + dst_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		/* each chunk stays within one source AND one dest page */
		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}
4862
/*
 * Overlap-safe move of @len bytes within one extent buffer.
 *
 * When the destination starts below the source a plain forward copy is
 * safe and memcpy_extent_buffer() is used; otherwise the copy proceeds
 * backwards from the last byte so overlapping ranges are preserved.
 */
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset < src_offset) {
		/* no risk of clobbering: forward copy is fine */
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		/* chunk may not extend past the start of either page */
		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}
4910
/*
 * Called from releasepage: try to free the extent buffer attached to
 * @page.  Returns 1 if the page can be released, 0 if the buffer is
 * still referenced or under IO.
 */
int try_release_extent_buffer(struct page *page, gfp_t mask)
{
	struct extent_buffer *eb;

	/*
	 * We need to make sure noboody is attaching this page to an eb right
	 * now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);

	/* normalize to plain GFP_NOFS when the NOFS bits are present */
	if ((mask & GFP_NOFS) == GFP_NOFS)
		mask = GFP_NOFS;

	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}
	/* consumes refs_lock and drops the tree reference */
	release_extent_buffer(eb, mask);

	return 1;
}
v3.15
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
 
   7#include <linux/spinlock.h>
   8#include <linux/blkdev.h>
   9#include <linux/swap.h>
  10#include <linux/writeback.h>
  11#include <linux/pagevec.h>
  12#include <linux/prefetch.h>
  13#include <linux/cleancache.h>
  14#include "extent_io.h"
  15#include "extent_map.h"
 
  16#include "ctree.h"
  17#include "btrfs_inode.h"
  18#include "volumes.h"
  19#include "check-integrity.h"
  20#include "locking.h"
  21#include "rcu-string.h"
  22#include "backref.h"
  23
  24static struct kmem_cache *extent_state_cache;
  25static struct kmem_cache *extent_buffer_cache;
  26static struct bio_set *btrfs_bioset;
  27
#ifdef CONFIG_BTRFS_DEBUG
/* Global lists of all live extent_buffers / extent_states (debug builds). */
static LIST_HEAD(buffers);
static LIST_HEAD(states);

/* Protects both leak lists above. */
static DEFINE_SPINLOCK(leak_lock);

/* Record a newly allocated object on the given leak list. */
static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

/* Remove an object from its leak list at free time. */
static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

/*
 * At module exit, report and reclaim any extent_state / extent_buffer
 * objects that were never freed.
 */
static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       state->start, state->end, state->state, state->tree,
		       atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
		       "refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
/*
 * Sanity check an io range: extent ranges normally end at an odd byte
 * offset (…-1); an even end that is neither small nor isize-1 is
 * suspicious and gets logged.
 */
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	/* trees without a backing mapping have no inode to check against */
	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		printk_ratelimited(KERN_DEBUG
		    "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
				caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif
 105
 106#define BUFFER_LRU_MAX 64
 107
/* Generic rb-tree node keyed by an inclusive [start, end] byte range. */
struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};
 113
/* Per-call context threaded through the extent writepage(s) paths. */
struct extent_page_data {
	struct bio *bio;		/* bio being built across pages */
	struct extent_io_tree *tree;	/* io tree the pages belong to */
	get_extent_t *get_extent;	/* callback mapping file -> extents */
	unsigned long bio_flags;	/* flags for the bio under construction */

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};
 128
 129static noinline void flush_write_bio(void *data);
/*
 * Resolve the fs_info a tree belongs to via its mapping's superblock;
 * trees without a backing mapping yield NULL.
 */
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (!tree->mapping)
		return NULL;
	return btrfs_sb(tree->mapping->host->i_sb);
}
 137
/*
 * Module init: create the extent_state / extent_buffer slab caches and
 * the btrfs bioset.  On any failure, tear down whatever was created so
 * far (goto-cleanup chain) and return -ENOMEM.
 */
int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	/* bios carry a btrfs_io_bio wrapper; offsetof locates the bio */
	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				     offsetof(struct btrfs_io_bio, bio));
	if (!btrfs_bioset)
		goto free_buffer_cache;

	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_free(btrfs_bioset);
	btrfs_bioset = NULL;

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}
 175
/*
 * Module exit: report leaked objects, then destroy the slab caches and
 * bioset created by extent_io_init().
 */
void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
	if (btrfs_bioset)
		bioset_free(btrfs_bioset);
}
 192
 193void extent_io_tree_init(struct extent_io_tree *tree,
 194			 struct address_space *mapping)
 195{
 196	tree->state = RB_ROOT;
 
 197	tree->ops = NULL;
 198	tree->dirty_bytes = 0;
 199	spin_lock_init(&tree->lock);
 
 200	tree->mapping = mapping;
 201}
 202
/*
 * Allocate and initialize an extent_state from the slab cache.
 * Starts with one reference; returns NULL on allocation failure.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
	/* debug builds track every live state for leak checking */
	btrfs_leak_debug_add(&state->leak_list, &states);
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}
 219
/*
 * Drop one reference on @state; frees it back to the slab cache when the
 * last reference goes away.  NULL is tolerated.
 */
void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		/* must already be unlinked from its tree */
		WARN_ON(state->tree);
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}
 231
/*
 * Insert @node (keyed by @offset) into @root.
 *
 * If the caller already did the search (p_in/parent_in from
 * tree_search_for_insert), the walk is skipped.  @search_start lets the
 * walk begin at a hint node instead of the root.
 *
 * Returns NULL on success, or the existing node whose [start, end]
 * range contains @offset.
 */
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		/* caller already found the insertion point */
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}
 267
/*
 * Search the state tree for the entry containing @offset.
 *
 * On an exact hit the node is returned.  On a miss, NULL is returned
 * and the out parameters are filled in: *p_ret/*parent_ret give the
 * insertion point for tree_insert(); *prev_ret is walked forward to the
 * first entry ending at or after @offset (i.e. the next entry), while
 * *next_ret is walked backward to the first entry starting at or before
 * @offset (i.e. the previous one).  The prev/next naming is historical.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (prev_ret) {
		orig_prev = prev;
		/* step forward to the first entry ending >= offset */
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		/* step backward to the first entry starting <= offset */
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}
 319
 320static inline struct rb_node *
 321tree_search_for_insert(struct extent_io_tree *tree,
 322		       u64 offset,
 323		       struct rb_node ***p_ret,
 324		       struct rb_node **parent_ret)
 325{
 326	struct rb_node *prev = NULL;
 327	struct rb_node *ret;
 328
 329	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
 330	if (!ret)
 331		return prev;
 332	return ret;
 333}
 334
/*
 * Find the state containing @offset, or the nearest state after it when
 * there is no exact match (may return NULL past the end of the tree).
 */
static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}
 340
/* Invoke the owner's merge hook (if any) before two states are merged. */
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}
 348
 349/*
 350 * utility function to look for merge candidates inside a given range.
 351 * Any extents with matching state are merged together into a single
 352 * extent in the tree.  Extents with EXTENT_IO in their state field
 353 * are not merged because the end_io handlers need to be able to do
 354 * operations on them without sleeping (or doing allocations/splits).
 355 *
 356 * This should be called with the tree lock held.
 357 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	/* states with IO or boundary bits must stay distinct */
	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	/* try to absorb the immediately preceding state */
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	/* and the immediately following one */
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}
 392
/* Invoke the owner's set_bit hook (if any) when bits are being set. */
static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
 399
/* Invoke the owner's clear_bit hook (if any) when bits are being cleared. */
static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
 406
 407static void set_state_bits(struct extent_io_tree *tree,
 408			   struct extent_state *state, unsigned long *bits);
 409
 410/*
 411 * insert an extent_state struct into the tree.  'bits' are set on the
 412 * struct before it is inserted.
 413 *
 414 * This may return -EEXIST if the extent is already there, in which case the
 415 * state struct is freed.
 416 *
 417 * The tree lock is not taken internally.  This is a utility function and
 418 * probably isn't what you want to call (see set/clear_extent_bit).
 419 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned long *bits)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
		       end, start);
	state->start = start;
	state->end = end;

	/* set bits (and fire hooks) before the state becomes visible */
	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		/* an existing state already covers this range */
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
		       "%llu %llu\n",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}
 449
/* Invoke the owner's split hook (if any) before a state is split. */
static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}
 456
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	/* give the tree owner a chance to update its own bookkeeping */
	split_cb(tree, orig, split);

	/* prealloc becomes the front half, orig keeps the tail */
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	/* insert next to orig; a collision here means the tree is corrupt */
	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}
 492
 493static struct extent_state *next_state(struct extent_state *state)
 494{
 495	struct rb_node *next = rb_next(&state->rb_node);
 496	if (next)
 497		return rb_entry(next, struct extent_state, rb_node);
 498	else
 499		return NULL;
 500}
 501
/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 *
 * Returns the next in-tree state (or NULL) so callers can keep iterating
 * even when the current state has been freed.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned long *bits, int wake)
{
	struct extent_state *next;
	/* control bits are never stored in state->state, so never cleared */
	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;

	/* leaving EXTENT_DIRTY: subtract this range from the accounting */
	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		/* grab the successor before the node is erased */
		next = next_state(state);
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}
 540
 541static struct extent_state *
 542alloc_extent_state_atomic(struct extent_state *prealloc)
 543{
 544	if (!prealloc)
 545		prealloc = alloc_extent_state(GFP_ATOMIC);
 546
 547	return prealloc;
 548}
 549
/*
 * The extent tree changed underneath us while we held the lock; this is
 * unrecoverable corruption, so hand the error to btrfs_panic().
 */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
		    "Extent tree was modified by another "
		    "thread while locked.");
}
 556
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned long bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);

	/* clearing delalloc also clears any outstanding NORESERVE marker */
	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	/* for delete, request clearing of everything but the control bits */
	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	/* clearing lock-ish bits invalidates the caller's cached state */
	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	/*
	 * preallocate while we may still sleep; under tree->lock only
	 * GFP_ATOMIC allocations are possible
	 */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		/* use the cached state if it still covers 'start' */
		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	/* the state lies entirely inside [start, end]: clear it whole */
	state = clear_state_bit(tree, state, &bits, wake);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	/* walk straight to the adjacent state without a fresh tree search */
	if (start <= end && state && !need_resched())
		goto hit_next;
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
 717
/*
 * Sleep until someone wakes state->wq (done when bits are cleared on the
 * state).  Drops tree->lock around schedule() and retakes it afterwards;
 * the caller must hold a reference on @state across the sleep.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
 730
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			/*
			 * pin the state so it can't be freed while we
			 * sleep; wait_on_state() drops and retakes the
			 * tree lock, so re-search from 'start' afterwards
			 */
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		/*
		 * if we did not drop the lock to reschedule, the rb_node
		 * is still valid and we can step to it directly
		 */
		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}
 781
 782static void set_state_bits(struct extent_io_tree *tree,
 783			   struct extent_state *state,
 784			   unsigned long *bits)
 785{
 786	unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
 787
 788	set_state_cb(tree, state, bits);
 789	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 790		u64 range = state->end - state->start + 1;
 791		tree->dirty_bytes += range;
 792	}
 793	state->state |= bits_to_set;
 794}
 795
 796static void cache_state(struct extent_state *state,
 797			struct extent_state **cached_ptr)
 798{
 799	if (cached_ptr && !(*cached_ptr)) {
 800		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 801			*cached_ptr = state;
 802			atomic_inc(&state->refs);
 803		}
 804	}
 805}
 806
 
 
 
 
 
 
 
 
 
/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned long bits, unsigned long exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

	bits |= EXTENT_FIRST_DELALLOC;
again:
	/* preallocate while sleeping is still allowed */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		/* reuse the cached state if it still covers 'start' */
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		/* empty area: one new state covers the whole range */
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		/* continue with the adjacent state, no fresh search needed */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
1019
/* set 'bits' on [start, end] with no exclusive bits (cannot -EEXIST) */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned long bits, u64 * failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask);
}
1027
1028
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned long bits, unsigned long clear_bits,
		       struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

again:
	/* preallocate while sleeping is still allowed */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		/* reuse the cached state if it still covers 'start' */
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		/* set the new bits, then drop the old ones */
		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
1236
/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	/* mark [start, end] dirty; dirty_bytes accounting happens inside */
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
			      NULL, mask);
}
1244
/* set arbitrary 'bits' on [start, end] without caching the state */
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    unsigned long bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, NULL,
			      NULL, mask);
}
1251
/* clear arbitrary 'bits' on [start, end]; no wake, no delete, no cache */
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      unsigned long bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
1257
/* mark [start, end] as delalloc (and uptodate), caching the state */
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE,
			      NULL, cached_state, mask);
}
1265
/* like set_extent_delalloc() but also tags the range for defrag */
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
		      struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
			      NULL, cached_state, mask);
}
1273
/* drop dirty/delalloc/accounting bits from [start, end] */
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}
1281
/* mark [start, end] as newly allocated */
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
			      NULL, mask);
}
1288
/* mark [start, end] uptodate, caching the state for the caller */
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
			      cached_state, mask);
}
1295
/* clear the uptodate bit on [start, end], using a cached state if given */
int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			  struct extent_state **cached_state, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}
1302
/*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned long bits, struct extent_state **cached_state)
{
	int err;
	u64 failed_start;
	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS);
		if (err == -EEXIST) {
			/*
			 * someone else holds part of the range: wait for
			 * their lock to clear and retry from that offset
			 * (everything before failed_start is already ours)
			 */
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}
1325
/* lock [start, end] with no extra bits and no cached state */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return lock_extent_bits(tree, start, end, 0, NULL);
}
1330
1331int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1332{
1333	int err;
1334	u64 failed_start;
1335
1336	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1337			       &failed_start, NULL, GFP_NOFS);
1338	if (err == -EEXIST) {
1339		if (failed_start > start)
1340			clear_extent_bit(tree, start, failed_start - 1,
1341					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1342		return 0;
1343	}
1344	return 1;
1345}
1346
/* unlock [start, end], consuming the cached state if one is given */
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}
1353
/* unlock [start, end] and wake anyone waiting on the range */
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				GFP_NOFS);
}
1359
1360int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1361{
1362	unsigned long index = start >> PAGE_CACHE_SHIFT;
1363	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1364	struct page *page;
1365
1366	while (index <= end_index) {
1367		page = find_get_page(inode->i_mapping, index);
1368		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1369		clear_page_dirty_for_io(page);
1370		page_cache_release(page);
1371		index++;
1372	}
1373	return 0;
1374}
1375
1376int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1377{
1378	unsigned long index = start >> PAGE_CACHE_SHIFT;
1379	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1380	struct page *page;
1381
1382	while (index <= end_index) {
1383		page = find_get_page(inode->i_mapping, index);
1384		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1385		account_page_redirty(page);
1386		__set_page_dirty_nobuffers(page);
1387		page_cache_release(page);
1388		index++;
1389	}
1390	return 0;
1391}
1392
1393/*
1394 * helper function to set both pages and extents in the tree writeback
1395 */
1396static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1397{
1398	unsigned long index = start >> PAGE_CACHE_SHIFT;
1399	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1400	struct page *page;
1401
1402	while (index <= end_index) {
1403		page = find_get_page(tree->mapping, index);
1404		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1405		set_page_writeback(page);
1406		page_cache_release(page);
1407		index++;
1408	}
1409	return 0;
1410}
1411
1412/* find the first state struct with 'bits' set after 'start', and
1413 * return it.  tree->lock must be held.  NULL will returned if
1414 * nothing was found after 'start'
1415 */
1416static struct extent_state *
1417find_first_extent_bit_state(struct extent_io_tree *tree,
1418			    u64 start, unsigned long bits)
1419{
1420	struct rb_node *node;
1421	struct extent_state *state;
1422
1423	/*
1424	 * this search will find all the extents that end after
1425	 * our range starts.
1426	 */
1427	node = tree_search(tree, start);
1428	if (!node)
1429		goto out;
1430
1431	while (1) {
1432		state = rb_entry(node, struct extent_state, rb_node);
1433		if (state->end >= start && (state->state & bits))
1434			return state;
1435
1436		node = rb_next(node);
1437		if (!node)
1438			break;
1439	}
1440out:
1441	return NULL;
1442}
1443
/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned long bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		/*
		 * if the cached state ends right before 'start' we can
		 * walk forward from it instead of searching the tree
		 */
		if (state->end == start - 1 && state->tree) {
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			/* walked off the end: drop the stale cache */
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state(state, cached_state);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
1491
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 * (the return value is actually the count of states spanned; any
 * nonzero value means a range was found)
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		/* stop at a gap or at an explicit delalloc boundary */
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			/* first delalloc state: pin it for the caller */
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}
1551
1552static noinline void __unlock_for_delalloc(struct inode *inode,
1553					   struct page *locked_page,
1554					   u64 start, u64 end)
1555{
1556	int ret;
1557	struct page *pages[16];
1558	unsigned long index = start >> PAGE_CACHE_SHIFT;
1559	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1560	unsigned long nr_pages = end_index - index + 1;
1561	int i;
1562
1563	if (index == locked_page->index && end_index == index)
1564		return;
1565
1566	while (nr_pages > 0) {
1567		ret = find_get_pages_contig(inode->i_mapping, index,
1568				     min_t(unsigned long, nr_pages,
1569				     ARRAY_SIZE(pages)), pages);
1570		for (i = 0; i < ret; i++) {
1571			if (pages[i] != locked_page)
1572				unlock_page(pages[i]);
1573			page_cache_release(pages[i]);
1574		}
1575		nr_pages -= ret;
1576		index += ret;
1577		cond_resched();
1578	}
1579}
1580
/*
 * Lock every page backing [delalloc_start, delalloc_end] except
 * @locked_page, which the caller already holds.  Returns 0 on success;
 * on failure (-EAGAIN) every page we managed to lock is unlocked again.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/* pages vanished from the cache: let caller retry */
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				/*
				 * page was cleaned or truncated while we
				 * waited for its lock: give up
				 */
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		/* unwind every page we locked before the failure */
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}
1642
1643/*
1644 * find a contiguous range of bytes in the file marked as delalloc, not
1645 * more than 'max_bytes'.  start and end are used to return the range,
1646 *
1647 * 1 is returned if we find something, 0 if nothing was in the tree
1648 */
1649STATIC u64 find_lock_delalloc_range(struct inode *inode,
1650				    struct extent_io_tree *tree,
1651				    struct page *locked_page, u64 *start,
1652				    u64 *end, u64 max_bytes)
 
1653{
1654	u64 delalloc_start;
1655	u64 delalloc_end;
1656	u64 found;
1657	struct extent_state *cached_state = NULL;
1658	int ret;
1659	int loops = 0;
1660
1661again:
1662	/* step one, find a bunch of delalloc bytes starting at start */
1663	delalloc_start = *start;
1664	delalloc_end = 0;
1665	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1666				    max_bytes, &cached_state);
1667	if (!found || delalloc_end <= *start) {
1668		*start = delalloc_start;
1669		*end = delalloc_end;
1670		free_extent_state(cached_state);
1671		return 0;
1672	}
1673
1674	/*
1675	 * start comes from the offset of locked_page.  We have to lock
1676	 * pages in order, so we can't process delalloc bytes before
1677	 * locked_page
1678	 */
1679	if (delalloc_start < *start)
1680		delalloc_start = *start;
1681
1682	/*
1683	 * make sure to limit the number of pages we try to lock down
 
1684	 */
1685	if (delalloc_end + 1 - delalloc_start > max_bytes)
1686		delalloc_end = delalloc_start + max_bytes - 1;
1687
1688	/* step two, lock all the pages after the page that has start */
1689	ret = lock_delalloc_pages(inode, locked_page,
1690				  delalloc_start, delalloc_end);
1691	if (ret == -EAGAIN) {
1692		/* some of the pages are gone, lets avoid looping by
1693		 * shortening the size of the delalloc range we're searching
1694		 */
1695		free_extent_state(cached_state);
1696		if (!loops) {
1697			max_bytes = PAGE_CACHE_SIZE;
 
1698			loops = 1;
1699			goto again;
1700		} else {
1701			found = 0;
1702			goto out_failed;
1703		}
1704	}
1705	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1706
1707	/* step three, lock the state bits for the whole range */
1708	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1709
1710	/* then test to make sure it is all still delalloc */
1711	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1712			     EXTENT_DELALLOC, 1, cached_state);
1713	if (!ret) {
1714		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1715				     &cached_state, GFP_NOFS);
1716		__unlock_for_delalloc(inode, locked_page,
1717			      delalloc_start, delalloc_end);
1718		cond_resched();
1719		goto again;
1720	}
1721	free_extent_state(cached_state);
1722	*start = delalloc_start;
1723	*end = delalloc_end;
1724out_failed:
1725	return found;
1726}
1727
1728int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1729				 struct page *locked_page,
1730				 unsigned long clear_bits,
1731				 unsigned long page_ops)
1732{
1733	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1734	int ret;
1735	struct page *pages[16];
1736	unsigned long index = start >> PAGE_CACHE_SHIFT;
1737	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1738	unsigned long nr_pages = end_index - index + 1;
1739	int i;
 
 
 
 
 
 
 
 
 
1740
1741	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1742	if (page_ops == 0)
 
 
1743		return 0;
1744
1745	while (nr_pages > 0) {
1746		ret = find_get_pages_contig(inode->i_mapping, index,
1747				     min_t(unsigned long,
1748				     nr_pages, ARRAY_SIZE(pages)), pages);
1749		for (i = 0; i < ret; i++) {
1750
1751			if (page_ops & PAGE_SET_PRIVATE2)
1752				SetPagePrivate2(pages[i]);
1753
1754			if (pages[i] == locked_page) {
1755				page_cache_release(pages[i]);
1756				continue;
1757			}
1758			if (page_ops & PAGE_CLEAR_DIRTY)
1759				clear_page_dirty_for_io(pages[i]);
1760			if (page_ops & PAGE_SET_WRITEBACK)
1761				set_page_writeback(pages[i]);
1762			if (page_ops & PAGE_END_WRITEBACK)
1763				end_page_writeback(pages[i]);
1764			if (page_ops & PAGE_UNLOCK)
1765				unlock_page(pages[i]);
1766			page_cache_release(pages[i]);
1767		}
1768		nr_pages -= ret;
1769		index += ret;
1770		cond_resched();
1771	}
1772	return 0;
1773}
1774
1775/*
1776 * count the number of bytes in the tree that have a given bit(s)
1777 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1778 * cached.  The total number found is returned.
1779 */
1780u64 count_range_bits(struct extent_io_tree *tree,
1781		     u64 *start, u64 search_end, u64 max_bytes,
1782		     unsigned long bits, int contig)
1783{
1784	struct rb_node *node;
1785	struct extent_state *state;
1786	u64 cur_start = *start;
1787	u64 total_bytes = 0;
1788	u64 last = 0;
1789	int found = 0;
1790
1791	if (WARN_ON(search_end <= cur_start))
 
1792		return 0;
 
1793
1794	spin_lock(&tree->lock);
1795	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1796		total_bytes = tree->dirty_bytes;
1797		goto out;
1798	}
1799	/*
1800	 * this search will find all the extents that end after
1801	 * our range starts.
1802	 */
1803	node = tree_search(tree, cur_start);
1804	if (!node)
1805		goto out;
1806
1807	while (1) {
1808		state = rb_entry(node, struct extent_state, rb_node);
1809		if (state->start > search_end)
1810			break;
1811		if (contig && found && state->start > last + 1)
1812			break;
1813		if (state->end >= cur_start && (state->state & bits) == bits) {
1814			total_bytes += min(search_end, state->end) + 1 -
1815				       max(cur_start, state->start);
1816			if (total_bytes >= max_bytes)
1817				break;
1818			if (!found) {
1819				*start = max(cur_start, state->start);
1820				found = 1;
1821			}
1822			last = state->end;
1823		} else if (contig && found) {
1824			break;
1825		}
1826		node = rb_next(node);
1827		if (!node)
1828			break;
1829	}
1830out:
1831	spin_unlock(&tree->lock);
1832	return total_bytes;
1833}
1834
1835/*
1836 * set the private field for a given byte offset in the tree.  If there isn't
1837 * an extent_state there already, this does nothing.
1838 */
1839static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1840{
1841	struct rb_node *node;
1842	struct extent_state *state;
1843	int ret = 0;
1844
1845	spin_lock(&tree->lock);
1846	/*
1847	 * this search will find all the extents that end after
1848	 * our range starts.
1849	 */
1850	node = tree_search(tree, start);
1851	if (!node) {
1852		ret = -ENOENT;
1853		goto out;
1854	}
1855	state = rb_entry(node, struct extent_state, rb_node);
1856	if (state->start != start) {
1857		ret = -ENOENT;
1858		goto out;
1859	}
1860	state->private = private;
1861out:
1862	spin_unlock(&tree->lock);
1863	return ret;
1864}
1865
1866int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1867{
1868	struct rb_node *node;
1869	struct extent_state *state;
1870	int ret = 0;
1871
1872	spin_lock(&tree->lock);
1873	/*
1874	 * this search will find all the extents that end after
1875	 * our range starts.
1876	 */
1877	node = tree_search(tree, start);
1878	if (!node) {
1879		ret = -ENOENT;
1880		goto out;
1881	}
1882	state = rb_entry(node, struct extent_state, rb_node);
1883	if (state->start != start) {
1884		ret = -ENOENT;
1885		goto out;
1886	}
1887	*private = state->private;
1888out:
1889	spin_unlock(&tree->lock);
1890	return ret;
1891}
1892
1893/*
1894 * searches a range in the state tree for a given mask.
1895 * If 'filled' == 1, this returns 1 only if every extent in the tree
1896 * has the bits set.  Otherwise, 1 is returned if any bit in the
1897 * range is found set.
1898 */
1899int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1900		   unsigned long bits, int filled, struct extent_state *cached)
1901{
1902	struct extent_state *state = NULL;
1903	struct rb_node *node;
1904	int bitset = 0;
1905
1906	spin_lock(&tree->lock);
1907	if (cached && cached->tree && cached->start <= start &&
1908	    cached->end > start)
1909		node = &cached->rb_node;
1910	else
1911		node = tree_search(tree, start);
1912	while (node && start <= end) {
1913		state = rb_entry(node, struct extent_state, rb_node);
1914
1915		if (filled && state->start > start) {
1916			bitset = 0;
1917			break;
1918		}
1919
1920		if (state->start > end)
1921			break;
1922
1923		if (state->state & bits) {
1924			bitset = 1;
1925			if (!filled)
1926				break;
1927		} else if (filled) {
1928			bitset = 0;
1929			break;
1930		}
1931
1932		if (state->end == (u64)-1)
1933			break;
1934
1935		start = state->end + 1;
1936		if (start > end)
1937			break;
1938		node = rb_next(node);
1939		if (!node) {
1940			if (filled)
1941				bitset = 0;
1942			break;
1943		}
1944	}
1945	spin_unlock(&tree->lock);
1946	return bitset;
1947}
1948
1949/*
1950 * helper function to set a given page up to date if all the
1951 * extents in the tree for that page are up to date
1952 */
1953static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1954{
1955	u64 start = page_offset(page);
1956	u64 end = start + PAGE_CACHE_SIZE - 1;
1957	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1958		SetPageUptodate(page);
1959}
1960
1961/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1962 * When IO fails, either with EIO or csum verification fails, we
1963 * try other mirrors that might have a good copy of the data.  This
1964 * io_failure_record is used to record state as we go through all the
1965 * mirrors.  If another mirror has good data, the page is set up to date
1966 * and things continue.  If a good mirror can't be found, the original
1967 * bio end_io callback is called to indicate things have failed.
1968 */
1969struct io_failure_record {
1970	struct page *page;
1971	u64 start;
1972	u64 len;
1973	u64 logical;
1974	unsigned long bio_flags;
1975	int this_mirror;
1976	int failed_mirror;
1977	int in_validation;
1978};
1979
1980static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1981				int did_repair)
1982{
1983	int ret;
1984	int err = 0;
1985	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1986
1987	set_state_private(failure_tree, rec->start, 0);
1988	ret = clear_extent_bits(failure_tree, rec->start,
1989				rec->start + rec->len - 1,
1990				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1991	if (ret)
1992		err = ret;
1993
1994	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1995				rec->start + rec->len - 1,
1996				EXTENT_DAMAGED, GFP_NOFS);
1997	if (ret && !err)
1998		err = ret;
 
 
1999
2000	kfree(rec);
2001	return err;
2002}
2003
 
 
 
 
 
2004/*
2005 * this bypasses the standard btrfs submit functions deliberately, as
2006 * the standard behavior is to write all copies in a raid setup. here we only
2007 * want to write the one bad copy. so we do the mapping for ourselves and issue
2008 * submit_bio directly.
2009 * to avoid any synchronization issues, wait for the data after writing, which
2010 * actually prevents the read that triggered the error from finishing.
2011 * currently, there can be no more than two copies of every data bit. thus,
2012 * exactly one rewrite is required.
2013 */
2014int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2015			u64 length, u64 logical, struct page *page,
2016			int mirror_num)
2017{
2018	struct bio *bio;
2019	struct btrfs_device *dev;
 
2020	u64 map_length = 0;
2021	u64 sector;
2022	struct btrfs_bio *bbio = NULL;
2023	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
2024	int ret;
2025
2026	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
2027	BUG_ON(!mirror_num);
2028
2029	/* we can't repair anything in raid56 yet */
2030	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
2031		return 0;
2032
2033	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2034	if (!bio)
2035		return -EIO;
2036	bio->bi_iter.bi_size = 0;
 
 
2037	map_length = length;
2038
2039	ret = btrfs_map_block(fs_info, WRITE, logical,
2040			      &map_length, &bbio, mirror_num);
2041	if (ret) {
2042		bio_put(bio);
2043		return -EIO;
2044	}
2045	BUG_ON(mirror_num != bbio->mirror_num);
2046	sector = bbio->stripes[mirror_num-1].physical >> 9;
2047	bio->bi_iter.bi_sector = sector;
2048	dev = bbio->stripes[mirror_num-1].dev;
2049	kfree(bbio);
2050	if (!dev || !dev->bdev || !dev->writeable) {
2051		bio_put(bio);
2052		return -EIO;
2053	}
2054	bio->bi_bdev = dev->bdev;
2055	bio_add_page(bio, page, length, start - page_offset(page));
 
 
2056
2057	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
2058		/* try to remap that extent elsewhere? */
2059		bio_put(bio);
2060		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2061		return -EIO;
2062	}
2063
2064	printk_ratelimited_in_rcu(KERN_INFO
2065			"BTRFS: read error corrected: ino %lu off %llu "
2066		    "(dev %s sector %llu)\n", page->mapping->host->i_ino,
2067		    start, rcu_str_deref(dev->name), sector);
2068
2069	bio_put(bio);
2070	return 0;
2071}
2072
2073int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2074			 int mirror_num)
2075{
 
2076	u64 start = eb->start;
2077	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2078	int ret = 0;
2079
2080	if (root->fs_info->sb->s_flags & MS_RDONLY)
2081		return -EROFS;
2082
2083	for (i = 0; i < num_pages; i++) {
2084		struct page *p = extent_buffer_page(eb, i);
2085		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
2086					start, p, mirror_num);
2087		if (ret)
2088			break;
2089		start += PAGE_CACHE_SIZE;
2090	}
2091
2092	return ret;
2093}
2094
2095/*
2096 * each time an IO finishes, we do a fast check in the IO failure tree
2097 * to see if we need to process or clean up an io_failure_record
2098 */
2099static int clean_io_failure(u64 start, struct page *page)
2100{
2101	u64 private;
2102	u64 private_failure;
2103	struct io_failure_record *failrec;
2104	struct inode *inode = page->mapping->host;
2105	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2106	struct extent_state *state;
2107	int num_copies;
2108	int did_repair = 0;
2109	int ret;
 
2110
2111	private = 0;
2112	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2113				(u64)-1, 1, EXTENT_DIRTY, 0);
2114	if (!ret)
2115		return 0;
2116
2117	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2118				&private_failure);
2119	if (ret)
2120		return 0;
2121
2122	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2123	BUG_ON(!failrec->this_mirror);
2124
2125	if (failrec->in_validation) {
2126		/* there was no real error, just free the record */
2127		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2128			 failrec->start);
2129		did_repair = 1;
2130		goto out;
2131	}
2132	if (fs_info->sb->s_flags & MS_RDONLY)
2133		goto out;
2134
2135	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2136	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2137					    failrec->start,
2138					    EXTENT_LOCKED);
2139	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2140
2141	if (state && state->start <= failrec->start &&
2142	    state->end >= failrec->start + failrec->len - 1) {
2143		num_copies = btrfs_num_copies(fs_info, failrec->logical,
2144					      failrec->len);
2145		if (num_copies > 1)  {
2146			ret = repair_io_failure(fs_info, start, failrec->len,
2147						failrec->logical, page,
2148						failrec->failed_mirror);
2149			did_repair = !ret;
2150		}
2151		ret = 0;
2152	}
2153
2154out:
2155	if (!ret)
2156		ret = free_io_failure(inode, failrec, did_repair);
2157
2158	return ret;
2159}
2160
2161/*
2162 * this is a generic handler for readpage errors (default
2163 * readpage_io_failed_hook). if other copies exist, read those and write back
2164 * good data to the failed position. does not investigate in remapping the
2165 * failed extent elsewhere, hoping the device will be smart enough to do this as
2166 * needed
2167 */
2168
2169static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2170			      struct page *page, u64 start, u64 end,
2171			      int failed_mirror)
2172{
2173	struct io_failure_record *failrec = NULL;
2174	u64 private;
2175	struct extent_map *em;
2176	struct inode *inode = page->mapping->host;
2177	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2178	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2179	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2180	struct bio *bio;
2181	struct btrfs_io_bio *btrfs_failed_bio;
2182	struct btrfs_io_bio *btrfs_bio;
2183	int num_copies;
2184	int ret;
2185	int read_mode;
2186	u64 logical;
2187
2188	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2189
2190	ret = get_state_private(failure_tree, start, &private);
2191	if (ret) {
2192		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2193		if (!failrec)
2194			return -ENOMEM;
2195		failrec->start = start;
2196		failrec->len = end - start + 1;
2197		failrec->this_mirror = 0;
2198		failrec->bio_flags = 0;
2199		failrec->in_validation = 0;
2200
2201		read_lock(&em_tree->lock);
2202		em = lookup_extent_mapping(em_tree, start, failrec->len);
2203		if (!em) {
2204			read_unlock(&em_tree->lock);
2205			kfree(failrec);
2206			return -EIO;
2207		}
2208
2209		if (em->start > start || em->start + em->len <= start) {
2210			free_extent_map(em);
2211			em = NULL;
2212		}
2213		read_unlock(&em_tree->lock);
2214
2215		if (!em) {
2216			kfree(failrec);
2217			return -EIO;
2218		}
2219		logical = start - em->start;
2220		logical = em->block_start + logical;
2221		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2222			logical = em->block_start;
2223			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2224			extent_set_compress_type(&failrec->bio_flags,
2225						 em->compress_type);
2226		}
2227		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2228			 "len=%llu\n", logical, start, failrec->len);
2229		failrec->logical = logical;
2230		free_extent_map(em);
2231
2232		/* set the bits in the private failure tree */
2233		ret = set_extent_bits(failure_tree, start, end,
2234					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2235		if (ret >= 0)
2236			ret = set_state_private(failure_tree, start,
2237						(u64)(unsigned long)failrec);
2238		/* set the bits in the inode's tree */
2239		if (ret >= 0)
2240			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2241						GFP_NOFS);
2242		if (ret < 0) {
2243			kfree(failrec);
2244			return ret;
2245		}
2246	} else {
2247		failrec = (struct io_failure_record *)(unsigned long)private;
2248		pr_debug("bio_readpage_error: (found) logical=%llu, "
2249			 "start=%llu, len=%llu, validation=%d\n",
2250			 failrec->logical, failrec->start, failrec->len,
2251			 failrec->in_validation);
2252		/*
2253		 * when data can be on disk more than twice, add to failrec here
2254		 * (e.g. with a list for failed_mirror) to make
2255		 * clean_io_failure() clean all those errors at once.
2256		 */
2257	}
2258	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2259				      failrec->logical, failrec->len);
 
2260	if (num_copies == 1) {
2261		/*
2262		 * we only have a single copy of the data, so don't bother with
2263		 * all the retry and error correction code that follows. no
2264		 * matter what the error is, it is very likely to persist.
2265		 */
2266		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
2267			 num_copies, failrec->this_mirror, failed_mirror);
 
 
2268		free_io_failure(inode, failrec, 0);
2269		return -EIO;
2270	}
2271
 
 
 
 
 
 
 
 
 
2272	/*
2273	 * there are two premises:
2274	 *	a) deliver good data to the caller
2275	 *	b) correct the bad sectors on disk
2276	 */
2277	if (failed_bio->bi_vcnt > 1) {
2278		/*
2279		 * to fulfill b), we need to know the exact failing sectors, as
2280		 * we don't want to rewrite any more than the failed ones. thus,
2281		 * we need separate read requests for the failed bio
2282		 *
2283		 * if the following BUG_ON triggers, our validation request got
2284		 * merged. we need separate requests for our algorithm to work.
2285		 */
2286		BUG_ON(failrec->in_validation);
2287		failrec->in_validation = 1;
2288		failrec->this_mirror = failed_mirror;
2289		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2290	} else {
2291		/*
2292		 * we're ready to fulfill a) and b) alongside. get a good copy
2293		 * of the failed sector and if we succeed, we have setup
2294		 * everything for repair_io_failure to do the rest for us.
2295		 */
2296		if (failrec->in_validation) {
2297			BUG_ON(failrec->this_mirror != failed_mirror);
2298			failrec->in_validation = 0;
2299			failrec->this_mirror = 0;
2300		}
2301		failrec->failed_mirror = failed_mirror;
2302		failrec->this_mirror++;
2303		if (failrec->this_mirror == failed_mirror)
2304			failrec->this_mirror++;
2305		read_mode = READ_SYNC;
2306	}
2307
2308	if (failrec->this_mirror > num_copies) {
2309		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 
2310			 num_copies, failrec->this_mirror, failed_mirror);
2311		free_io_failure(inode, failrec, 0);
2312		return -EIO;
2313	}
2314
2315	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2316	if (!bio) {
2317		free_io_failure(inode, failrec, 0);
2318		return -EIO;
2319	}
 
2320	bio->bi_end_io = failed_bio->bi_end_io;
2321	bio->bi_iter.bi_sector = failrec->logical >> 9;
2322	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2323	bio->bi_iter.bi_size = 0;
2324
2325	btrfs_failed_bio = btrfs_io_bio(failed_bio);
2326	if (btrfs_failed_bio->csum) {
2327		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2328		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2329
2330		btrfs_bio = btrfs_io_bio(bio);
2331		btrfs_bio->csum = btrfs_bio->csum_inline;
2332		phy_offset >>= inode->i_sb->s_blocksize_bits;
2333		phy_offset *= csum_size;
2334		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
2335		       csum_size);
2336	}
2337
2338	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2339
2340	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2341		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2342		 failrec->this_mirror, num_copies, failrec->in_validation);
2343
2344	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2345					 failrec->this_mirror,
2346					 failrec->bio_flags, 0);
2347	return ret;
2348}
2349
2350/* lots and lots of room for performance fixes in the end_bio funcs */
2351
2352int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2353{
2354	int uptodate = (err == 0);
2355	struct extent_io_tree *tree;
2356	int ret;
2357
2358	tree = &BTRFS_I(page->mapping->host)->io_tree;
2359
2360	if (tree->ops && tree->ops->writepage_end_io_hook) {
2361		ret = tree->ops->writepage_end_io_hook(page, start,
2362					       end, NULL, uptodate);
2363		if (ret)
2364			uptodate = 0;
2365	}
2366
2367	if (!uptodate) {
2368		ClearPageUptodate(page);
2369		SetPageError(page);
2370	}
2371	return 0;
2372}
2373
2374/*
2375 * after a writepage IO is done, we need to:
2376 * clear the uptodate bits on error
2377 * clear the writeback bits in the extent tree for this IO
2378 * end_page_writeback if the page has no more pending IO
2379 *
2380 * Scheduling is not allowed, so the extent state tree is expected
2381 * to have one and only one object corresponding to this IO.
2382 */
2383static void end_bio_extent_writepage(struct bio *bio, int err)
2384{
2385	struct bio_vec *bvec;
 
2386	u64 start;
2387	u64 end;
2388	int i;
2389
2390	bio_for_each_segment_all(bvec, bio, i) {
2391		struct page *page = bvec->bv_page;
 
 
 
 
 
2392
2393		/* We always issue full-page reads, but if some block
2394		 * in a page fails to read, blk_update_request() will
2395		 * advance bv_offset and adjust bv_len to compensate.
2396		 * Print a warning for nonzero offsets, and an error
2397		 * if they don't add up to a full page.  */
2398		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2399			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2400				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2401				   "partial page write in btrfs with offset %u and length %u",
2402					bvec->bv_offset, bvec->bv_len);
2403			else
2404				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2405				   "incomplete page write in btrfs with offset %u and "
2406				   "length %u",
2407					bvec->bv_offset, bvec->bv_len);
2408		}
2409
2410		start = page_offset(page);
2411		end = start + bvec->bv_offset + bvec->bv_len - 1;
2412
2413		if (end_extent_writepage(page, err, start, end))
2414			continue;
2415
2416		end_page_writeback(page);
2417	}
 
 
 
2418
2419	bio_put(bio);
2420}
2421
2422static void
2423endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2424			      int uptodate)
2425{
2426	struct extent_state *cached = NULL;
2427	u64 end = start + len - 1;
2428
2429	if (uptodate && tree->track_uptodate)
2430		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2431	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2432}
2433
2434/*
2435 * after a readpage IO is done, we need to:
2436 * clear the uptodate bits on error
2437 * set the uptodate bits if things worked
2438 * set the page up to date if all extents in the tree are uptodate
2439 * clear the lock bit in the extent tree
2440 * unlock the page if there are no other extents locked for it
2441 *
2442 * Scheduling is not allowed, so the extent state tree is expected
2443 * to have one and only one object corresponding to this IO.
2444 */
2445static void end_bio_extent_readpage(struct bio *bio, int err)
2446{
2447	struct bio_vec *bvec;
2448	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2449	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 
2450	struct extent_io_tree *tree;
2451	u64 offset = 0;
2452	u64 start;
2453	u64 end;
2454	u64 len;
2455	u64 extent_start = 0;
2456	u64 extent_len = 0;
2457	int mirror;
2458	int ret;
2459	int i;
2460
2461	if (err)
2462		uptodate = 0;
2463
2464	bio_for_each_segment_all(bvec, bio, i) {
2465		struct page *page = bvec->bv_page;
2466		struct inode *inode = page->mapping->host;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2467
2468		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2469			 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
2470			 io_bio->mirror_num);
2471		tree = &BTRFS_I(inode)->io_tree;
2472
2473		/* We always issue full-page reads, but if some block
2474		 * in a page fails to read, blk_update_request() will
2475		 * advance bv_offset and adjust bv_len to compensate.
2476		 * Print a warning for nonzero offsets, and an error
2477		 * if they don't add up to a full page.  */
2478		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2479			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2480				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2481				   "partial page read in btrfs with offset %u and length %u",
2482					bvec->bv_offset, bvec->bv_len);
2483			else
2484				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2485				   "incomplete page read in btrfs with offset %u and "
2486				   "length %u",
2487					bvec->bv_offset, bvec->bv_len);
2488		}
2489
2490		start = page_offset(page);
2491		end = start + bvec->bv_offset + bvec->bv_len - 1;
2492		len = bvec->bv_len;
2493
2494		mirror = io_bio->mirror_num;
2495		if (likely(uptodate && tree->ops &&
2496			   tree->ops->readpage_end_io_hook)) {
2497			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2498							      page, start, end,
2499							      mirror);
2500			if (ret)
2501				uptodate = 0;
2502			else
2503				clean_io_failure(start, page);
2504		}
2505
2506		if (likely(uptodate))
2507			goto readpage_ok;
2508
2509		if (tree->ops && tree->ops->readpage_io_failed_hook) {
2510			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2511			if (!ret && !err &&
2512			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2513				uptodate = 1;
2514		} else {
2515			/*
2516			 * The generic bio_readpage_error handles errors the
2517			 * following way: If possible, new read requests are
2518			 * created and submitted and will end up in
2519			 * end_bio_extent_readpage as well (if we're lucky, not
2520			 * in the !uptodate case). In that case it returns 0 and
2521			 * we just go on with the next page in our bio. If it
2522			 * can't handle the error it will return -EIO and we
2523			 * remain responsible for that page.
2524			 */
2525			ret = bio_readpage_error(bio, offset, page, start, end,
2526						 mirror);
2527			if (ret == 0) {
2528				uptodate =
2529					test_bit(BIO_UPTODATE, &bio->bi_flags);
2530				if (err)
2531					uptodate = 0;
 
2532				continue;
2533			}
2534		}
2535readpage_ok:
2536		if (likely(uptodate)) {
2537			loff_t i_size = i_size_read(inode);
2538			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2539			unsigned offset;
2540
2541			/* Zero out the end if this page straddles i_size */
2542			offset = i_size & (PAGE_CACHE_SIZE-1);
2543			if (page->index == end_index && offset)
2544				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2545			SetPageUptodate(page);
2546		} else {
2547			ClearPageUptodate(page);
2548			SetPageError(page);
2549		}
2550		unlock_page(page);
2551		offset += len;
2552
2553		if (unlikely(!uptodate)) {
2554			if (extent_len) {
2555				endio_readpage_release_extent(tree,
2556							      extent_start,
2557							      extent_len, 1);
2558				extent_start = 0;
2559				extent_len = 0;
 
 
 
 
 
2560			}
2561			endio_readpage_release_extent(tree, start,
2562						      end - start + 1, 0);
2563		} else if (!extent_len) {
2564			extent_start = start;
2565			extent_len = end + 1 - start;
2566		} else if (extent_start + extent_len == start) {
2567			extent_len += end + 1 - start;
2568		} else {
2569			endio_readpage_release_extent(tree, extent_start,
2570						      extent_len, uptodate);
2571			extent_start = start;
2572			extent_len = end + 1 - start;
 
 
 
2573		}
2574	}
2575
2576	if (extent_len)
2577		endio_readpage_release_extent(tree, extent_start, extent_len,
2578					      uptodate);
2579	if (io_bio->end_io)
2580		io_bio->end_io(io_bio, err);
2581	bio_put(bio);
2582}
2583
2584/*
2585 * this allocates from the btrfs_bioset.  We're returning a bio right now
2586 * but you can call btrfs_io_bio for the appropriate container_of magic
2587 */
2588struct bio *
2589btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2590		gfp_t gfp_flags)
2591{
2592	struct btrfs_io_bio *btrfs_bio;
2593	struct bio *bio;
2594
2595	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
2596
2597	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2598		while (!bio && (nr_vecs /= 2)) {
2599			bio = bio_alloc_bioset(gfp_flags,
2600					       nr_vecs, btrfs_bioset);
2601		}
2602	}
2603
2604	if (bio) {
 
2605		bio->bi_bdev = bdev;
2606		bio->bi_iter.bi_sector = first_sector;
2607		btrfs_bio = btrfs_io_bio(bio);
2608		btrfs_bio->csum = NULL;
2609		btrfs_bio->csum_allocated = NULL;
2610		btrfs_bio->end_io = NULL;
2611	}
2612	return bio;
2613}
2614
/* clone an existing bio, allocating the copy from the btrfs_bioset */
struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
}
2619
2620
2621/* this also allocates from the btrfs_bioset */
2622struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
2623{
2624	struct btrfs_io_bio *btrfs_bio;
2625	struct bio *bio;
2626
2627	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
2628	if (bio) {
2629		btrfs_bio = btrfs_io_bio(bio);
2630		btrfs_bio->csum = NULL;
2631		btrfs_bio->csum_allocated = NULL;
2632		btrfs_bio->end_io = NULL;
2633	}
2634	return bio;
2635}
2636
2637
2638static int __must_check submit_one_bio(int rw, struct bio *bio,
2639				       int mirror_num, unsigned long bio_flags)
2640{
2641	int ret = 0;
2642	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2643	struct page *page = bvec->bv_page;
2644	struct extent_io_tree *tree = bio->bi_private;
2645	u64 start;
2646
2647	start = page_offset(page) + bvec->bv_offset;
2648
2649	bio->bi_private = NULL;
2650
2651	bio_get(bio);
2652
2653	if (tree->ops && tree->ops->submit_bio_hook)
2654		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2655					   mirror_num, bio_flags, start);
2656	else
2657		btrfsic_submit_bio(rw, bio);
2658
2659	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2660		ret = -EOPNOTSUPP;
2661	bio_put(bio);
2662	return ret;
2663}
2664
2665static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2666		     unsigned long offset, size_t size, struct bio *bio,
2667		     unsigned long bio_flags)
2668{
2669	int ret = 0;
2670	if (tree->ops && tree->ops->merge_bio_hook)
2671		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2672						bio_flags);
2673	BUG_ON(ret < 0);
2674	return ret;
2675
2676}
2677
/*
 * Add [offset, offset + size) of @page to the bio cached in *bio_ret,
 * first submitting that bio when the new range cannot be merged into it
 * (flags differ, the range is not contiguous on disk, the fs merge hook
 * refuses, or bio_add_page() cannot take the whole range).
 *
 * When no cached bio can be reused a new one is allocated; it is stashed
 * in *bio_ret for further merging, or submitted immediately when bio_ret
 * is NULL.  Returns 0 or a negative errno.
 */
static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      unsigned long max_pages,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
{
	int ret = 0;
	struct bio *bio;
	int nr;
	int contig = 0;
	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		/*
		 * compressed bios compare against the bio's starting
		 * sector, everything else against its current end
		 */
		if (old_compressed)
			contig = bio->bi_iter.bi_sector == sector;
		else
			contig = bio_end_sector(bio) == sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			/* can't merge: send the old bio on its way */
			ret = submit_one_bio(rw, bio, mirror_num,
					     prev_bio_flags);
			if (ret < 0)
				return ret;
			bio = NULL;
		} else {
			/* range merged into the cached bio, all done */
			return 0;
		}
	}
	if (this_compressed)
		nr = BIO_MAX_PAGES;
	else
		nr = bio_get_nr_vecs(bdev);

	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
	if (!bio)
		return -ENOMEM;

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

	return ret;
}
2736
2737static void attach_extent_buffer_page(struct extent_buffer *eb,
2738				      struct page *page)
2739{
2740	if (!PagePrivate(page)) {
2741		SetPagePrivate(page);
2742		page_cache_get(page);
2743		set_page_private(page, (unsigned long)eb);
2744	} else {
2745		WARN_ON(page->private != (unsigned long)eb);
2746	}
2747}
2748
2749void set_page_extent_mapped(struct page *page)
2750{
2751	if (!PagePrivate(page)) {
2752		SetPagePrivate(page);
2753		page_cache_get(page);
2754		set_page_private(page, EXTENT_PAGE_PRIVATE);
2755	}
2756}
2757
/*
 * Look up the extent map covering @start, reusing the map cached in
 * *em_cached when it still covers the range.  On a cache miss the stale
 * cached map is dropped and the fresh one is cached in its place.
 * Returns a referenced map (caller must free_extent_map()), or the
 * NULL/ERR_PTR result of get_extent().
 */
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
		 u64 start, u64 len, get_extent_t *get_extent,
		 struct extent_map **em_cached)
{
	struct extent_map *em;

	if (em_cached && *em_cached) {
		em = *em_cached;
		if (extent_map_in_tree(em) && start >= em->start &&
		    start < extent_map_end(em)) {
			/* cache hit: hand out an extra reference */
			atomic_inc(&em->refs);
			return em;
		}

		/* stale cache entry: drop the cache's reference */
		free_extent_map(em);
		*em_cached = NULL;
	}

	em = get_extent(inode, page, pg_offset, start, len, 0);
	if (em_cached && !IS_ERR_OR_NULL(em)) {
		BUG_ON(*em_cached);
		/* take an extra reference for the cache slot */
		atomic_inc(&em->refs);
		*em_cached = em;
	}
	return em;
}
/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers).  Walks the extent maps covering the page: holes and ranges
 * past EOF are zeroed, ranges already EXTENT_UPTODATE are skipped, and
 * read bios are submitted for everything else.
 * XXX JDM: This needs looking at to ensure proper page locking
 */
static int __do_readpage(struct extent_io_tree *tree,
			 struct page *page,
			 get_extent_t *get_extent,
			 struct extent_map **em_cached,
			 struct bio **bio, int mirror_num,
			 unsigned long *bio_flags, int rw)
{
	struct inode *inode = page->mapping->host;
	u64 start = page_offset(page);
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	/* set when the caller already holds the extent range locked */
	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
	size_t pg_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	/* carry PARENT_LOCKED into the bios built for this page */
	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;

	set_page_extent_mapped(page);

	end = page_end;
	if (!PageUptodate(page)) {
		if (cleancache_get_page(page) == 0) {
			/* cleancache filled the whole page for us */
			BUG_ON(blocksize != PAGE_SIZE);
			unlock_extent(tree, start, end);
			goto out;
		}
	}

	/* zero the part of the last file page that lies beyond i_size */
	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

		if (zero_offset) {
			iosize = PAGE_CACHE_SIZE - zero_offset;
			userpage = kmap_atomic(page);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);
		}
	}
	while (cur <= end) {
		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;

		if (cur >= last_byte) {
			/* entirely past EOF: zero to the end of the page */
			char *userpage;
			struct extent_state *cached = NULL;

			iosize = PAGE_CACHE_SIZE - pg_offset;
			userpage = kmap_atomic(page);
			memset(userpage + pg_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			if (!parent_locked)
				unlock_extent_cached(tree, cur,
						     cur + iosize - 1,
						     &cached, GFP_NOFS);
			break;
		}
		em = __get_extent_map(inode, page, pg_offset, cur,
				      end - cur + 1, get_extent, em_cached);
		if (IS_ERR_OR_NULL(em)) {
			SetPageError(page);
			if (!parent_locked)
				unlock_extent(tree, cur, end);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			this_bio_flag |= EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&this_bio_flag,
						 em->compress_type);
		}

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = ALIGN(iosize, blocksize);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			/* compressed extents are read whole from disk */
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		/* prealloc extents read back as zeros, treat like a hole */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			struct extent_state *cached = NULL;

			userpage = kmap_atomic(page);
			memset(userpage + pg_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			unlock_extent_cached(tree, cur, cur + iosize - 1,
			                     &cached, GFP_NOFS);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			if (!parent_locked)
				unlock_extent(tree, cur, cur + iosize - 1);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.  Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			if (!parent_locked)
				unlock_extent(tree, cur, cur + iosize - 1);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		pnr -= page->index;
		ret = submit_extent_page(rw, tree, page,
					 sector, disk_io_size, pg_offset,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
		if (!ret) {
			nr++;
			*bio_flags = this_bio_flag;
		} else {
			SetPageError(page);
			if (!parent_locked)
				unlock_extent(tree, cur, cur + iosize - 1);
		}
		cur = cur + iosize;
		pg_offset += iosize;
	}
out:
	if (!nr) {
		/* no IO was started, the page can be finished right here */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}
2963
2964static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
2965					     struct page *pages[], int nr_pages,
2966					     u64 start, u64 end,
2967					     get_extent_t *get_extent,
2968					     struct extent_map **em_cached,
2969					     struct bio **bio, int mirror_num,
2970					     unsigned long *bio_flags, int rw)
2971{
2972	struct inode *inode;
2973	struct btrfs_ordered_extent *ordered;
2974	int index;
2975
2976	inode = pages[0]->mapping->host;
2977	while (1) {
2978		lock_extent(tree, start, end);
2979		ordered = btrfs_lookup_ordered_range(inode, start,
2980						     end - start + 1);
2981		if (!ordered)
2982			break;
2983		unlock_extent(tree, start, end);
2984		btrfs_start_ordered_extent(inode, ordered, 1);
2985		btrfs_put_ordered_extent(ordered);
2986	}
2987
2988	for (index = 0; index < nr_pages; index++) {
2989		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
2990			      mirror_num, bio_flags, rw);
2991		page_cache_release(pages[index]);
2992	}
2993}
2994
/*
 * Submit reads for an array of pages, batching runs of pages that are
 * contiguous in the file so each run only locks its extent range once.
 */
static void __extent_readpages(struct extent_io_tree *tree,
			       struct page *pages[],
			       int nr_pages, get_extent_t *get_extent,
			       struct extent_map **em_cached,
			       struct bio **bio, int mirror_num,
			       unsigned long *bio_flags, int rw)
{
	u64 start = 0;
	u64 end = 0;	/* 0 means "no run open yet" */
	u64 page_start;
	int index;
	int first_index = 0;

	for (index = 0; index < nr_pages; index++) {
		page_start = page_offset(pages[index]);
		if (!end) {
			/* open the first run */
			start = page_start;
			end = start + PAGE_CACHE_SIZE - 1;
			first_index = index;
		} else if (end + 1 == page_start) {
			/* this page extends the current run */
			end += PAGE_CACHE_SIZE;
		} else {
			/* discontiguity: flush the run, start a new one */
			__do_contiguous_readpages(tree, &pages[first_index],
						  index - first_index, start,
						  end, get_extent, em_cached,
						  bio, mirror_num, bio_flags,
						  rw);
			start = page_start;
			end = start + PAGE_CACHE_SIZE - 1;
			first_index = index;
		}
	}

	/* flush the final run, if any */
	if (end)
		__do_contiguous_readpages(tree, &pages[first_index],
					  index - first_index, start,
					  end, get_extent, em_cached, bio,
					  mirror_num, bio_flags, rw);
}
3034
3035static int __extent_read_full_page(struct extent_io_tree *tree,
3036				   struct page *page,
3037				   get_extent_t *get_extent,
3038				   struct bio **bio, int mirror_num,
3039				   unsigned long *bio_flags, int rw)
3040{
3041	struct inode *inode = page->mapping->host;
3042	struct btrfs_ordered_extent *ordered;
3043	u64 start = page_offset(page);
3044	u64 end = start + PAGE_CACHE_SIZE - 1;
3045	int ret;
3046
3047	while (1) {
3048		lock_extent(tree, start, end);
3049		ordered = btrfs_lookup_ordered_extent(inode, start);
3050		if (!ordered)
3051			break;
3052		unlock_extent(tree, start, end);
3053		btrfs_start_ordered_extent(inode, ordered, 1);
3054		btrfs_put_ordered_extent(ordered);
3055	}
3056
3057	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3058			    bio_flags, rw);
3059	return ret;
3060}
3061
3062int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3063			    get_extent_t *get_extent, int mirror_num)
3064{
3065	struct bio *bio = NULL;
3066	unsigned long bio_flags = 0;
3067	int ret;
3068
3069	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3070				      &bio_flags, READ);
3071	if (bio)
3072		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3073	return ret;
3074}
3075
3076int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
3077				 get_extent_t *get_extent, int mirror_num)
3078{
3079	struct bio *bio = NULL;
3080	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
3081	int ret;
3082
3083	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
3084				      &bio_flags, READ);
3085	if (bio)
3086		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3087	return ret;
3088}
3089
/*
 * Account @nr_written pages against wbc->nr_to_write and, for sweeps that
 * cover the whole file (cyclic writeback, or an explicit 0..LLONG_MAX
 * range with budget left), advance the mapping's writeback_index so the
 * next sweep resumes right after this page.
 */
static noinline void update_nr_written(struct page *page,
				      struct writeback_control *wbc,
				      unsigned long nr_written)
{
	wbc->nr_to_write -= nr_written;
	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
		page->mapping->writeback_index = page->index + nr_written;
}
3099
/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
			      void *data)
{
	struct inode *inode = page->mapping->host;
	struct extent_page_data *epd = data;
	struct extent_io_tree *tree = epd->tree;
	u64 start = page_offset(page);
	u64 delalloc_start;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 iosize;
	sector_t sector;
	struct extent_state *cached_state = NULL;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t pg_offset = 0;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;
	int page_started;
	int compressed;
	int write_flags;
	unsigned long nr_written = 0;
	bool fill_delalloc = true;

	if (wbc->sync_mode == WB_SYNC_ALL)
		write_flags = WRITE_SYNC;
	else
		write_flags = WRITE;

	trace___extent_writepage(page, inode, wbc);

	WARN_ON(!PageLocked(page));

	ClearPageError(page);

	/* pages fully beyond i_size are truncated away, nothing to write */
	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
	if (page->index > end_index ||
	   (page->index == end_index && !pg_offset)) {
		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
		unlock_page(page);
		return 0;
	}

	/* zero the tail of the last page beyond i_size before writing */
	if (page->index == end_index) {
		char *userpage;

		userpage = kmap_atomic(page);
		memset(userpage + pg_offset, 0,
		       PAGE_CACHE_SIZE - pg_offset);
		kunmap_atomic(userpage);
		flush_dcache_page(page);
	}
	pg_offset = 0;

	set_page_extent_mapped(page);

	if (!tree->ops || !tree->ops->fill_delalloc)
		fill_delalloc = false;

	delalloc_start = start;
	delalloc_end = 0;
	page_started = 0;
	if (!epd->extent_locked && fill_delalloc) {
		u64 delalloc_to_write = 0;
		/*
		 * make sure the wbc mapping index is at least updated
		 * to this page.
		 */
		update_nr_written(page, wbc, 0);

		/* allocate on-disk space for all delalloc under this page */
		while (delalloc_end < page_end) {
			nr_delalloc = find_lock_delalloc_range(inode, tree,
						       page,
						       &delalloc_start,
						       &delalloc_end,
						       128 * 1024 * 1024);
			if (nr_delalloc == 0) {
				delalloc_start = delalloc_end + 1;
				continue;
			}
			ret = tree->ops->fill_delalloc(inode, page,
						       delalloc_start,
						       delalloc_end,
						       &page_started,
						       &nr_written);
			/* File system has been set read-only */
			if (ret) {
				SetPageError(page);
				goto done;
			}
			/*
			 * delalloc_end is already one less than the total
			 * length, so we don't subtract one from
			 * PAGE_CACHE_SIZE
			 */
			delalloc_to_write += (delalloc_end - delalloc_start +
					      PAGE_CACHE_SIZE) >>
					      PAGE_CACHE_SHIFT;
			delalloc_start = delalloc_end + 1;
		}
		/* grow the writeback budget to cover the delalloc we found */
		if (wbc->nr_to_write < delalloc_to_write) {
			int thresh = 8192;

			if (delalloc_to_write < thresh * 2)
				thresh = delalloc_to_write;
			wbc->nr_to_write = min_t(u64, delalloc_to_write,
						 thresh);
		}

		/* did the fill delalloc function already unlock and start
		 * the IO?
		 */
		if (page_started) {
			ret = 0;
			/*
			 * we've unlocked the page, so we can't update
			 * the mapping's writeback index, just update
			 * nr_to_write.
			 */
			wbc->nr_to_write -= nr_written;
			goto done_unlocked;
		}
	}
	if (tree->ops && tree->ops->writepage_start_hook) {
		ret = tree->ops->writepage_start_hook(page, start,
						      page_end);
		if (ret) {
			/* Fixup worker will requeue */
			if (ret == -EBUSY)
				wbc->pages_skipped++;
			else
				redirty_page_for_writepage(wbc, page);
			update_nr_written(page, wbc, nr_written);
			unlock_page(page);
			ret = 0;
			goto done_unlocked;
		}
	}

	/*
	 * we don't want to touch the inode after unlocking the page,
	 * so we update the mapping writeback index now
	 */
	update_nr_written(page, wbc, nr_written + 1);

	end = page_end;
	if (last_byte <= start) {
		/* the whole page is beyond EOF, report it done */
		if (tree->ops && tree->ops->writepage_end_io_hook)
			tree->ops->writepage_end_io_hook(page, start,
							 page_end, NULL, 1);
		goto done;
	}

	blocksize = inode->i_sb->s_blocksize;

	/* walk the extent maps under the page and submit write bios */
	while (cur <= end) {
		if (cur >= last_byte) {
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 page_end, NULL, 1);
			break;
		}
		em = epd->get_extent(inode, page, pg_offset, cur,
				     end - cur + 1, 1);
		if (IS_ERR_OR_NULL(em)) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		iosize = ALIGN(iosize, blocksize);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			/*
			 * end_io notification does not happen here for
			 * compressed extents
			 */
			if (!compressed && tree->ops &&
			    tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 cur + iosize - 1,
							 NULL, 1);
			else if (compressed) {
				/* we don't want to end_page_writeback on
				 * a compressed extent.  this happens
				 * elsewhere
				 */
				nr++;
			}

			cur += iosize;
			pg_offset += iosize;
			continue;
		}
		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
				   EXTENT_DIRTY, 0, NULL)) {
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		if (tree->ops && tree->ops->writepage_io_hook) {
			ret = tree->ops->writepage_io_hook(page, cur,
						cur + iosize - 1);
		} else {
			ret = 0;
		}
		if (ret) {
			SetPageError(page);
		} else {
			unsigned long max_nr = end_index + 1;

			set_range_writeback(tree, cur, cur + iosize - 1);
			if (!PageWriteback(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					   "page %lu not writeback, cur %llu end %llu",
				       page->index, cur, end);
			}

			ret = submit_extent_page(write_flags, tree, page,
						 sector, iosize, pg_offset,
						 bdev, &epd->bio, max_nr,
						 end_bio_extent_writepage,
						 0, 0, 0);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		pg_offset += iosize;
		nr++;
	}
done:
	if (nr == 0) {
		/* make sure the mapping tag for page dirty gets cleared */
		set_page_writeback(page);
		end_page_writeback(page);
	}
	unlock_page(page);

done_unlocked:

	/* drop our reference on any cached states */
	free_extent_state(cached_state);
	return 0;
}
3375
/*
 * wait_on_bit() action: runs while EXTENT_BUFFER_WRITEBACK is still set;
 * just yield to the IO scheduler until the bit waker wakes us.
 */
static int eb_wait(void *word)
{
	io_schedule();
	return 0;
}
3381
/* block (uninterruptibly) until the eb's WRITEBACK bit is cleared */
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
		    TASK_UNINTERRUPTIBLE);
}
3387
/*
 * Prepare an extent buffer for writeback: take the tree write lock, wait
 * out (or, for non-sync writeback, skip) any IO already in flight, move
 * the buffer from DIRTY to WRITEBACK state and lock all of its pages.
 * epd->bio is flushed whenever we are about to block, to avoid deadlocks
 * against IO we have queued ourselves.
 *
 * Returns 1 when the buffer is ready to be written (all pages locked),
 * 0 when there is nothing to write.
 */
static int lock_extent_buffer_for_io(struct extent_buffer *eb,
				     struct btrfs_fs_info *fs_info,
				     struct extent_page_data *epd)
{
	unsigned long i, num_pages;
	int flush = 0;
	int ret = 0;

	if (!btrfs_try_tree_write_lock(eb)) {
		/* flush our pending IO before blocking on the lock */
		flush = 1;
		flush_write_bio(epd);
		btrfs_tree_lock(eb);
	}

	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
		btrfs_tree_unlock(eb);
		/* best-effort writeback just skips busy buffers */
		if (!epd->sync_io)
			return 0;
		if (!flush) {
			flush_write_bio(epd);
			flush = 1;
		}
		while (1) {
			wait_on_extent_buffer_writeback(eb);
			btrfs_tree_lock(eb);
			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				break;
			btrfs_tree_unlock(eb);
		}
	}

	/*
	 * We need to do this to prevent races in people who check if the eb is
	 * under IO since we can end up having no IO bits set for a short period
	 * of time.
	 */
	spin_lock(&eb->refs_lock);
	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
		spin_unlock(&eb->refs_lock);
		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
				     -eb->len,
				     fs_info->dirty_metadata_batch);
		ret = 1;
	} else {
		spin_unlock(&eb->refs_lock);
	}

	btrfs_tree_unlock(eb);

	if (!ret)
		return ret;

	/* lock every page of the buffer, flushing before any blocking lock */
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);

		if (!trylock_page(p)) {
			if (!flush) {
				flush_write_bio(epd);
				flush = 1;
			}
			lock_page(p);
		}
	}

	return ret;
}
3457
/*
 * Clear the eb's WRITEBACK bit and wake anyone sleeping on it.  The
 * barrier after the clear orders the bit update before the waitqueue
 * check inside wake_up_bit() (pairing with the waiter side).
 */
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	smp_mb__after_clear_bit();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
3464
/*
 * Bio completion for extent buffer (btree) writeback.  Each page's eb is
 * recovered from page->private; on error the eb is flagged IOERR and the
 * page marked bad.  The eb's writeback state ends once its last
 * outstanding page (io_pages) completes.
 */
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	struct extent_buffer *eb;
	int i, done;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;

		eb = (struct extent_buffer *)page->private;
		BUG_ON(!eb);
		done = atomic_dec_and_test(&eb->io_pages);

		if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		end_page_writeback(page);

		/* only the last page to complete ends eb writeback */
		if (!done)
			continue;

		end_extent_buffer_writeback(eb);
	}

	bio_put(bio);
}
3494
/*
 * Write every page of a metadata extent buffer.  The buffer must already
 * be locked for IO (lock_extent_buffer_for_io()); its pages are unlocked
 * as they are queued.  On a submission failure the never-submitted pages
 * are subtracted from io_pages so writeback can end, and the remaining
 * pages are unlocked.  Returns 0 or -EIO.
 */
static int write_one_eb(struct extent_buffer *eb,
			struct btrfs_fs_info *fs_info,
			struct writeback_control *wbc,
			struct extent_page_data *epd)
{
	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	u64 offset = eb->start;
	unsigned long i, num_pages;
	unsigned long bio_flags = 0;
	/* metadata writes carry REQ_META; WB_SYNC_ALL gets WRITE_SYNC */
	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
	int ret = 0;

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	atomic_set(&eb->io_pages, num_pages);
	/* flag tree-log blocks so they can be told apart downstream */
	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
		bio_flags = EXTENT_BIO_TREE_LOG;

	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);

		clear_page_dirty_for_io(p);
		set_page_writeback(p);
		ret = submit_extent_page(rw, tree, p, offset >> 9,
					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
					 -1, end_bio_extent_buffer_writepage,
					 0, epd->bio_flags, bio_flags);
		epd->bio_flags = bio_flags;
		if (ret) {
			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
			SetPageError(p);
			/* drop the io_pages counts we'll never submit */
			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				end_extent_buffer_writeback(eb);
			ret = -EIO;
			break;
		}
		offset += PAGE_CACHE_SIZE;
		update_nr_written(p, wbc, 1);
		unlock_page(p);
	}

	if (unlikely(ret)) {
		/* unlock the pages we never got to submit */
		for (; i < num_pages; i++) {
			struct page *p = extent_buffer_page(eb, i);
			unlock_page(p);
		}
	}

	return ret;
}
3546
/*
 * Write dirty btree pages for @mapping.  This mirrors the generic
 * write_cache_pages() pagevec walk, but operates on whole extent buffers:
 * each buffer found via page->private is locked for IO once and all of
 * its pages are submitted together by write_one_eb().
 */
int btree_write_cache_pages(struct address_space *mapping,
				   struct writeback_control *wbc)
{
	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
	struct extent_buffer *eb, *prev_eb = NULL;
	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
		.bio_flags = 0,
	};
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		scanned = 1;
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* cheap unlocked check, redone under the lock below */
			if (!PagePrivate(page))
				continue;

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				break;
			}

			/* private_lock keeps page->private -> eb stable */
			spin_lock(&mapping->private_lock);
			if (!PagePrivate(page)) {
				spin_unlock(&mapping->private_lock);
				continue;
			}

			eb = (struct extent_buffer *)page->private;

			/*
			 * Shouldn't happen and normally this would be a BUG_ON
			 * but no sense in crashing the users box for something
			 * we can survive anyway.
			 */
			if (WARN_ON(!eb)) {
				spin_unlock(&mapping->private_lock);
				continue;
			}

			/* further pages of the eb we just processed */
			if (eb == prev_eb) {
				spin_unlock(&mapping->private_lock);
				continue;
			}

			ret = atomic_inc_not_zero(&eb->refs);
			spin_unlock(&mapping->private_lock);
			if (!ret)
				continue;

			prev_eb = eb;
			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
			if (!ret) {
				/* nothing to write for this buffer */
				free_extent_buffer(eb);
				continue;
			}

			ret = write_one_eb(eb, fs_info, wbc, &epd);
			if (ret) {
				done = 1;
				free_extent_buffer(eb);
				break;
			}
			free_extent_buffer(eb);

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	flush_write_bio(&epd);
	return ret;
}
3668
3669/**
3670 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3671 * @mapping: address space structure to write
3672 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3673 * @writepage: function called for each page
3674 * @data: data passed to writepage function
3675 *
3676 * If a page is already under I/O, write_cache_pages() skips it, even
3677 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3678 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3679 * and msync() need to guarantee that all the data which was dirty at the time
3680 * the call was made get new I/O started against them.  If wbc->sync_mode is
3681 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3682 * existing IO to complete.
3683 */
3684static int extent_write_cache_pages(struct extent_io_tree *tree,
3685			     struct address_space *mapping,
3686			     struct writeback_control *wbc,
3687			     writepage_t writepage, void *data,
3688			     void (*flush_fn)(void *))
3689{
3690	struct inode *inode = mapping->host;
3691	int ret = 0;
3692	int done = 0;
3693	int nr_to_write_done = 0;
3694	struct pagevec pvec;
3695	int nr_pages;
3696	pgoff_t index;
3697	pgoff_t end;		/* Inclusive */
3698	int scanned = 0;
3699	int tag;
3700
3701	/*
3702	 * We have to hold onto the inode so that ordered extents can do their
3703	 * work when the IO finishes.  The alternative to this is failing to add
3704	 * an ordered extent if the igrab() fails there and that is a huge pain
3705	 * to deal with, so instead just hold onto the inode throughout the
3706	 * writepages operation.  If it fails here we are freeing up the inode
3707	 * anyway and we'd rather not waste our time writing out stuff that is
3708	 * going to be truncated anyway.
3709	 */
3710	if (!igrab(inode))
3711		return 0;
3712
3713	pagevec_init(&pvec, 0);
3714	if (wbc->range_cyclic) {
3715		index = mapping->writeback_index; /* Start from prev offset */
3716		end = -1;
3717	} else {
3718		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3719		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3720		scanned = 1;
3721	}
3722	if (wbc->sync_mode == WB_SYNC_ALL)
3723		tag = PAGECACHE_TAG_TOWRITE;
3724	else
3725		tag = PAGECACHE_TAG_DIRTY;
3726retry:
3727	if (wbc->sync_mode == WB_SYNC_ALL)
3728		tag_pages_for_writeback(mapping, index, end);
3729	while (!done && !nr_to_write_done && (index <= end) &&
3730	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3731			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3732		unsigned i;
3733
3734		scanned = 1;
3735		for (i = 0; i < nr_pages; i++) {
3736			struct page *page = pvec.pages[i];
3737
3738			/*
3739			 * At this point we hold neither mapping->tree_lock nor
3740			 * lock on the page itself: the page may be truncated or
3741			 * invalidated (changing page->mapping to NULL), or even
3742			 * swizzled back from swapper_space to tmpfs file
3743			 * mapping
3744			 */
3745			if (!trylock_page(page)) {
3746				flush_fn(data);
3747				lock_page(page);
 
 
 
 
 
 
3748			}
3749
3750			if (unlikely(page->mapping != mapping)) {
3751				unlock_page(page);
3752				continue;
3753			}
3754
3755			if (!wbc->range_cyclic && page->index > end) {
3756				done = 1;
3757				unlock_page(page);
3758				continue;
3759			}
3760
3761			if (wbc->sync_mode != WB_SYNC_NONE) {
3762				if (PageWriteback(page))
3763					flush_fn(data);
3764				wait_on_page_writeback(page);
3765			}
3766
3767			if (PageWriteback(page) ||
3768			    !clear_page_dirty_for_io(page)) {
3769				unlock_page(page);
3770				continue;
3771			}
3772
3773			ret = (*writepage)(page, wbc, data);
3774
3775			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3776				unlock_page(page);
3777				ret = 0;
3778			}
3779			if (ret)
3780				done = 1;
3781
3782			/*
3783			 * the filesystem may choose to bump up nr_to_write.
3784			 * We have to make sure to honor the new nr_to_write
3785			 * at any time
3786			 */
3787			nr_to_write_done = wbc->nr_to_write <= 0;
3788		}
3789		pagevec_release(&pvec);
3790		cond_resched();
3791	}
3792	if (!scanned && !done) {
3793		/*
3794		 * We hit the last page and there is more work to be done: wrap
3795		 * back to the start of the file
3796		 */
3797		scanned = 1;
3798		index = 0;
3799		goto retry;
3800	}
3801	btrfs_add_delayed_iput(inode);
3802	return ret;
3803}
3804
3805static void flush_epd_write_bio(struct extent_page_data *epd)
3806{
3807	if (epd->bio) {
3808		int rw = WRITE;
3809		int ret;
3810
3811		if (epd->sync_io)
3812			rw = WRITE_SYNC;
3813
3814		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3815		BUG_ON(ret < 0); /* -ENOMEM */
3816		epd->bio = NULL;
3817	}
3818}
3819
/* void* adapter so flush_epd_write_bio() can be passed as a flush_fn callback. */
static noinline void flush_write_bio(void *data)
{
	struct extent_page_data *epd = data;
	flush_epd_write_bio(epd);
}
3825
3826int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3827			  get_extent_t *get_extent,
3828			  struct writeback_control *wbc)
3829{
3830	int ret;
3831	struct extent_page_data epd = {
3832		.bio = NULL,
3833		.tree = tree,
3834		.get_extent = get_extent,
3835		.extent_locked = 0,
3836		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3837		.bio_flags = 0,
3838	};
3839
3840	ret = __extent_writepage(page, wbc, &epd);
3841
3842	flush_epd_write_bio(&epd);
3843	return ret;
3844}
3845
/*
 * Write out the pages covering [start, end] of @inode.  Runs with
 * epd.extent_locked set: the caller already holds the extent range
 * locked, so __extent_writepage skips the state locking and only
 * unlocks.
 */
int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
			      u64 start, u64 end, get_extent_t *get_extent,
			      int mode)
{
	int ret = 0;
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	/* page count of the inclusive byte range */
	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
		.extent_locked = 1,
		.sync_io = mode == WB_SYNC_ALL,
		.bio_flags = 0,
	};
	struct writeback_control wbc_writepages = {
		.sync_mode	= mode,
		.nr_to_write	= nr_pages * 2,
		.range_start	= start,
		.range_end	= end + 1,
	};

	while (start <= end) {
		/*
		 * NOTE(review): find_get_page() may return NULL in general;
		 * this code assumes the locked range pins every page in the
		 * cache — confirm against callers before relying on it.
		 */
		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
		if (clear_page_dirty_for_io(page))
			ret = __extent_writepage(page, &wbc_writepages, &epd);
		else {
			/* page already clean: still run the end_io hook and unlock */
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, start,
						 start + PAGE_CACHE_SIZE - 1,
						 NULL, 1);
			unlock_page(page);
		}
		page_cache_release(page);
		start += PAGE_CACHE_SIZE;
	}

	flush_epd_write_bio(&epd);
	return ret;
}
3889
3890int extent_writepages(struct extent_io_tree *tree,
3891		      struct address_space *mapping,
3892		      get_extent_t *get_extent,
3893		      struct writeback_control *wbc)
3894{
3895	int ret = 0;
3896	struct extent_page_data epd = {
3897		.bio = NULL,
3898		.tree = tree,
3899		.get_extent = get_extent,
3900		.extent_locked = 0,
3901		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3902		.bio_flags = 0,
3903	};
3904
3905	ret = extent_write_cache_pages(tree, mapping, wbc,
3906				       __extent_writepage, &epd,
3907				       flush_write_bio);
3908	flush_epd_write_bio(&epd);
3909	return ret;
3910}
3911
3912int extent_readpages(struct extent_io_tree *tree,
3913		     struct address_space *mapping,
3914		     struct list_head *pages, unsigned nr_pages,
3915		     get_extent_t get_extent)
3916{
3917	struct bio *bio = NULL;
3918	unsigned page_idx;
3919	unsigned long bio_flags = 0;
3920	struct page *pagepool[16];
3921	struct page *page;
3922	struct extent_map *em_cached = NULL;
3923	int nr = 0;
3924
3925	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3926		page = list_entry(pages->prev, struct page, lru);
3927
3928		prefetchw(&page->flags);
3929		list_del(&page->lru);
3930		if (add_to_page_cache_lru(page, mapping,
3931					page->index, GFP_NOFS)) {
3932			page_cache_release(page);
3933			continue;
3934		}
3935
3936		pagepool[nr++] = page;
3937		if (nr < ARRAY_SIZE(pagepool))
3938			continue;
3939		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
3940				   &bio, 0, &bio_flags, READ);
3941		nr = 0;
3942	}
3943	if (nr)
3944		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
3945				   &bio, 0, &bio_flags, READ);
3946
3947	if (em_cached)
3948		free_extent_map(em_cached);
3949
3950	BUG_ON(!list_empty(pages));
3951	if (bio)
3952		return submit_one_bio(READ, bio, 0, bio_flags);
3953	return 0;
3954}
3955
3956/*
3957 * basic invalidatepage code, this waits on any locked or writeback
3958 * ranges corresponding to the page, and then deletes any extent state
3959 * records from the tree
3960 */
3961int extent_invalidatepage(struct extent_io_tree *tree,
3962			  struct page *page, unsigned long offset)
3963{
3964	struct extent_state *cached_state = NULL;
3965	u64 start = page_offset(page);
3966	u64 end = start + PAGE_CACHE_SIZE - 1;
3967	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3968
3969	start += ALIGN(offset, blocksize);
3970	if (start > end)
3971		return 0;
3972
3973	lock_extent_bits(tree, start, end, 0, &cached_state);
3974	wait_on_page_writeback(page);
3975	clear_extent_bit(tree, start, end,
3976			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3977			 EXTENT_DO_ACCOUNTING,
3978			 1, 1, &cached_state, GFP_NOFS);
3979	return 0;
3980}
3981
3982/*
3983 * a helper for releasepage, this tests for areas of the page that
3984 * are locked or under IO and drops the related state bits if it is safe
3985 * to drop the page.
3986 */
3987static int try_release_extent_state(struct extent_map_tree *map,
3988				    struct extent_io_tree *tree,
3989				    struct page *page, gfp_t mask)
3990{
3991	u64 start = page_offset(page);
3992	u64 end = start + PAGE_CACHE_SIZE - 1;
3993	int ret = 1;
3994
3995	if (test_range_bit(tree, start, end,
3996			   EXTENT_IOBITS, 0, NULL))
3997		ret = 0;
3998	else {
3999		if ((mask & GFP_NOFS) == GFP_NOFS)
4000			mask = GFP_NOFS;
4001		/*
4002		 * at this point we can safely clear everything except the
4003		 * locked bit and the nodatasum bit
4004		 */
4005		ret = clear_extent_bit(tree, start, end,
4006				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4007				 0, 0, NULL, mask);
4008
4009		/* if clear_extent_bit failed for enomem reasons,
4010		 * we can't allow the release to continue.
4011		 */
4012		if (ret < 0)
4013			ret = 0;
4014		else
4015			ret = 1;
4016	}
4017	return ret;
4018}
4019
4020/*
4021 * a helper for releasepage.  As long as there are no locked extents
4022 * in the range corresponding to the page, both state records and extent
4023 * map records are removed
4024 */
4025int try_release_extent_mapping(struct extent_map_tree *map,
4026			       struct extent_io_tree *tree, struct page *page,
4027			       gfp_t mask)
4028{
4029	struct extent_map *em;
4030	u64 start = page_offset(page);
4031	u64 end = start + PAGE_CACHE_SIZE - 1;
4032
4033	if ((mask & __GFP_WAIT) &&
4034	    page->mapping->host->i_size > 16 * 1024 * 1024) {
4035		u64 len;
4036		while (start <= end) {
4037			len = end - start + 1;
4038			write_lock(&map->lock);
4039			em = lookup_extent_mapping(map, start, len);
4040			if (!em) {
4041				write_unlock(&map->lock);
4042				break;
4043			}
4044			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4045			    em->start != start) {
4046				write_unlock(&map->lock);
4047				free_extent_map(em);
4048				break;
4049			}
4050			if (!test_range_bit(tree, em->start,
4051					    extent_map_end(em) - 1,
4052					    EXTENT_LOCKED | EXTENT_WRITEBACK,
4053					    0, NULL)) {
4054				remove_extent_mapping(map, em);
4055				/* once for the rb tree */
4056				free_extent_map(em);
4057			}
4058			start = extent_map_end(em);
4059			write_unlock(&map->lock);
4060
4061			/* once for us */
4062			free_extent_map(em);
4063		}
4064	}
4065	return try_release_extent_state(map, tree, page, mask);
4066}
4067
4068/*
4069 * helper function for fiemap, which doesn't want to see any holes.
4070 * This maps until we find something past 'last'
4071 */
4072static struct extent_map *get_extent_skip_holes(struct inode *inode,
4073						u64 offset,
4074						u64 last,
4075						get_extent_t *get_extent)
4076{
4077	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
4078	struct extent_map *em;
4079	u64 len;
4080
4081	if (offset >= last)
4082		return NULL;
4083
4084	while (1) {
4085		len = last - offset;
4086		if (len == 0)
4087			break;
4088		len = ALIGN(len, sectorsize);
4089		em = get_extent(inode, NULL, 0, offset, len, 0);
4090		if (IS_ERR_OR_NULL(em))
4091			return em;
4092
4093		/* if this isn't a hole return it */
4094		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
4095		    em->block_start != EXTENT_MAP_HOLE) {
4096			return em;
4097		}
4098
4099		/* this is a hole, advance to the next extent */
4100		offset = extent_map_end(em);
4101		free_extent_map(em);
4102		if (offset >= last)
4103			break;
4104	}
4105	return NULL;
4106}
4107
4108static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
4109{
4110	unsigned long cnt = *((unsigned long *)ctx);
4111
4112	cnt++;
4113	*((unsigned long *)ctx) = cnt;
4114
4115	/* Now we're sure that the extent is shared. */
4116	if (cnt > 1)
4117		return 1;
4118	return 0;
4119}
4120
4121int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4122		__u64 start, __u64 len, get_extent_t *get_extent)
4123{
4124	int ret = 0;
4125	u64 off = start;
4126	u64 max = start + len;
4127	u32 flags = 0;
4128	u32 found_type;
4129	u64 last;
4130	u64 last_for_get_extent = 0;
4131	u64 disko = 0;
4132	u64 isize = i_size_read(inode);
4133	struct btrfs_key found_key;
4134	struct extent_map *em = NULL;
4135	struct extent_state *cached_state = NULL;
4136	struct btrfs_path *path;
 
4137	int end = 0;
4138	u64 em_start = 0;
4139	u64 em_len = 0;
4140	u64 em_end = 0;
 
4141
4142	if (len == 0)
4143		return -EINVAL;
4144
4145	path = btrfs_alloc_path();
4146	if (!path)
4147		return -ENOMEM;
4148	path->leave_spinning = 1;
4149
4150	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
4151	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
4152
4153	/*
4154	 * lookup the last file extent.  We're not using i_size here
4155	 * because there might be preallocation past i_size
4156	 */
4157	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
4158				       path, btrfs_ino(inode), -1, 0);
4159	if (ret < 0) {
4160		btrfs_free_path(path);
4161		return ret;
4162	}
4163	WARN_ON(!ret);
4164	path->slots[0]--;
 
 
4165	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4166	found_type = btrfs_key_type(&found_key);
4167
4168	/* No extents, but there might be delalloc bits */
4169	if (found_key.objectid != btrfs_ino(inode) ||
4170	    found_type != BTRFS_EXTENT_DATA_KEY) {
4171		/* have to trust i_size as the end */
4172		last = (u64)-1;
4173		last_for_get_extent = isize;
4174	} else {
4175		/*
4176		 * remember the start of the last extent.  There are a
4177		 * bunch of different factors that go into the length of the
4178		 * extent, so its much less complex to remember where it started
4179		 */
4180		last = found_key.offset;
4181		last_for_get_extent = last + 1;
4182	}
4183	btrfs_release_path(path);
4184
4185	/*
4186	 * we might have some extents allocated but more delalloc past those
4187	 * extents.  so, we trust isize unless the start of the last extent is
4188	 * beyond isize
4189	 */
4190	if (last < isize) {
4191		last = (u64)-1;
4192		last_for_get_extent = isize;
4193	}
4194
4195	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
4196			 &cached_state);
4197
4198	em = get_extent_skip_holes(inode, start, last_for_get_extent,
4199				   get_extent);
4200	if (!em)
4201		goto out;
4202	if (IS_ERR(em)) {
4203		ret = PTR_ERR(em);
4204		goto out;
4205	}
4206
4207	while (!end) {
4208		u64 offset_in_extent = 0;
4209
4210		/* break if the extent we found is outside the range */
4211		if (em->start >= max || extent_map_end(em) < off)
4212			break;
4213
4214		/*
4215		 * get_extent may return an extent that starts before our
4216		 * requested range.  We have to make sure the ranges
4217		 * we return to fiemap always move forward and don't
4218		 * overlap, so adjust the offsets here
4219		 */
4220		em_start = max(em->start, off);
4221
4222		/*
4223		 * record the offset from the start of the extent
4224		 * for adjusting the disk offset below.  Only do this if the
4225		 * extent isn't compressed since our in ram offset may be past
4226		 * what we have actually allocated on disk.
4227		 */
4228		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4229			offset_in_extent = em_start - em->start;
4230		em_end = extent_map_end(em);
4231		em_len = em_end - em_start;
 
4232		disko = 0;
4233		flags = 0;
4234
4235		/*
4236		 * bump off for our next call to get_extent
4237		 */
4238		off = extent_map_end(em);
4239		if (off >= max)
4240			end = 1;
4241
4242		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4243			end = 1;
4244			flags |= FIEMAP_EXTENT_LAST;
4245		} else if (em->block_start == EXTENT_MAP_INLINE) {
4246			flags |= (FIEMAP_EXTENT_DATA_INLINE |
4247				  FIEMAP_EXTENT_NOT_ALIGNED);
4248		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
4249			flags |= (FIEMAP_EXTENT_DELALLOC |
4250				  FIEMAP_EXTENT_UNKNOWN);
4251		} else {
4252			unsigned long ref_cnt = 0;
4253
4254			disko = em->block_start + offset_in_extent;
4255
4256			/*
4257			 * As btrfs supports shared space, this information
4258			 * can be exported to userspace tools via
4259			 * flag FIEMAP_EXTENT_SHARED.
4260			 */
4261			ret = iterate_inodes_from_logical(
4262					em->block_start,
4263					BTRFS_I(inode)->root->fs_info,
4264					path, count_ext_ref, &ref_cnt);
4265			if (ret < 0 && ret != -ENOENT)
4266				goto out_free;
4267
4268			if (ref_cnt > 1)
4269				flags |= FIEMAP_EXTENT_SHARED;
4270		}
4271		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4272			flags |= FIEMAP_EXTENT_ENCODED;
4273
4274		free_extent_map(em);
4275		em = NULL;
4276		if ((em_start >= last) || em_len == (u64)-1 ||
4277		   (last == (u64)-1 && isize <= em_end)) {
4278			flags |= FIEMAP_EXTENT_LAST;
4279			end = 1;
4280		}
4281
4282		/* now scan forward to see if this is really the last extent. */
4283		em = get_extent_skip_holes(inode, off, last_for_get_extent,
4284					   get_extent);
4285		if (IS_ERR(em)) {
4286			ret = PTR_ERR(em);
4287			goto out;
4288		}
4289		if (!em) {
4290			flags |= FIEMAP_EXTENT_LAST;
4291			end = 1;
4292		}
4293		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4294					      em_len, flags);
4295		if (ret)
4296			goto out_free;
4297	}
4298out_free:
4299	free_extent_map(em);
4300out:
4301	btrfs_free_path(path);
4302	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4303			     &cached_state, GFP_NOFS);
4304	return ret;
4305}
4306
/* Free the extent_buffer structure itself; its pages must already be gone. */
static void __free_extent_buffer(struct extent_buffer *eb)

{
	btrfs_leak_debug_del(&eb->leak_list);
	kmem_cache_free(extent_buffer_cache, eb);
}
4312
4313int extent_buffer_under_io(struct extent_buffer *eb)
4314{
4315	return (atomic_read(&eb->io_pages) ||
4316		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4317		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4318}
4319
4320/*
4321 * Helper for releasing extent buffer page.
4322 */
4323static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4324						unsigned long start_idx)
4325{
4326	unsigned long index;
4327	unsigned long num_pages;
4328	struct page *page;
4329	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4330
4331	BUG_ON(extent_buffer_under_io(eb));
4332
4333	num_pages = num_extent_pages(eb->start, eb->len);
4334	index = start_idx + num_pages;
4335	if (start_idx >= index)
4336		return;
4337
4338	do {
4339		index--;
4340		page = extent_buffer_page(eb, index);
4341		if (page && mapped) {
4342			spin_lock(&page->mapping->private_lock);
4343			/*
4344			 * We do this since we'll remove the pages after we've
4345			 * removed the eb from the radix tree, so we could race
4346			 * and have this page now attached to the new eb.  So
4347			 * only clear page_private if it's still connected to
4348			 * this eb.
4349			 */
4350			if (PagePrivate(page) &&
4351			    page->private == (unsigned long)eb) {
4352				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4353				BUG_ON(PageDirty(page));
4354				BUG_ON(PageWriteback(page));
4355				/*
4356				 * We need to make sure we haven't be attached
4357				 * to a new eb.
4358				 */
4359				ClearPagePrivate(page);
4360				set_page_private(page, 0);
4361				/* One for the page private */
4362				page_cache_release(page);
4363			}
4364			spin_unlock(&page->mapping->private_lock);
4365
4366		}
4367		if (page) {
4368			/* One for when we alloced the page */
4369			page_cache_release(page);
4370		}
4371	} while (index != start_idx);
4372}
4373
4374/*
4375 * Helper for releasing the extent buffer.
4376 */
4377static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4378{
4379	btrfs_release_extent_buffer_page(eb, 0);
4380	__free_extent_buffer(eb);
4381}
4382
4383static struct extent_buffer *
4384__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4385		      unsigned long len, gfp_t mask)
4386{
4387	struct extent_buffer *eb = NULL;
 
 
 
4388
4389	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
4390	if (eb == NULL)
4391		return NULL;
4392	eb->start = start;
4393	eb->len = len;
4394	eb->fs_info = fs_info;
4395	eb->bflags = 0;
4396	rwlock_init(&eb->lock);
4397	atomic_set(&eb->write_locks, 0);
4398	atomic_set(&eb->read_locks, 0);
4399	atomic_set(&eb->blocking_readers, 0);
4400	atomic_set(&eb->blocking_writers, 0);
4401	atomic_set(&eb->spinning_readers, 0);
4402	atomic_set(&eb->spinning_writers, 0);
4403	eb->lock_nested = 0;
4404	init_waitqueue_head(&eb->write_lock_wq);
4405	init_waitqueue_head(&eb->read_lock_wq);
4406
4407	btrfs_leak_debug_add(&eb->leak_list, &buffers);
4408
 
 
 
4409	spin_lock_init(&eb->refs_lock);
4410	atomic_set(&eb->refs, 1);
4411	atomic_set(&eb->io_pages, 0);
4412
4413	/*
4414	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4415	 */
4416	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4417		> MAX_INLINE_EXTENT_BUFFER_SIZE);
4418	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
 
 
 
 
 
 
4419
4420	return eb;
4421}
4422
4423struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4424{
4425	unsigned long i;
4426	struct page *p;
4427	struct extent_buffer *new;
4428	unsigned long num_pages = num_extent_pages(src->start, src->len);
4429
4430	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
4431	if (new == NULL)
4432		return NULL;
4433
4434	for (i = 0; i < num_pages; i++) {
4435		p = alloc_page(GFP_NOFS);
4436		if (!p) {
4437			btrfs_release_extent_buffer(new);
4438			return NULL;
4439		}
4440		attach_extent_buffer_page(new, p);
4441		WARN_ON(PageDirty(p));
4442		SetPageUptodate(p);
4443		new->pages[i] = p;
4444	}
4445
4446	copy_extent_buffer(new, src, 0, 0, src->len);
4447	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4448	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4449
4450	return new;
4451}
4452
4453struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4454{
4455	struct extent_buffer *eb;
4456	unsigned long num_pages = num_extent_pages(0, len);
4457	unsigned long i;
4458
4459	eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
4460	if (!eb)
4461		return NULL;
4462
4463	for (i = 0; i < num_pages; i++) {
4464		eb->pages[i] = alloc_page(GFP_NOFS);
4465		if (!eb->pages[i])
4466			goto err;
4467	}
4468	set_extent_buffer_uptodate(eb);
4469	btrfs_set_header_nritems(eb, 0);
4470	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4471
4472	return eb;
4473err:
4474	for (; i > 0; i--)
4475		__free_page(eb->pages[i - 1]);
4476	__free_extent_buffer(eb);
4477	return NULL;
4478}
4479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Ensure EXTENT_BUFFER_TREE_REF is set (with its ref) while @eb is in use. */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.   Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.  If someone
	 * beat us to it, drop the ref we added.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);


}
4512
4513static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4514{
4515	unsigned long num_pages, i;
4516
4517	check_buffer_tree_ref(eb);
4518
4519	num_pages = num_extent_pages(eb->start, eb->len);
4520	for (i = 0; i < num_pages; i++) {
4521		struct page *p = extent_buffer_page(eb, i);
4522		mark_page_accessed(p);
4523	}
4524}
4525
/*
 * Look up the extent buffer at @start in the fs_info radix tree.
 * On success a reference is taken and the eb is returned; NULL when
 * no buffer is cached there (or it is being torn down).
 */
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 start)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&fs_info->buffer_radix,
			       start >> PAGE_CACHE_SHIFT);
	/* inc_not_zero guards against racing with the final ref drop */
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}
4543
/*
 * Find or create the extent buffer covering [start, start + len) in the
 * btree inode's page cache.  Returns the eb with a reference held — the
 * pre-existing one if another thread beat us to the radix tree insert —
 * or NULL on allocation failure.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, unsigned long len)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	/* fast path: someone already cached this buffer */
	eb = find_extent_buffer(fs_info, start);
	if (eb)



		return eb;



	eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p)

			goto free_eb;


		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so lets see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can just
			 * overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				page_cache_release(p);
				mark_extent_buffer_accessed(exists);
				goto free_eb;
			}

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			page_cache_release(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		mark_page_accessed(p);
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> PAGE_CACHE_SHIFT, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/* lost the insert race: take the winner's eb if still live */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else


			goto again;





	}
	/* add one reference for the tree */

	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);



	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = extent_buffer_page(eb, i);
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	WARN_ON(!atomic_dec_and_test(&eb->refs));
	btrfs_release_extent_buffer(eb);
	return exists;
}
4660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* RCU callback: free the eb struct once the grace period has passed. */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}
4668
/*
 * Drop one reference on @eb.  Expects eb->refs_lock to be held on
 * entry; it is released before returning.  Returns 1 when this was the
 * last reference and the buffer was torn down, 0 otherwise.
 */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		/* last ref: unlink from the radix tree if still linked */
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;


			spin_unlock(&eb->refs_lock);

			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> PAGE_CACHE_SHIFT);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb, 0);

		/* the struct itself is freed after an RCU grace period */
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}
4696
/*
 * Drop one reference on @eb.  Fast path: while well above the baseline
 * ref count, drop locklessly with cmpxchg; otherwise take refs_lock and
 * let release_extent_buffer() decide whether this was the last ref.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if (refs <= 3)
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
		/* raced with a concurrent ref change: re-read and retry */
	}

	spin_lock(&eb->refs_lock);
	/* dummy ebs hold no tree ref; shed the surplus ref here */
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	/* stale, idle ebs give up the tree's ref as well */
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}
4730
/*
 * Mark @eb stale and drop a reference; additionally drops the tree's
 * ref when the buffer is idle so a stale eb doesn't linger in the
 * radix tree.
 */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}
4744
/*
 * Clear the dirty state of every page backing @eb, including the page
 * cache's radix-tree dirty tag, so writeback won't pick them up.
 */
void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		/* clear the radix tree dirty tag under the mapping lock */
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}
4774
4775int set_extent_buffer_dirty(struct extent_buffer *eb)
4776{
4777	unsigned long i;
4778	unsigned long num_pages;
4779	int was_dirty = 0;
4780
4781	check_buffer_tree_ref(eb);
4782
4783	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4784
4785	num_pages = num_extent_pages(eb->start, eb->len);
4786	WARN_ON(atomic_read(&eb->refs) == 0);
4787	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4788
4789	for (i = 0; i < num_pages; i++)
4790		set_page_dirty(extent_buffer_page(eb, i));
4791	return was_dirty;
4792}
4793
 
 
 
 
 
 
 
 
 
 
 
4794int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4795{
4796	unsigned long i;
4797	struct page *page;
4798	unsigned long num_pages;
4799
4800	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4801	num_pages = num_extent_pages(eb->start, eb->len);
4802	for (i = 0; i < num_pages; i++) {
4803		page = extent_buffer_page(eb, i);
4804		if (page)
4805			ClearPageUptodate(page);
4806	}
4807	return 0;
4808}
4809
4810int set_extent_buffer_uptodate(struct extent_buffer *eb)
4811{
4812	unsigned long i;
4813	struct page *page;
4814	unsigned long num_pages;
4815
4816	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4817	num_pages = num_extent_pages(eb->start, eb->len);
4818	for (i = 0; i < num_pages; i++) {
4819		page = extent_buffer_page(eb, i);
4820		SetPageUptodate(page);
4821	}
4822	return 0;
4823}
4824
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Return non-zero if the whole extent buffer is flagged uptodate. */
int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
4829
/*
 * Read the backing pages of @eb from disk if they are not already uptodate.
 *
 * @start:      when non-zero, only pages from this byte offset onward are
 *              read (must not be below eb->start)
 * @wait:       WAIT_NONE for a non-blocking attempt (uses trylock_page),
 *              WAIT_COMPLETE to block until all reads finish
 * @mirror_num: mirror copy to read from
 *
 * Returns 0 on success, -EIO if a page failed to become uptodate when
 * waiting, or the error from page read / bio submission.
 */
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, u64 start, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	unsigned long start_i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	/* fast path: whole buffer already uptodate, nothing to read */
	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	/* translate the byte offset into a starting page index */
	if (start) {
		WARN_ON(start < eb->start);
		start_i = (start >> PAGE_CACHE_SHIFT) -
			(eb->start >> PAGE_CACHE_SHIFT);
	} else {
		start_i = 0;
	}

	/* lock every page we will examine and count the ones needing I/O */
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		/* only flag the buffer uptodate when we checked all pages */
		if (start_i == 0)
			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = 0;
	/* io_pages tracks outstanding reads — presumably decremented by the
	 * read endio path; confirm against the endio handlers */
	atomic_set(&eb->io_pages, num_reads);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      READ | REQ_META);
			if (err)
				ret = err;
		} else {
			/* page already uptodate: no read, just unlock it */
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
				     bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* wait for each page to be unlocked by I/O completion, then verify */
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	/* undo the page locks taken above */
	i = start_i;
	while (locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}
4925
4926void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4927			unsigned long start,
4928			unsigned long len)
4929{
4930	size_t cur;
4931	size_t offset;
4932	struct page *page;
4933	char *kaddr;
4934	char *dst = (char *)dstv;
4935	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4936	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4937
4938	WARN_ON(start > eb->len);
4939	WARN_ON(start + len > eb->start + eb->len);
4940
4941	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
4942
4943	while (len > 0) {
4944		page = extent_buffer_page(eb, i);
4945
4946		cur = min(len, (PAGE_CACHE_SIZE - offset));
4947		kaddr = page_address(page);
4948		memcpy(dst, kaddr + offset, cur);
4949
4950		dst += cur;
4951		len -= cur;
4952		offset = 0;
4953		i++;
4954	}
4955}
4956
4957int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4958			       unsigned long min_len, char **map,
4959			       unsigned long *map_start,
4960			       unsigned long *map_len)
4961{
4962	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4963	char *kaddr;
4964	struct page *p;
4965	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4966	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4967	unsigned long end_i = (start_offset + start + min_len - 1) >>
4968		PAGE_CACHE_SHIFT;
4969
4970	if (i != end_i)
4971		return -EINVAL;
4972
4973	if (i == 0) {
4974		offset = start_offset;
4975		*map_start = 0;
4976	} else {
4977		offset = 0;
4978		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4979	}
4980
4981	if (start + min_len > eb->len) {
4982		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4983		       "wanted %lu %lu\n",
4984		       eb->start, eb->len, start, min_len);
 
4985		return -EINVAL;
4986	}
4987
4988	p = extent_buffer_page(eb, i);
4989	kaddr = page_address(p);
4990	*map = kaddr + offset;
4991	*map_len = PAGE_CACHE_SIZE - offset;
4992	return 0;
4993}
4994
4995int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4996			  unsigned long start,
4997			  unsigned long len)
4998{
4999	size_t cur;
5000	size_t offset;
5001	struct page *page;
5002	char *kaddr;
5003	char *ptr = (char *)ptrv;
5004	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5005	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5006	int ret = 0;
5007
5008	WARN_ON(start > eb->len);
5009	WARN_ON(start + len > eb->start + eb->len);
5010
5011	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5012
5013	while (len > 0) {
5014		page = extent_buffer_page(eb, i);
5015
5016		cur = min(len, (PAGE_CACHE_SIZE - offset));
5017
5018		kaddr = page_address(page);
5019		ret = memcmp(ptr, kaddr + offset, cur);
5020		if (ret)
5021			break;
5022
5023		ptr += cur;
5024		len -= cur;
5025		offset = 0;
5026		i++;
5027	}
5028	return ret;
5029}
5030
5031void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5032			 unsigned long start, unsigned long len)
5033{
5034	size_t cur;
5035	size_t offset;
5036	struct page *page;
5037	char *kaddr;
5038	char *src = (char *)srcv;
5039	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5040	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5041
5042	WARN_ON(start > eb->len);
5043	WARN_ON(start + len > eb->start + eb->len);
5044
5045	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5046
5047	while (len > 0) {
5048		page = extent_buffer_page(eb, i);
5049		WARN_ON(!PageUptodate(page));
5050
5051		cur = min(len, PAGE_CACHE_SIZE - offset);
5052		kaddr = page_address(page);
5053		memcpy(kaddr + offset, src, cur);
5054
5055		src += cur;
5056		len -= cur;
5057		offset = 0;
5058		i++;
5059	}
5060}
5061
5062void memset_extent_buffer(struct extent_buffer *eb, char c,
5063			  unsigned long start, unsigned long len)
5064{
5065	size_t cur;
5066	size_t offset;
5067	struct page *page;
5068	char *kaddr;
5069	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5070	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5071
5072	WARN_ON(start > eb->len);
5073	WARN_ON(start + len > eb->start + eb->len);
5074
5075	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5076
5077	while (len > 0) {
5078		page = extent_buffer_page(eb, i);
5079		WARN_ON(!PageUptodate(page));
5080
5081		cur = min(len, PAGE_CACHE_SIZE - offset);
5082		kaddr = page_address(page);
5083		memset(kaddr + offset, c, cur);
5084
5085		len -= cur;
5086		offset = 0;
5087		i++;
5088	}
5089}
5090
5091void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5092			unsigned long dst_offset, unsigned long src_offset,
5093			unsigned long len)
5094{
5095	u64 dst_len = dst->len;
5096	size_t cur;
5097	size_t offset;
5098	struct page *page;
5099	char *kaddr;
5100	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5101	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
5102
5103	WARN_ON(src->len != dst_len);
5104
5105	offset = (start_offset + dst_offset) &
5106		(PAGE_CACHE_SIZE - 1);
5107
5108	while (len > 0) {
5109		page = extent_buffer_page(dst, i);
5110		WARN_ON(!PageUptodate(page));
5111
5112		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
5113
5114		kaddr = page_address(page);
5115		read_extent_buffer(src, kaddr + offset, src_offset, cur);
5116
5117		src_offset += cur;
5118		len -= cur;
5119		offset = 0;
5120		i++;
5121	}
5122}
5123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Return true when two ranges of length @len, one starting at @src and
 * one at @dst, overlap each other.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long lo = (src < dst) ? src : dst;
	unsigned long hi = (src < dst) ? dst : src;

	return hi - lo < len;
}
5129
/*
 * Copy @len bytes between two pages (which may be the same page), using
 * memmove() only when both offsets fall in the same page and the ranges
 * overlap; distinct pages can never overlap, so memcpy() is safe there.
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr = (dst_page == src_page) ? dst_kaddr
						 : page_address(src_page);

	if (dst_page == src_page && areas_overlap(src_off, dst_off, len))
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
5151
/*
 * Copy @len bytes within @dst from @src_offset to @dst_offset, walking
 * forward page by page.  Both ranges must lie inside the buffer; a bogus
 * range is a fatal error (BUG_ON).  Overlap within a single page is
 * handled by copy_pages(); for overlapping ranges where the destination
 * is above the source, use memmove_extent_buffer() instead.
 */
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		/* locate source and destination positions within their pages */
		dst_off_in_page = (start_offset + dst_offset) &
			(PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			(PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		/* copy at most up to whichever page boundary comes first */
		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}
5196
/*
 * Move @len bytes within @dst from @src_offset to @dst_offset, handling
 * overlapping ranges correctly.  When the destination is below the source
 * a simple forward copy is safe; otherwise the copy proceeds backwards
 * from the end of both ranges.  Bogus ranges are fatal (BUG_ON).
 */
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
		       "len %lu len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
		       "len %lu len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	/* forward copy is safe when the destination precedes the source */
	if (dst_offset < src_offset) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	/* otherwise copy backwards, chunk by chunk, from the range ends */
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			(PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			(PAGE_CACHE_SIZE - 1);

		/* limit the chunk so neither end crosses a page boundary */
		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}
5244
/*
 * Attempt to release @page from the extent buffer that owns it.
 *
 * Returns 1 if the page can be released (no extent buffer is attached, or
 * the buffer's last tree ref was dropped and the buffer freed), 0 if the
 * buffer is still in use.
 */
int try_release_extent_buffer(struct page *page)
{
	struct extent_buffer *eb;

	/*
	 * We need to make sure nobody is attaching this page to an eb right
	 * now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);



	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	/* NOTE(review): eb->refs_lock is still held here — presumably
	 * released inside release_extent_buffer(); confirm */
	return release_extent_buffer(eb);
}