buffer.c - fs/buffer.c - Linux diff v4.6 - Bootlin Elixir Cross Referencer

   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/export.h>
  33#include <linux/backing-dev.h>
  34#include <linux/writeback.h>
  35#include <linux/hash.h>
  36#include <linux/suspend.h>
  37#include <linux/buffer_head.h>
  38#include <linux/task_io_accounting_ops.h>
  39#include <linux/bio.h>
  40#include <linux/notifier.h>
  41#include <linux/cpu.h>
  42#include <linux/bitops.h>
  43#include <linux/mpage.h>
  44#include <linux/bit_spinlock.h>
  45#include <trace/events/block.h>
  46
  47static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  48static int submit_bh_wbc(int rw, struct buffer_head *bh,
  49			 unsigned long bio_flags,
  50			 struct writeback_control *wbc);
  51
  52#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  53
  54void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 
  55{
  56	bh->b_end_io = handler;
  57	bh->b_private = private;
  58}
  59EXPORT_SYMBOL(init_buffer);
  60
  61inline void touch_buffer(struct buffer_head *bh)
  62{
  63	trace_block_touch_buffer(bh);
  64	mark_page_accessed(bh->b_page);
  65}
  66EXPORT_SYMBOL(touch_buffer);
  67
  68void __lock_buffer(struct buffer_head *bh)
  69{
  70	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 
  71}
  72EXPORT_SYMBOL(__lock_buffer);
  73
  74void unlock_buffer(struct buffer_head *bh)
  75{
  76	clear_bit_unlock(BH_Lock, &bh->b_state);
  77	smp_mb__after_atomic();
  78	wake_up_bit(&bh->b_state, BH_Lock);
  79}
  80EXPORT_SYMBOL(unlock_buffer);
  81
  82/*
  83 * Returns if the page has dirty or writeback buffers. If all the buffers
  84 * are unlocked and clean then the PageDirty information is stale. If
  85 * any of the pages are locked, it is assumed they are locked for IO.
  86 */
  87void buffer_check_dirty_writeback(struct page *page,
  88				     bool *dirty, bool *writeback)
  89{
  90	struct buffer_head *head, *bh;
  91	*dirty = false;
  92	*writeback = false;
  93
  94	BUG_ON(!PageLocked(page));
  95
  96	if (!page_has_buffers(page))
  97		return;
  98
  99	if (PageWriteback(page))
 100		*writeback = true;
 101
 102	head = page_buffers(page);
 103	bh = head;
 104	do {
 105		if (buffer_locked(bh))
 106			*writeback = true;
 107
 108		if (buffer_dirty(bh))
 109			*dirty = true;
 110
 111		bh = bh->b_this_page;
 112	} while (bh != head);
 113}
 114EXPORT_SYMBOL(buffer_check_dirty_writeback);
 115
 116/*
 117 * Block until a buffer comes unlocked.  This doesn't stop it
 118 * from becoming locked again - you have to lock it yourself
 119 * if you want to preserve its state.
 120 */
 121void __wait_on_buffer(struct buffer_head * bh)
 122{
 123	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 124}
 125EXPORT_SYMBOL(__wait_on_buffer);
 126
 127static void
 128__clear_page_buffers(struct page *page)
 129{
 130	ClearPagePrivate(page);
 131	set_page_private(page, 0);
 132	put_page(page);
 133}
 134
 135static void buffer_io_error(struct buffer_head *bh, char *msg)
 
 136{
 137	if (!test_bit(BH_Quiet, &bh->b_state))
 138		printk_ratelimited(KERN_ERR
 139			"Buffer I/O error on dev %pg, logical block %llu%s\n",
 140			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 
 
 
 
 
 
 
 
 141}
 142
 143/*
 144 * End-of-IO handler helper function which does not touch the bh after
 145 * unlocking it.
 146 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 147 * a race there is benign: unlock_buffer() only use the bh's address for
 148 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 149 * itself.
 150 */
 151static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 152{
 153	if (uptodate) {
 154		set_buffer_uptodate(bh);
 155	} else {
 156		/* This happens, due to failed READA attempts. */
 157		clear_buffer_uptodate(bh);
 158	}
 159	unlock_buffer(bh);
 160}
 161
 162/*
 163 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 164 * unlock the buffer. This is what ll_rw_block uses too.
 165 */
 166void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 167{
 168	__end_buffer_read_notouch(bh, uptodate);
 169	put_bh(bh);
 170}
 171EXPORT_SYMBOL(end_buffer_read_sync);
 172
 173void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 174{
 
 
 175	if (uptodate) {
 176		set_buffer_uptodate(bh);
 177	} else {
 178		buffer_io_error(bh, ", lost sync page write");
 
 
 
 
 
 179		set_buffer_write_io_error(bh);
 180		clear_buffer_uptodate(bh);
 181	}
 182	unlock_buffer(bh);
 183	put_bh(bh);
 184}
 185EXPORT_SYMBOL(end_buffer_write_sync);
 186
 187/*
 188 * Various filesystems appear to want __find_get_block to be non-blocking.
 189 * But it's the page lock which protects the buffers.  To get around this,
 190 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 191 * private_lock.
 192 *
 193 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 194 * may be quite high.  This code could TryLock the page, and if that
 195 * succeeds, there is no need to take private_lock. (But if
 196 * private_lock is contended then so is mapping->tree_lock).
 197 */
 198static struct buffer_head *
 199__find_get_block_slow(struct block_device *bdev, sector_t block)
 200{
 201	struct inode *bd_inode = bdev->bd_inode;
 202	struct address_space *bd_mapping = bd_inode->i_mapping;
 203	struct buffer_head *ret = NULL;
 204	pgoff_t index;
 205	struct buffer_head *bh;
 206	struct buffer_head *head;
 207	struct page *page;
 208	int all_mapped = 1;
 209
 210	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 211	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 212	if (!page)
 213		goto out;
 214
 215	spin_lock(&bd_mapping->private_lock);
 216	if (!page_has_buffers(page))
 217		goto out_unlock;
 218	head = page_buffers(page);
 219	bh = head;
 220	do {
 221		if (!buffer_mapped(bh))
 222			all_mapped = 0;
 223		else if (bh->b_blocknr == block) {
 224			ret = bh;
 225			get_bh(bh);
 226			goto out_unlock;
 227		}
 228		bh = bh->b_this_page;
 229	} while (bh != head);
 230
 231	/* we might be here because some of the buffers on this page are
 232	 * not mapped.  This is due to various races between
 233	 * file io on the block device and getblk.  It gets dealt with
 234	 * elsewhere, don't buffer_error if we had some unmapped buffers
 235	 */
 236	if (all_mapped) {
 237		printk("__find_get_block_slow() failed. "
 238			"block=%llu, b_blocknr=%llu\n",
 239			(unsigned long long)block,
 240			(unsigned long long)bh->b_blocknr);
 241		printk("b_state=0x%08lx, b_size=%zu\n",
 242			bh->b_state, bh->b_size);
 243		printk("device %pg blocksize: %d\n", bdev,
 244			1 << bd_inode->i_blkbits);
 245	}
 246out_unlock:
 247	spin_unlock(&bd_mapping->private_lock);
 248	put_page(page);
 249out:
 250	return ret;
 251}
 252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 253/*
 254 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 255 */
 256static void free_more_memory(void)
 257{
 258	struct zone *zone;
 259	int nid;
 260
 261	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 262	yield();
 263
 264	for_each_online_node(nid) {
 265		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 266						gfp_zone(GFP_NOFS), NULL,
 267						&zone);
 268		if (zone)
 269			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 270						GFP_NOFS, NULL);
 271	}
 272}
 273
 274/*
 275 * I/O completion handler for block_read_full_page() - pages
 276 * which come unlocked at the end of I/O.
 277 */
 278static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 279{
 280	unsigned long flags;
 281	struct buffer_head *first;
 282	struct buffer_head *tmp;
 283	struct page *page;
 284	int page_uptodate = 1;
 285
 286	BUG_ON(!buffer_async_read(bh));
 287
 288	page = bh->b_page;
 289	if (uptodate) {
 290		set_buffer_uptodate(bh);
 291	} else {
 292		clear_buffer_uptodate(bh);
 293		buffer_io_error(bh, ", async page read");
 
 294		SetPageError(page);
 295	}
 296
 297	/*
 298	 * Be _very_ careful from here on. Bad things can happen if
 299	 * two buffer heads end IO at almost the same time and both
 300	 * decide that the page is now completely done.
 301	 */
 302	first = page_buffers(page);
 303	local_irq_save(flags);
 304	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 305	clear_buffer_async_read(bh);
 306	unlock_buffer(bh);
 307	tmp = bh;
 308	do {
 309		if (!buffer_uptodate(tmp))
 310			page_uptodate = 0;
 311		if (buffer_async_read(tmp)) {
 312			BUG_ON(!buffer_locked(tmp));
 313			goto still_busy;
 314		}
 315		tmp = tmp->b_this_page;
 316	} while (tmp != bh);
 317	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 318	local_irq_restore(flags);
 319
 320	/*
 321	 * If none of the buffers had errors and they are all
 322	 * uptodate then we can set the page uptodate.
 323	 */
 324	if (page_uptodate && !PageError(page))
 325		SetPageUptodate(page);
 326	unlock_page(page);
 327	return;
 328
 329still_busy:
 330	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 331	local_irq_restore(flags);
 332	return;
 333}
 334
 335/*
 336 * Completion handler for block_write_full_page() - pages which are unlocked
 337 * during I/O, and which have PageWriteback cleared upon I/O completion.
 338 */
 339void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 340{
 
 341	unsigned long flags;
 342	struct buffer_head *first;
 343	struct buffer_head *tmp;
 344	struct page *page;
 345
 346	BUG_ON(!buffer_async_write(bh));
 347
 348	page = bh->b_page;
 349	if (uptodate) {
 350		set_buffer_uptodate(bh);
 351	} else {
 352		buffer_io_error(bh, ", lost async page write");
 
 
 
 
 
 353		set_bit(AS_EIO, &page->mapping->flags);
 354		set_buffer_write_io_error(bh);
 355		clear_buffer_uptodate(bh);
 356		SetPageError(page);
 357	}
 358
 359	first = page_buffers(page);
 360	local_irq_save(flags);
 361	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 362
 363	clear_buffer_async_write(bh);
 364	unlock_buffer(bh);
 365	tmp = bh->b_this_page;
 366	while (tmp != bh) {
 367		if (buffer_async_write(tmp)) {
 368			BUG_ON(!buffer_locked(tmp));
 369			goto still_busy;
 370		}
 371		tmp = tmp->b_this_page;
 372	}
 373	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 374	local_irq_restore(flags);
 375	end_page_writeback(page);
 376	return;
 377
 378still_busy:
 379	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 380	local_irq_restore(flags);
 381	return;
 382}
 383EXPORT_SYMBOL(end_buffer_async_write);
 384
 385/*
 386 * If a page's buffers are under async readin (end_buffer_async_read
 387 * completion) then there is a possibility that another thread of
 388 * control could lock one of the buffers after it has completed
 389 * but while some of the other buffers have not completed.  This
 390 * locked buffer would confuse end_buffer_async_read() into not unlocking
 391 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 392 * that this buffer is not under async I/O.
 393 *
 394 * The page comes unlocked when it has no locked buffer_async buffers
 395 * left.
 396 *
 397 * PageLocked prevents anyone starting new async I/O reads any of
 398 * the buffers.
 399 *
 400 * PageWriteback is used to prevent simultaneous writeout of the same
 401 * page.
 402 *
 403 * PageLocked prevents anyone from starting writeback of a page which is
 404 * under read I/O (PageWriteback is only ever set against a locked page).
 405 */
 406static void mark_buffer_async_read(struct buffer_head *bh)
 407{
 408	bh->b_end_io = end_buffer_async_read;
 409	set_buffer_async_read(bh);
 410}
 411
 412static void mark_buffer_async_write_endio(struct buffer_head *bh,
 413					  bh_end_io_t *handler)
 414{
 415	bh->b_end_io = handler;
 416	set_buffer_async_write(bh);
 417}
 418
 419void mark_buffer_async_write(struct buffer_head *bh)
 420{
 421	mark_buffer_async_write_endio(bh, end_buffer_async_write);
 422}
 423EXPORT_SYMBOL(mark_buffer_async_write);
 424
 425
 426/*
 427 * fs/buffer.c contains helper functions for buffer-backed address space's
 428 * fsync functions.  A common requirement for buffer-based filesystems is
 429 * that certain data from the backing blockdev needs to be written out for
 430 * a successful fsync().  For example, ext2 indirect blocks need to be
 431 * written back and waited upon before fsync() returns.
 432 *
  433 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 434 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 435 * management of a list of dependent buffers at ->i_mapping->private_list.
 436 *
 437 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 438 * from their controlling inode's queue when they are being freed.  But
 439 * try_to_free_buffers() will be operating against the *blockdev* mapping
 440 * at the time, not against the S_ISREG file which depends on those buffers.
 441 * So the locking for private_list is via the private_lock in the address_space
 442 * which backs the buffers.  Which is different from the address_space 
 443 * against which the buffers are listed.  So for a particular address_space,
 444 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 445 * mapping->private_list will always be protected by the backing blockdev's
 446 * ->private_lock.
 447 *
 448 * Which introduces a requirement: all buffers on an address_space's
 449 * ->private_list must be from the same address_space: the blockdev's.
 450 *
 451 * address_spaces which do not place buffers at ->private_list via these
 452 * utility functions are free to use private_lock and private_list for
 453 * whatever they want.  The only requirement is that list_empty(private_list)
 454 * be true at clear_inode() time.
 455 *
 456 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 457 * filesystems should do that.  invalidate_inode_buffers() should just go
 458 * BUG_ON(!list_empty).
 459 *
 460 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 461 * take an address_space, not an inode.  And it should be called
 462 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 463 * queued up.
 464 *
 465 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 466 * list if it is already on a list.  Because if the buffer is on a list,
 467 * it *must* already be on the right one.  If not, the filesystem is being
 468 * silly.  This will save a ton of locking.  But first we have to ensure
 469 * that buffers are taken *off* the old inode's list when they are freed
 470 * (presumably in truncate).  That requires careful auditing of all
 471 * filesystems (do it inside bforget()).  It could also be done by bringing
 472 * b_inode back.
 473 */
 474
 475/*
 476 * The buffer's backing address_space's private_lock must be held
 477 */
 478static void __remove_assoc_queue(struct buffer_head *bh)
 479{
 480	list_del_init(&bh->b_assoc_buffers);
 481	WARN_ON(!bh->b_assoc_map);
 482	if (buffer_write_io_error(bh))
 483		set_bit(AS_EIO, &bh->b_assoc_map->flags);
 484	bh->b_assoc_map = NULL;
 485}
 486
 487int inode_has_buffers(struct inode *inode)
 488{
 489	return !list_empty(&inode->i_data.private_list);
 490}
 491
 492/*
 493 * osync is designed to support O_SYNC io.  It waits synchronously for
 494 * all already-submitted IO to complete, but does not queue any new
 495 * writes to the disk.
 496 *
 497 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 498 * you dirty the buffers, and then use osync_inode_buffers to wait for
 499 * completion.  Any other dirty buffers which are not yet queued for
 500 * write will not be flushed to disk by the osync.
 501 */
 502static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 503{
 504	struct buffer_head *bh;
 505	struct list_head *p;
 506	int err = 0;
 507
 508	spin_lock(lock);
 509repeat:
 510	list_for_each_prev(p, list) {
 511		bh = BH_ENTRY(p);
 512		if (buffer_locked(bh)) {
 513			get_bh(bh);
 514			spin_unlock(lock);
 515			wait_on_buffer(bh);
 516			if (!buffer_uptodate(bh))
 517				err = -EIO;
 518			brelse(bh);
 519			spin_lock(lock);
 520			goto repeat;
 521		}
 522	}
 523	spin_unlock(lock);
 524	return err;
 525}
 526
 527static void do_thaw_one(struct super_block *sb, void *unused)
 528{
 
 529	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 530		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
 
 531}
 532
 533static void do_thaw_all(struct work_struct *work)
 534{
 535	iterate_supers(do_thaw_one, NULL);
 536	kfree(work);
 537	printk(KERN_WARNING "Emergency Thaw complete\n");
 538}
 539
 540/**
 541 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 542 *
 543 * Used for emergency unfreeze of all filesystems via SysRq
 544 */
 545void emergency_thaw_all(void)
 546{
 547	struct work_struct *work;
 548
 549	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 550	if (work) {
 551		INIT_WORK(work, do_thaw_all);
 552		schedule_work(work);
 553	}
 554}
 555
 556/**
 557 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 558 * @mapping: the mapping which wants those buffers written
 559 *
 560 * Starts I/O against the buffers at mapping->private_list, and waits upon
 561 * that I/O.
 562 *
 563 * Basically, this is a convenience function for fsync().
 564 * @mapping is a file or directory which needs those buffers to be written for
 565 * a successful fsync().
 566 */
 567int sync_mapping_buffers(struct address_space *mapping)
 568{
 569	struct address_space *buffer_mapping = mapping->private_data;
 570
 571	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 572		return 0;
 573
 574	return fsync_buffers_list(&buffer_mapping->private_lock,
 575					&mapping->private_list);
 576}
 577EXPORT_SYMBOL(sync_mapping_buffers);
 578
 579/*
 580 * Called when we've recently written block `bblock', and it is known that
 581 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 582 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 583 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 584 */
 585void write_boundary_block(struct block_device *bdev,
 586			sector_t bblock, unsigned blocksize)
 587{
 588	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 589	if (bh) {
 590		if (buffer_dirty(bh))
 591			ll_rw_block(WRITE, 1, &bh);
 592		put_bh(bh);
 593	}
 594}
 595
 596void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 597{
 598	struct address_space *mapping = inode->i_mapping;
 599	struct address_space *buffer_mapping = bh->b_page->mapping;
 600
 601	mark_buffer_dirty(bh);
 602	if (!mapping->private_data) {
 603		mapping->private_data = buffer_mapping;
 604	} else {
 605		BUG_ON(mapping->private_data != buffer_mapping);
 606	}
 607	if (!bh->b_assoc_map) {
 608		spin_lock(&buffer_mapping->private_lock);
 609		list_move_tail(&bh->b_assoc_buffers,
 610				&mapping->private_list);
 611		bh->b_assoc_map = mapping;
 612		spin_unlock(&buffer_mapping->private_lock);
 613	}
 614}
 615EXPORT_SYMBOL(mark_buffer_dirty_inode);
 616
 617/*
 618 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 619 * dirty.
 620 *
 621 * If warn is true, then emit a warning if the page is not uptodate and has
 622 * not been truncated.
 623 *
 624 * The caller must hold lock_page_memcg().
 625 */
 626static void __set_page_dirty(struct page *page, struct address_space *mapping,
 627			     int warn)
 628{
 629	unsigned long flags;
 630
 631	spin_lock_irqsave(&mapping->tree_lock, flags);
 632	if (page->mapping) {	/* Race with truncate? */
 633		WARN_ON_ONCE(warn && !PageUptodate(page));
 634		account_page_dirtied(page, mapping);
 635		radix_tree_tag_set(&mapping->page_tree,
 636				page_index(page), PAGECACHE_TAG_DIRTY);
 637	}
 638	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 
 639}
 640
 641/*
 642 * Add a page to the dirty page list.
 643 *
 644 * It is a sad fact of life that this function is called from several places
 645 * deeply under spinlocking.  It may not sleep.
 646 *
 647 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 648 * dirty-state coherency between the page and the buffers.  It the page does
 649 * not have buffers then when they are later attached they will all be set
 650 * dirty.
 651 *
 652 * The buffers are dirtied before the page is dirtied.  There's a small race
 653 * window in which a writepage caller may see the page cleanness but not the
 654 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 655 * before the buffers, a concurrent writepage caller could clear the page dirty
 656 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 657 * page on the dirty page list.
 658 *
 659 * We use private_lock to lock against try_to_free_buffers while using the
 660 * page's buffer list.  Also use this to protect against clean buffers being
 661 * added to the page after it was set dirty.
 662 *
 663 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 664 * address_space though.
 665 */
 666int __set_page_dirty_buffers(struct page *page)
 667{
 668	int newly_dirty;
 669	struct address_space *mapping = page_mapping(page);
 670
 671	if (unlikely(!mapping))
 672		return !TestSetPageDirty(page);
 673
 674	spin_lock(&mapping->private_lock);
 675	if (page_has_buffers(page)) {
 676		struct buffer_head *head = page_buffers(page);
 677		struct buffer_head *bh = head;
 678
 679		do {
 680			set_buffer_dirty(bh);
 681			bh = bh->b_this_page;
 682		} while (bh != head);
 683	}
 684	/*
 685	 * Lock out page->mem_cgroup migration to keep PageDirty
 686	 * synchronized with per-memcg dirty page counters.
 687	 */
 688	lock_page_memcg(page);
 689	newly_dirty = !TestSetPageDirty(page);
 690	spin_unlock(&mapping->private_lock);
 691
 692	if (newly_dirty)
 693		__set_page_dirty(page, mapping, 1);
 694
 695	unlock_page_memcg(page);
 696
 697	if (newly_dirty)
 698		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 699
 700	return newly_dirty;
 701}
 702EXPORT_SYMBOL(__set_page_dirty_buffers);
 703
 704/*
 705 * Write out and wait upon a list of buffers.
 706 *
 707 * We have conflicting pressures: we want to make sure that all
 708 * initially dirty buffers get waited on, but that any subsequently
 709 * dirtied buffers don't.  After all, we don't want fsync to last
 710 * forever if somebody is actively writing to the file.
 711 *
 712 * Do this in two main stages: first we copy dirty buffers to a
 713 * temporary inode list, queueing the writes as we go.  Then we clean
 714 * up, waiting for those writes to complete.
 715 * 
 716 * During this second stage, any subsequent updates to the file may end
 717 * up refiling the buffer on the original inode's dirty list again, so
 718 * there is a chance we will end up with a buffer queued for write but
 719 * not yet completed on that list.  So, as a final cleanup we go through
 720 * the osync code to catch these locked, dirty buffers without requeuing
 721 * any newly dirty buffers for write.
 722 */
 723static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 724{
 725	struct buffer_head *bh;
 726	struct list_head tmp;
 727	struct address_space *mapping;
 728	int err = 0, err2;
 729	struct blk_plug plug;
 730
 731	INIT_LIST_HEAD(&tmp);
 732	blk_start_plug(&plug);
 733
 734	spin_lock(lock);
 735	while (!list_empty(list)) {
 736		bh = BH_ENTRY(list->next);
 737		mapping = bh->b_assoc_map;
 738		__remove_assoc_queue(bh);
 739		/* Avoid race with mark_buffer_dirty_inode() which does
 740		 * a lockless check and we rely on seeing the dirty bit */
 741		smp_mb();
 742		if (buffer_dirty(bh) || buffer_locked(bh)) {
 743			list_add(&bh->b_assoc_buffers, &tmp);
 744			bh->b_assoc_map = mapping;
 745			if (buffer_dirty(bh)) {
 746				get_bh(bh);
 747				spin_unlock(lock);
 748				/*
 749				 * Ensure any pending I/O completes so that
 750				 * write_dirty_buffer() actually writes the
 751				 * current contents - it is a noop if I/O is
 752				 * still in flight on potentially older
 753				 * contents.
 754				 */
 755				write_dirty_buffer(bh, WRITE_SYNC);
 756
 757				/*
 758				 * Kick off IO for the previous mapping. Note
 759				 * that we will not run the very last mapping,
 760				 * wait_on_buffer() will do that for us
 761				 * through sync_buffer().
 762				 */
 763				brelse(bh);
 764				spin_lock(lock);
 765			}
 766		}
 767	}
 768
 769	spin_unlock(lock);
 770	blk_finish_plug(&plug);
 771	spin_lock(lock);
 772
 773	while (!list_empty(&tmp)) {
 774		bh = BH_ENTRY(tmp.prev);
 775		get_bh(bh);
 776		mapping = bh->b_assoc_map;
 777		__remove_assoc_queue(bh);
 778		/* Avoid race with mark_buffer_dirty_inode() which does
 779		 * a lockless check and we rely on seeing the dirty bit */
 780		smp_mb();
 781		if (buffer_dirty(bh)) {
 782			list_add(&bh->b_assoc_buffers,
 783				 &mapping->private_list);
 784			bh->b_assoc_map = mapping;
 785		}
 786		spin_unlock(lock);
 787		wait_on_buffer(bh);
 788		if (!buffer_uptodate(bh))
 789			err = -EIO;
 790		brelse(bh);
 791		spin_lock(lock);
 792	}
 793	
 794	spin_unlock(lock);
 795	err2 = osync_buffers_list(lock, list);
 796	if (err)
 797		return err;
 798	else
 799		return err2;
 800}
 801
 802/*
 803 * Invalidate any and all dirty buffers on a given inode.  We are
 804 * probably unmounting the fs, but that doesn't mean we have already
 805 * done a sync().  Just drop the buffers from the inode list.
 806 *
 807 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 808 * assumes that all the buffers are against the blockdev.  Not true
 809 * for reiserfs.
 810 */
 811void invalidate_inode_buffers(struct inode *inode)
 812{
 813	if (inode_has_buffers(inode)) {
 814		struct address_space *mapping = &inode->i_data;
 815		struct list_head *list = &mapping->private_list;
 816		struct address_space *buffer_mapping = mapping->private_data;
 817
 818		spin_lock(&buffer_mapping->private_lock);
 819		while (!list_empty(list))
 820			__remove_assoc_queue(BH_ENTRY(list->next));
 821		spin_unlock(&buffer_mapping->private_lock);
 822	}
 823}
 824EXPORT_SYMBOL(invalidate_inode_buffers);
 825
 826/*
 827 * Remove any clean buffers from the inode's buffer list.  This is called
 828 * when we're trying to free the inode itself.  Those buffers can pin it.
 829 *
 830 * Returns true if all buffers were removed.
 831 */
 832int remove_inode_buffers(struct inode *inode)
 833{
 834	int ret = 1;
 835
 836	if (inode_has_buffers(inode)) {
 837		struct address_space *mapping = &inode->i_data;
 838		struct list_head *list = &mapping->private_list;
 839		struct address_space *buffer_mapping = mapping->private_data;
 840
 841		spin_lock(&buffer_mapping->private_lock);
 842		while (!list_empty(list)) {
 843			struct buffer_head *bh = BH_ENTRY(list->next);
 844			if (buffer_dirty(bh)) {
 845				ret = 0;
 846				break;
 847			}
 848			__remove_assoc_queue(bh);
 849		}
 850		spin_unlock(&buffer_mapping->private_lock);
 851	}
 852	return ret;
 853}
 854
 855/*
 856 * Create the appropriate buffers when given a page for data area and
 857 * the size of each buffer.. Use the bh->b_this_page linked list to
 858 * follow the buffers created.  Return NULL if unable to create more
 859 * buffers.
 860 *
 861 * The retry flag is used to differentiate async IO (paging, swapping)
 862 * which may not fail from ordinary buffer allocations.
 863 */
 864struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 865		int retry)
 866{
 867	struct buffer_head *bh, *head;
 868	long offset;
 869
 870try_again:
 871	head = NULL;
 872	offset = PAGE_SIZE;
 873	while ((offset -= size) >= 0) {
 874		bh = alloc_buffer_head(GFP_NOFS);
 875		if (!bh)
 876			goto no_grow;
 877
 
 878		bh->b_this_page = head;
 879		bh->b_blocknr = -1;
 880		head = bh;
 881
 
 
 882		bh->b_size = size;
 883
 884		/* Link the buffer to its page */
 885		set_bh_page(bh, page, offset);
 
 
 886	}
 887	return head;
 888/*
 889 * In case anything failed, we just free everything we got.
 890 */
 891no_grow:
 892	if (head) {
 893		do {
 894			bh = head;
 895			head = head->b_this_page;
 896			free_buffer_head(bh);
 897		} while (head);
 898	}
 899
 900	/*
 901	 * Return failure for non-async IO requests.  Async IO requests
 902	 * are not allowed to fail, so we have to wait until buffer heads
 903	 * become available.  But we don't want tasks sleeping with 
 904	 * partially complete buffers, so all were released above.
 905	 */
 906	if (!retry)
 907		return NULL;
 908
 909	/* We're _really_ low on memory. Now we just
 910	 * wait for old buffer heads to become free due to
 911	 * finishing IO.  Since this is an async request and
 912	 * the reserve list is empty, we're sure there are 
 913	 * async buffer heads in use.
 914	 */
 915	free_more_memory();
 916	goto try_again;
 917}
 918EXPORT_SYMBOL_GPL(alloc_page_buffers);
 919
 920static inline void
 921link_dev_buffers(struct page *page, struct buffer_head *head)
 922{
 923	struct buffer_head *bh, *tail;
 924
 925	bh = head;
 926	do {
 927		tail = bh;
 928		bh = bh->b_this_page;
 929	} while (bh);
 930	tail->b_this_page = head;
 931	attach_page_buffers(page, head);
 932}
 933
 934static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 935{
 936	sector_t retval = ~((sector_t)0);
 937	loff_t sz = i_size_read(bdev->bd_inode);
 938
 939	if (sz) {
 940		unsigned int sizebits = blksize_bits(size);
 941		retval = (sz >> sizebits);
 942	}
 943	return retval;
 944}
 945
 946/*
 947 * Initialise the state of a blockdev page's buffers.
 948 */ 
 949static sector_t
 950init_page_buffers(struct page *page, struct block_device *bdev,
 951			sector_t block, int size)
 952{
 953	struct buffer_head *head = page_buffers(page);
 954	struct buffer_head *bh = head;
 955	int uptodate = PageUptodate(page);
 956	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 957
 958	do {
 959		if (!buffer_mapped(bh)) {
 960			init_buffer(bh, NULL, NULL);
 961			bh->b_bdev = bdev;
 962			bh->b_blocknr = block;
 963			if (uptodate)
 964				set_buffer_uptodate(bh);
 965			if (block < end_block)
 966				set_buffer_mapped(bh);
 967		}
 968		block++;
 969		bh = bh->b_this_page;
 970	} while (bh != head);
 971
 972	/*
 973	 * Caller needs to validate requested block against end of device.
 974	 */
 975	return end_block;
 976}
 977
 978/*
 979 * Create the page-cache page that contains the requested block.
 980 *
 981 * This is used purely for blockdev mappings.
 982 */
 983static int
 984grow_dev_page(struct block_device *bdev, sector_t block,
 985	      pgoff_t index, int size, int sizebits, gfp_t gfp)
 986{
 987	struct inode *inode = bdev->bd_inode;
 988	struct page *page;
 989	struct buffer_head *bh;
 990	sector_t end_block;
 991	int ret = 0;		/* Will call free_more_memory() */
 992	gfp_t gfp_mask;
 993
 994	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 995
 996	/*
 997	 * XXX: __getblk_slow() can not really deal with failure and
 998	 * will endlessly loop on improvised global reclaim.  Prefer
 999	 * looping in the allocator rather than here, at least that
1000	 * code knows what it's doing.
1001	 */
1002	gfp_mask |= __GFP_NOFAIL;
1003
1004	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1005	if (!page)
1006		return ret;
1007
1008	BUG_ON(!PageLocked(page));
1009
1010	if (page_has_buffers(page)) {
1011		bh = page_buffers(page);
1012		if (bh->b_size == size) {
1013			end_block = init_page_buffers(page, bdev,
1014						(sector_t)index << sizebits,
1015						size);
1016			goto done;
1017		}
1018		if (!try_to_free_buffers(page))
1019			goto failed;
1020	}
1021
1022	/*
1023	 * Allocate some buffers for this page
1024	 */
1025	bh = alloc_page_buffers(page, size, 0);
1026	if (!bh)
1027		goto failed;
1028
1029	/*
1030	 * Link the page to the buffers and initialise them.  Take the
1031	 * lock to be atomic wrt __find_get_block(), which does not
1032	 * run under the page lock.
1033	 */
1034	spin_lock(&inode->i_mapping->private_lock);
1035	link_dev_buffers(page, bh);
1036	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1037			size);
1038	spin_unlock(&inode->i_mapping->private_lock);
1039done:
1040	ret = (block < end_block) ? 1 : -ENXIO;
1041failed:
 
1042	unlock_page(page);
1043	put_page(page);
1044	return ret;
1045}
1046
1047/*
1048 * Create buffers for the specified block device block's page.  If
1049 * that page was dirty, the buffers are set dirty also.
1050 */
1051static int
1052grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1053{
 
1054	pgoff_t index;
1055	int sizebits;
1056
1057	sizebits = -1;
1058	do {
1059		sizebits++;
1060	} while ((size << sizebits) < PAGE_SIZE);
1061
1062	index = block >> sizebits;
1063
1064	/*
1065	 * Check for a block which wants to lie outside our maximum possible
1066	 * pagecache index.  (this comparison is done using sector_t types).
1067	 */
1068	if (unlikely(index != block >> sizebits)) {
 
 
1069		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1070			"device %pg\n",
1071			__func__, (unsigned long long)block,
1072			bdev);
1073		return -EIO;
1074	}
1075
1076	/* Create a page with the proper size buffers.. */
1077	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
 
 
 
 
 
1078}
1079
1080struct buffer_head *
1081__getblk_slow(struct block_device *bdev, sector_t block,
1082	     unsigned size, gfp_t gfp)
1083{
1084	/* Size must be multiple of hard sectorsize */
1085	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1086			(size < 512 || size > PAGE_SIZE))) {
1087		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1088					size);
1089		printk(KERN_ERR "logical block size: %d\n",
1090					bdev_logical_block_size(bdev));
1091
1092		dump_stack();
1093		return NULL;
1094	}
1095
1096	for (;;) {
1097		struct buffer_head *bh;
1098		int ret;
1099
1100		bh = __find_get_block(bdev, block, size);
1101		if (bh)
1102			return bh;
1103
1104		ret = grow_buffers(bdev, block, size, gfp);
1105		if (ret < 0)
1106			return NULL;
1107		if (ret == 0)
1108			free_more_memory();
1109	}
1110}
1111EXPORT_SYMBOL(__getblk_slow);
1112
1113/*
1114 * The relationship between dirty buffers and dirty pages:
1115 *
1116 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117 * the page is tagged dirty in its radix tree.
1118 *
1119 * At all times, the dirtiness of the buffers represents the dirtiness of
1120 * subsections of the page.  If the page has buffers, the page dirty bit is
1121 * merely a hint about the true dirty state.
1122 *
1123 * When a page is set dirty in its entirety, all its buffers are marked dirty
1124 * (if the page has buffers).
1125 *
1126 * When a buffer is marked dirty, its page is dirtied, but the page's other
1127 * buffers are not.
1128 *
1129 * Also.  When blockdev buffers are explicitly read with bread(), they
1130 * individually become uptodate.  But their backing page remains not
1131 * uptodate - even if all of its buffers are uptodate.  A subsequent
1132 * block_read_full_page() against that page will discover all the uptodate
1133 * buffers, will set the page uptodate and will perform no I/O.
1134 */
1135
1136/**
1137 * mark_buffer_dirty - mark a buffer_head as needing writeout
1138 * @bh: the buffer_head to mark dirty
1139 *
1140 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141 * backing page dirty, then tag the page as dirty in its address_space's radix
1142 * tree and then attach the address_space's inode to its superblock's dirty
1143 * inode list.
1144 *
1145 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1146 * mapping->tree_lock and mapping->host->i_lock.
1147 */
1148void mark_buffer_dirty(struct buffer_head *bh)
1149{
1150	WARN_ON_ONCE(!buffer_uptodate(bh));
1151
1152	trace_block_dirty_buffer(bh);
1153
1154	/*
1155	 * Very *carefully* optimize the it-is-already-dirty case.
1156	 *
1157	 * Don't let the final "is it dirty" escape to before we
1158	 * perhaps modified the buffer.
1159	 */
1160	if (buffer_dirty(bh)) {
1161		smp_mb();
1162		if (buffer_dirty(bh))
1163			return;
1164	}
1165
1166	if (!test_set_buffer_dirty(bh)) {
1167		struct page *page = bh->b_page;
1168		struct address_space *mapping = NULL;
1169
1170		lock_page_memcg(page);
1171		if (!TestSetPageDirty(page)) {
1172			mapping = page_mapping(page);
1173			if (mapping)
1174				__set_page_dirty(page, mapping, 0);
1175		}
1176		unlock_page_memcg(page);
1177		if (mapping)
1178			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1179	}
1180}
1181EXPORT_SYMBOL(mark_buffer_dirty);
1182
1183/*
1184 * Decrement a buffer_head's reference count.  If all buffers against a page
1185 * have zero reference count, are clean and unlocked, and if the page is clean
1186 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1187 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1188 * a page but it ends up not being freed, and buffers may later be reattached).
1189 */
1190void __brelse(struct buffer_head * buf)
1191{
1192	if (atomic_read(&buf->b_count)) {
1193		put_bh(buf);
1194		return;
1195	}
1196	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1197}
1198EXPORT_SYMBOL(__brelse);
1199
1200/*
1201 * bforget() is like brelse(), except it discards any
1202 * potentially dirty data.
1203 */
1204void __bforget(struct buffer_head *bh)
1205{
1206	clear_buffer_dirty(bh);
1207	if (bh->b_assoc_map) {
1208		struct address_space *buffer_mapping = bh->b_page->mapping;
1209
1210		spin_lock(&buffer_mapping->private_lock);
1211		list_del_init(&bh->b_assoc_buffers);
1212		bh->b_assoc_map = NULL;
1213		spin_unlock(&buffer_mapping->private_lock);
1214	}
1215	__brelse(bh);
1216}
1217EXPORT_SYMBOL(__bforget);
1218
1219static struct buffer_head *__bread_slow(struct buffer_head *bh)
1220{
1221	lock_buffer(bh);
1222	if (buffer_uptodate(bh)) {
1223		unlock_buffer(bh);
1224		return bh;
1225	} else {
1226		get_bh(bh);
1227		bh->b_end_io = end_buffer_read_sync;
1228		submit_bh(READ, bh);
1229		wait_on_buffer(bh);
1230		if (buffer_uptodate(bh))
1231			return bh;
1232	}
1233	brelse(bh);
1234	return NULL;
1235}
1236
/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE	16

/* One CPU's LRU: bhs[0] is the most recently used entry, NULL marks a free slot. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-cpu LRU instances, starting out empty. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * Protect the current CPU's LRU: local interrupts are disabled on SMP
 * builds, preemption is disabled otherwise.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1266
/*
 * Sanity check used by the LRU helpers before they take bh_lru_lock():
 * BUG if interrupts are currently disabled.  The check is compiled in only
 * when irqs_disabled is defined as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
1273
1274/*
1275 * The LRU management algorithm is dopey-but-simple.  Sorry.
1276 */
1277static void bh_lru_install(struct buffer_head *bh)
1278{
1279	struct buffer_head *evictee = NULL;
1280
1281	check_irqs_on();
1282	bh_lru_lock();
1283	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1284		struct buffer_head *bhs[BH_LRU_SIZE];
1285		int in;
1286		int out = 0;
1287
1288		get_bh(bh);
1289		bhs[out++] = bh;
1290		for (in = 0; in < BH_LRU_SIZE; in++) {
1291			struct buffer_head *bh2 =
1292				__this_cpu_read(bh_lrus.bhs[in]);
1293
1294			if (bh2 == bh) {
1295				__brelse(bh2);
1296			} else {
1297				if (out >= BH_LRU_SIZE) {
1298					BUG_ON(evictee != NULL);
1299					evictee = bh2;
1300				} else {
1301					bhs[out++] = bh2;
1302				}
1303			}
1304		}
1305		while (out < BH_LRU_SIZE)
1306			bhs[out++] = NULL;
1307		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1308	}
1309	bh_lru_unlock();
1310
1311	if (evictee)
1312		__brelse(evictee);
1313}
1314
1315/*
1316 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1317 */
1318static struct buffer_head *
1319lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1320{
1321	struct buffer_head *ret = NULL;
1322	unsigned int i;
1323
1324	check_irqs_on();
1325	bh_lru_lock();
1326	for (i = 0; i < BH_LRU_SIZE; i++) {
1327		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1328
1329		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1330		    bh->b_size == size) {
1331			if (i) {
1332				while (i) {
1333					__this_cpu_write(bh_lrus.bhs[i],
1334						__this_cpu_read(bh_lrus.bhs[i - 1]));
1335					i--;
1336				}
1337				__this_cpu_write(bh_lrus.bhs[0], bh);
1338			}
1339			get_bh(bh);
1340			ret = bh;
1341			break;
1342		}
1343	}
1344	bh_lru_unlock();
1345	return ret;
1346}
1347
1348/*
1349 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1350 * it in the LRU and mark it as accessed.  If it is not present then return
1351 * NULL
1352 */
1353struct buffer_head *
1354__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1355{
1356	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1357
1358	if (bh == NULL) {
1359		/* __find_get_block_slow will mark the page accessed */
1360		bh = __find_get_block_slow(bdev, block);
1361		if (bh)
1362			bh_lru_install(bh);
1363	} else
 
1364		touch_buffer(bh);
1365
1366	return bh;
1367}
1368EXPORT_SYMBOL(__find_get_block);
1369
1370/*
1371 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1372 * which corresponds to the passed block_device, block and size. The
1373 * returned buffer has its reference count incremented.
1374 *
1375 * __getblk_gfp() will lock up the machine if grow_dev_page's
1376 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
 
 
 
 
1377 */
1378struct buffer_head *
1379__getblk_gfp(struct block_device *bdev, sector_t block,
1380	     unsigned size, gfp_t gfp)
1381{
1382	struct buffer_head *bh = __find_get_block(bdev, block, size);
1383
1384	might_sleep();
1385	if (bh == NULL)
1386		bh = __getblk_slow(bdev, block, size, gfp);
1387	return bh;
1388}
1389EXPORT_SYMBOL(__getblk_gfp);
1390
1391/*
1392 * Do async read-ahead on a buffer..
1393 */
1394void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1395{
1396	struct buffer_head *bh = __getblk(bdev, block, size);
1397	if (likely(bh)) {
1398		ll_rw_block(READA, 1, &bh);
1399		brelse(bh);
1400	}
1401}
1402EXPORT_SYMBOL(__breadahead);
1403
1404/**
1405 *  __bread_gfp() - reads a specified block and returns the bh
1406 *  @bdev: the block_device to read from
1407 *  @block: number of block
1408 *  @size: size (in bytes) to read
1409 *  @gfp: page allocation flag
1410 *
1411 *  Reads a specified block, and returns buffer head that contains it.
1412 *  The page cache can be allocated from non-movable area
1413 *  not to prevent page migration if you set gfp to zero.
1414 *  It returns NULL if the block was unreadable.
1415 */
1416struct buffer_head *
1417__bread_gfp(struct block_device *bdev, sector_t block,
1418		   unsigned size, gfp_t gfp)
1419{
1420	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1421
1422	if (likely(bh) && !buffer_uptodate(bh))
1423		bh = __bread_slow(bh);
1424	return bh;
1425}
1426EXPORT_SYMBOL(__bread_gfp);
1427
1428/*
1429 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1430 * This doesn't race because it runs in each cpu either in irq
1431 * or with preempt disabled.
1432 */
1433static void invalidate_bh_lru(void *arg)
1434{
1435	struct bh_lru *b = &get_cpu_var(bh_lrus);
1436	int i;
1437
1438	for (i = 0; i < BH_LRU_SIZE; i++) {
1439		brelse(b->bhs[i]);
1440		b->bhs[i] = NULL;
1441	}
1442	put_cpu_var(bh_lrus);
1443}
1444
1445static bool has_bh_in_lru(int cpu, void *dummy)
1446{
1447	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1448	int i;
1449	
1450	for (i = 0; i < BH_LRU_SIZE; i++) {
1451		if (b->bhs[i])
1452			return 1;
1453	}
1454
1455	return 0;
1456}
1457
/*
 * Empty the buffer LRUs: run invalidate_bh_lru() on each CPU for which
 * has_bh_in_lru() reports a non-empty LRU.
 */
void invalidate_bh_lrus(void)
{
	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1463
1464void set_bh_page(struct buffer_head *bh,
1465		struct page *page, unsigned long offset)
1466{
1467	bh->b_page = page;
1468	BUG_ON(offset >= PAGE_SIZE);
1469	if (PageHighMem(page))
1470		/*
1471		 * This catches illegal uses and preserves the offset:
1472		 */
1473		bh->b_data = (char *)(0 + offset);
1474	else
1475		bh->b_data = page_address(page) + offset;
1476}
1477EXPORT_SYMBOL(set_bh_page);
1478
1479/*
1480 * Called when truncating a buffer on a page completely.
1481 */
1482
1483/* Bits that are cleared during an invalidate */
1484#define BUFFER_FLAGS_DISCARD \
1485	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1486	 1 << BH_Delay | 1 << BH_Unwritten)
1487
1488static void discard_buffer(struct buffer_head * bh)
1489{
1490	unsigned long b_state, b_state_old;
1491
1492	lock_buffer(bh);
1493	clear_buffer_dirty(bh);
1494	bh->b_bdev = NULL;
1495	b_state = bh->b_state;
1496	for (;;) {
1497		b_state_old = cmpxchg(&bh->b_state, b_state,
1498				      (b_state & ~BUFFER_FLAGS_DISCARD));
1499		if (b_state_old == b_state)
1500			break;
1501		b_state = b_state_old;
1502	}
1503	unlock_buffer(bh);
1504}
1505
1506/**
1507 * block_invalidatepage - invalidate part or all of a buffer-backed page
1508 *
1509 * @page: the page which is affected
1510 * @offset: start of the range to invalidate
1511 * @length: length of the range to invalidate
1512 *
1513 * block_invalidatepage() is called when all or part of the page has become
1514 * invalidated by a truncate operation.
1515 *
1516 * block_invalidatepage() does not have to release all buffers, but it must
1517 * ensure that no dirty buffer is left outside @offset and that no I/O
1518 * is underway against any of the blocks which are outside the truncation
1519 * point.  Because the caller is about to free (and possibly reuse) those
1520 * blocks on-disk.
1521 */
1522void block_invalidatepage(struct page *page, unsigned int offset,
1523			  unsigned int length)
1524{
1525	struct buffer_head *head, *bh, *next;
1526	unsigned int curr_off = 0;
1527	unsigned int stop = length + offset;
1528
1529	BUG_ON(!PageLocked(page));
1530	if (!page_has_buffers(page))
1531		goto out;
1532
1533	/*
1534	 * Check for overflow
1535	 */
1536	BUG_ON(stop > PAGE_SIZE || stop < length);
1537
1538	head = page_buffers(page);
1539	bh = head;
1540	do {
1541		unsigned int next_off = curr_off + bh->b_size;
1542		next = bh->b_this_page;
1543
1544		/*
1545		 * Are we still fully in range ?
1546		 */
1547		if (next_off > stop)
1548			goto out;
1549
1550		/*
1551		 * is this block fully invalidated?
1552		 */
1553		if (offset <= curr_off)
1554			discard_buffer(bh);
1555		curr_off = next_off;
1556		bh = next;
1557	} while (bh != head);
1558
1559	/*
1560	 * We release buffers only if the entire page is being invalidated.
1561	 * The get_block cached value has been unconditionally invalidated,
1562	 * so real IO is not possible anymore.
1563	 */
1564	if (offset == 0)
1565		try_to_release_page(page, 0);
1566out:
1567	return;
1568}
1569EXPORT_SYMBOL(block_invalidatepage);
1570
1571
1572/*
1573 * We attach and possibly dirty the buffers atomically wrt
1574 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1575 * is already excluded via the page lock.
1576 */
1577void create_empty_buffers(struct page *page,
1578			unsigned long blocksize, unsigned long b_state)
1579{
1580	struct buffer_head *bh, *head, *tail;
1581
1582	head = alloc_page_buffers(page, blocksize, 1);
1583	bh = head;
1584	do {
1585		bh->b_state |= b_state;
1586		tail = bh;
1587		bh = bh->b_this_page;
1588	} while (bh);
1589	tail->b_this_page = head;
1590
1591	spin_lock(&page->mapping->private_lock);
1592	if (PageUptodate(page) || PageDirty(page)) {
1593		bh = head;
1594		do {
1595			if (PageDirty(page))
1596				set_buffer_dirty(bh);
1597			if (PageUptodate(page))
1598				set_buffer_uptodate(bh);
1599			bh = bh->b_this_page;
1600		} while (bh != head);
1601	}
1602	attach_page_buffers(page, head);
1603	spin_unlock(&page->mapping->private_lock);
1604}
1605EXPORT_SYMBOL(create_empty_buffers);
1606
1607/*
1608 * We are taking a block for data and we don't want any output from any
1609 * buffer-cache aliases starting from return from that function and
1610 * until the moment when something will explicitly mark the buffer
1611 * dirty (hopefully that will not happen until we will free that block ;-)
1612 * We don't even need to mark it not-uptodate - nobody can expect
1613 * anything from a newly allocated buffer anyway. We used to used
1614 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1615 * don't want to mark the alias unmapped, for example - it would confuse
1616 * anyone who might pick it with bread() afterwards...
1617 *
1618 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1619 * be writeout I/O going on against recently-freed buffers.  We don't
1620 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1621 * only if we really need to.  That happens here.
1622 */
1623void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1624{
1625	struct buffer_head *old_bh;
1626
1627	might_sleep();
1628
1629	old_bh = __find_get_block_slow(bdev, block);
1630	if (old_bh) {
1631		clear_buffer_dirty(old_bh);
1632		wait_on_buffer(old_bh);
1633		clear_buffer_req(old_bh);
1634		__brelse(old_bh);
1635	}
1636}
1637EXPORT_SYMBOL(unmap_underlying_metadata);
1638
/*
 * Return log2 of @blocksize, as computed by ilog2().
 *
 * Size is a power-of-two in the range 512..PAGE_SIZE,
 * and the case we care about most is PAGE_SIZE.
 *
 * So this *could* possibly be written with those
 * constraints in mind (relevant mostly if some
 * architecture has a slow bit-scan instruction)
 */
static inline int block_size_bits(unsigned int blocksize)
{
	return ilog2(blocksize);
}
1651
1652static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1653{
1654	BUG_ON(!PageLocked(page));
1655
1656	if (!page_has_buffers(page))
1657		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1658	return page_buffers(page);
1659}
1660
1661/*
1662 * NOTE! All mapped/uptodate combinations are valid:
1663 *
1664 *	Mapped	Uptodate	Meaning
1665 *
1666 *	No	No		"unknown" - must do get_block()
1667 *	No	Yes		"hole" - zero-filled
1668 *	Yes	No		"allocated" - allocated on disk, not read in
1669 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1670 *
1671 * "Dirty" is valid only with the last case (mapped+uptodate).
1672 */
1673
/*
 * Write out the dirty buffers of a locked page.
 *
 * @inode:     inode the page belongs to
 * @page:      the locked page to write
 * @get_block: callback used to map dirty buffers that are unmapped or delayed
 * @wbc:       writeback control; its sync_mode selects WRITE_SYNC vs WRITE
 *             and whether buffer locking may block
 * @handler:   end_io handler installed on every buffer that is submitted
 *
 * Returns 0, or the error returned by @get_block.  In both cases the page
 * is unlocked, and writeback is ended here only if no buffer was submitted.
 *
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	unsigned int blocksize, bbits;
	int nr_underway = 0;	/* number of buffers handed to submit_bh_wbc() */
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);

	/* Buffers created here start out dirty and uptodate. */
	head = create_page_buffers(page, inode,
					(1 << BH_Dirty)|(1 << BH_Uptodate));

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	bh = head;
	blocksize = bh->b_size;
	bbits = block_size_bits(blocksize);

	/* First block of this page and last block inside i_size. */
	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
	last_block = (i_size_read(inode) - 1) >> bbits;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * Second pass: lock each mapped buffer and tag the ones that are
	 * still dirty for async write.  Note that "continue" jumps to the
	 * loop condition, which is what advances bh.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	/*
	 * Third pass: submit the tagged buffers.  bh is back at head here.
	 * The successor is read before submitting the buffer.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh_wbc(write_op, bh, 0, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	/* Record the failure on both the page and its mapping. */
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh_wbc(write_op, bh, 0, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	goto done;
}
1851
1852/*
1853 * If a page has any new buffers, zero them out here, and mark them uptodate
1854 * and dirty so they'll be written out (in order to prevent uninitialised
1855 * block data from leaking). And clear the new bit.
1856 */
1857void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1858{
1859	unsigned int block_start, block_end;
1860	struct buffer_head *head, *bh;
1861
1862	BUG_ON(!PageLocked(page));
1863	if (!page_has_buffers(page))
1864		return;
1865
1866	bh = head = page_buffers(page);
1867	block_start = 0;
1868	do {
1869		block_end = block_start + bh->b_size;
1870
1871		if (buffer_new(bh)) {
1872			if (block_end > from && block_start < to) {
1873				if (!PageUptodate(page)) {
1874					unsigned start, size;
1875
1876					start = max(from, block_start);
1877					size = min(to, block_end) - start;
1878
1879					zero_user(page, start, size);
1880					set_buffer_uptodate(bh);
1881				}
1882
1883				clear_buffer_new(bh);
1884				mark_buffer_dirty(bh);
1885			}
1886		}
1887
1888		block_start = block_end;
1889		bh = bh->b_this_page;
1890	} while (bh != head);
1891}
1892EXPORT_SYMBOL(page_zero_new_buffers);
1893
/*
 * Prepare the buffers of a locked page for a write of @len bytes at file
 * position @pos.
 *
 * @page:      the locked page that will receive the data
 * @pos:       file position of the write; only its offset within the page
 *             is used here
 * @len:       number of bytes to be written; must not extend past the page
 * @get_block: callback used to map buffers that are not yet mapped
 *
 * Buffers overlapping the write range are mapped via @get_block.  Newly
 * allocated blocks get the parts outside the write range zeroed, and
 * partially overwritten buffers that are not uptodate are read in.
 *
 * Returns 0 on success, the error from @get_block, or -EIO if a buffer that
 * was read did not become uptodate.  On error, new buffers in the write
 * range are zeroed with page_zero_new_buffers().
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	unsigned from = pos & (PAGE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	/*
	 * Buffers with reads in flight.
	 * NOTE(review): two slots rely on only the first and last block of
	 * the range being partially covered - confirm.
	 */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	head = create_page_buffers(page, inode, 0);
	blocksize = head->b_size;
	bbits = block_size_bits(blocksize);

	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

	/*
	 * Walk the buffer ring once.  "!block_start" lets the first iteration
	 * run even though bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		/* Buffer entirely outside the write range. */
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/*
				 * Zero the parts of the new block that the
				 * write will not cover.
				 */
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue; 
		}
		/*
		 * Only a partially overwritten buffer needs its old contents
		 * read in.
		 */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}
EXPORT_SYMBOL(__block_write_begin);
1975
1976static int __block_commit_write(struct inode *inode, struct page *page,
1977		unsigned from, unsigned to)
1978{
1979	unsigned block_start, block_end;
1980	int partial = 0;
1981	unsigned blocksize;
1982	struct buffer_head *bh, *head;
1983
1984	bh = head = page_buffers(page);
1985	blocksize = bh->b_size;
1986
1987	block_start = 0;
1988	do {
 
1989		block_end = block_start + blocksize;
1990		if (block_end <= from || block_start >= to) {
1991			if (!buffer_uptodate(bh))
1992				partial = 1;
1993		} else {
1994			set_buffer_uptodate(bh);
1995			mark_buffer_dirty(bh);
1996		}
1997		clear_buffer_new(bh);
1998
1999		block_start = block_end;
2000		bh = bh->b_this_page;
2001	} while (bh != head);
2002
2003	/*
2004	 * If this is a partial write which happened to make all buffers
2005	 * uptodate then we can optimize away a bogus readpage() for
2006	 * the next read(). Here we 'discover' whether the page went
2007	 * uptodate as a result of this (potentially partial) write.
2008	 */
2009	if (!partial)
2010		SetPageUptodate(page);
2011	return 0;
2012}
2013
2014/*
2015 * block_write_begin takes care of the basic task of block allocation and
2016 * bringing partial write blocks uptodate first.
2017 *
2018 * The filesystem needs to handle block truncation upon failure.
2019 */
2020int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2021		unsigned flags, struct page **pagep, get_block_t *get_block)
2022{
2023	pgoff_t index = pos >> PAGE_SHIFT;
2024	struct page *page;
2025	int status;
2026
2027	page = grab_cache_page_write_begin(mapping, index, flags);
2028	if (!page)
2029		return -ENOMEM;
2030
2031	status = __block_write_begin(page, pos, len, get_block);
2032	if (unlikely(status)) {
2033		unlock_page(page);
2034		put_page(page);
2035		page = NULL;
2036	}
2037
2038	*pagep = page;
2039	return status;
2040}
2041EXPORT_SYMBOL(block_write_begin);
2042
2043int block_write_end(struct file *file, struct address_space *mapping,
2044			loff_t pos, unsigned len, unsigned copied,
2045			struct page *page, void *fsdata)
2046{
2047	struct inode *inode = mapping->host;
2048	unsigned start;
2049
2050	start = pos & (PAGE_SIZE - 1);
2051
2052	if (unlikely(copied < len)) {
2053		/*
2054		 * The buffers that were written will now be uptodate, so we
2055		 * don't have to worry about a readpage reading them and
2056		 * overwriting a partial write. However if we have encountered
2057		 * a short write and only partially written into a buffer, it
2058		 * will not be marked uptodate, so a readpage might come in and
2059		 * destroy our partial write.
2060		 *
2061		 * Do the simplest thing, and just treat any short write to a
2062		 * non uptodate page as a zero-length write, and force the
2063		 * caller to redo the whole thing.
2064		 */
2065		if (!PageUptodate(page))
2066			copied = 0;
2067
2068		page_zero_new_buffers(page, start+copied, start+len);
2069	}
2070	flush_dcache_page(page);
2071
2072	/* This could be a short (even 0-length) commit */
2073	__block_commit_write(inode, page, start, start+copied);
2074
2075	return copied;
2076}
2077EXPORT_SYMBOL(block_write_end);
2078
2079int generic_write_end(struct file *file, struct address_space *mapping,
2080			loff_t pos, unsigned len, unsigned copied,
2081			struct page *page, void *fsdata)
2082{
2083	struct inode *inode = mapping->host;
2084	loff_t old_size = inode->i_size;
2085	int i_size_changed = 0;
2086
2087	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2088
2089	/*
2090	 * No need to use i_size_read() here, the i_size
2091	 * cannot change under us because we hold i_mutex.
2092	 *
2093	 * But it's important to update i_size while still holding page lock:
2094	 * page writeout could otherwise come in and zero beyond i_size.
2095	 */
2096	if (pos+copied > inode->i_size) {
2097		i_size_write(inode, pos+copied);
2098		i_size_changed = 1;
2099	}
2100
2101	unlock_page(page);
2102	put_page(page);
2103
2104	if (old_size < pos)
2105		pagecache_isize_extended(inode, old_size, pos);
2106	/*
2107	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2108	 * makes the holding time of page lock longer. Second, it forces lock
2109	 * ordering of page lock and transaction start for journaling
2110	 * filesystems.
2111	 */
2112	if (i_size_changed)
2113		mark_inode_dirty(inode);
2114
2115	return copied;
2116}
2117EXPORT_SYMBOL(generic_write_end);
2118
2119/*
2120 * block_is_partially_uptodate checks whether buffers within a page are
2121 * uptodate or not.
2122 *
2123 * Returns true if all buffers which correspond to a file portion
2124 * we want to read are uptodate.
2125 */
2126int block_is_partially_uptodate(struct page *page, unsigned long from,
2127					unsigned long count)
2128{
 
2129	unsigned block_start, block_end, blocksize;
2130	unsigned to;
2131	struct buffer_head *bh, *head;
2132	int ret = 1;
2133
2134	if (!page_has_buffers(page))
2135		return 0;
2136
2137	head = page_buffers(page);
2138	blocksize = head->b_size;
2139	to = min_t(unsigned, PAGE_SIZE - from, count);
2140	to = from + to;
2141	if (from < blocksize && to > PAGE_SIZE - blocksize)
2142		return 0;
2143
 
2144	bh = head;
2145	block_start = 0;
2146	do {
2147		block_end = block_start + blocksize;
2148		if (block_end > from && block_start < to) {
2149			if (!buffer_uptodate(bh)) {
2150				ret = 0;
2151				break;
2152			}
2153			if (block_end >= to)
2154				break;
2155		}
2156		block_start = block_end;
2157		bh = bh->b_this_page;
2158	} while (bh != head);
2159
2160	return ret;
2161}
2162EXPORT_SYMBOL(block_is_partially_uptodate);
2163
2164/*
2165 * Generic "read page" function for block devices that have the normal
2166 * get_block functionality. This is most of the block device filesystems.
2167 * Reads the page asynchronously --- the unlock_buffer() and
2168 * set/clear_buffer_uptodate() functions propagate buffer state into the
2169 * page struct once IO has completed.
 *
 * @page:      the page to read.  It is unlocked in this function only when
 *             no buffer turns out to need I/O.
 * @get_block: filesystem callback used to map buffers that are not mapped yet.
 *
 * Always returns 0.  A get_block() failure is not returned; it is recorded
 * with SetPageError(), which also keeps the page from being marked uptodate
 * on the no-I/O path.
2170 */
2171int block_read_full_page(struct page *page, get_block_t *get_block)
2172{
2173	struct inode *inode = page->mapping->host;
2174	sector_t iblock, lblock;
2175	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2176	unsigned int blocksize, bbits;
2177	int nr, i;
2178	int fully_mapped = 1;
2179
2180	head = create_page_buffers(page, inode, 0);
2181	blocksize = head->b_size;
2182	bbits = block_size_bits(blocksize);
 
 
2183
	/*
	 * iblock is the file block behind the page's first buffer; lblock is
	 * i_size rounded up to a whole number of blocks.
	 */
2184	iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2185	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2186	bh = head;
2187	nr = 0;
2188	i = 0;
2189
	/*
	 * Stage one: collect in arr[] every buffer that needs a read.  Note
	 * that "continue" inside a do-while jumps to the loop condition, so
	 * i, iblock and bh still advance for the buffers that are skipped.
	 */
2190	do {
2191		if (buffer_uptodate(bh))
2192			continue;
2193
2194		if (!buffer_mapped(bh)) {
2195			int err = 0;
2196
2197			fully_mapped = 0;
2198			if (iblock < lblock) {
2199				WARN_ON(bh->b_size != blocksize);
2200				err = get_block(inode, iblock, bh, 0);
2201				if (err)
2202					SetPageError(page);
2203			}
			/*
			 * Still unmapped: either get_block() was not called
			 * because the block lies at or beyond lblock, or it
			 * left the buffer unmapped.  Zero-fill instead of
			 * reading.
			 */
2204			if (!buffer_mapped(bh)) {
2205				zero_user(page, i * blocksize, blocksize);
2206				if (!err)
2207					set_buffer_uptodate(bh);
2208				continue;
2209			}
2210			/*
2211			 * get_block() might have updated the buffer
2212			 * synchronously
2213			 */
2214			if (buffer_uptodate(bh))
2215				continue;
2216		}
2217		arr[nr++] = bh;
2218	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2219
	/* Only set when no buffer was found unmapped on entry to the loop. */
2220	if (fully_mapped)
2221		SetPageMappedToDisk(page);
2222
2223	if (!nr) {
2224		/*
2225		 * All buffers are uptodate - we can set the page uptodate
2226		 * as well. But not if get_block() returned an error.
2227		 */
2228		if (!PageError(page))
2229			SetPageUptodate(page);
2230		unlock_page(page);
2231		return 0;
2232	}
2233
2234	/* Stage two: lock the buffers */
2235	for (i = 0; i < nr; i++) {
2236		bh = arr[i];
2237		lock_buffer(bh);
2238		mark_buffer_async_read(bh);
2239	}
2240
2241	/*
2242	 * Stage 3: start the IO.  Check for uptodateness
2243	 * inside the buffer lock in case another process reading
2244	 * the underlying blockdev brought it uptodate (the sct fix).
2245	 */
2246	for (i = 0; i < nr; i++) {
2247		bh = arr[i];
2248		if (buffer_uptodate(bh))
2249			end_buffer_async_read(bh, 1);
2250		else
2251			submit_bh(READ, bh);
2252	}
2253	return 0;
2254}
2255EXPORT_SYMBOL(block_read_full_page);
2256
2257/* utility function for filesystems that need to do work on expanding
2258 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2259 * deal with the hole.  
2260 */
2261int generic_cont_expand_simple(struct inode *inode, loff_t size)
2262{
2263	struct address_space *mapping = inode->i_mapping;
2264	struct page *page;
2265	void *fsdata;
2266	int err;
2267
2268	err = inode_newsize_ok(inode, size);
2269	if (err)
2270		goto out;
2271
2272	err = pagecache_write_begin(NULL, mapping, size, 0,
2273				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2274				&page, &fsdata);
2275	if (err)
2276		goto out;
2277
2278	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2279	BUG_ON(err > 0);
2280
2281out:
2282	return err;
2283}
2284EXPORT_SYMBOL(generic_cont_expand_simple);
2285
/*
 * cont_expand_zero - zero-fill the page cache between *@bytes and @pos
 *
 * @file:    passed through to pagecache_write_begin()/pagecache_write_end()
 * @mapping: address space being extended
 * @pos:     file offset up to which zeroes are written
 * @bytes:   in/out offset at which zeroing starts; it is rounded up to the
 *           next block boundary whenever the zeroing start is not
 *           block aligned
 *
 * Works one page at a time through the regular pagecache write path: whole
 * page tails first, then the part of the final page below @pos.
 *
 * Returns 0 on success, the error from pagecache_write_begin() or
 * pagecache_write_end(), or -EINTR when a fatal signal is pending.
 */
2286static int cont_expand_zero(struct file *file, struct address_space *mapping,
2287			    loff_t pos, loff_t *bytes)
2288{
2289	struct inode *inode = mapping->host;
2290	unsigned blocksize = 1 << inode->i_blkbits;
2291	struct page *page;
2292	void *fsdata;
2293	pgoff_t index, curidx;
2294	loff_t curpos;
2295	unsigned zerofrom, offset, len;
2296	int err = 0;
2297
	/* page index and in-page offset of the target position */
2298	index = pos >> PAGE_SHIFT;
2299	offset = pos & ~PAGE_MASK;
2300
	/*
	 * Zero from curpos to the end of its page for as long as *bytes lies
	 * in a page before the one containing pos.
	 *
	 * NOTE(review): curpos is re-read from *bytes on every iteration, so
	 * progress presumably relies on *bytes being advanced as a side
	 * effect of pagecache_write_end() -- confirm against the callers.
	 */
2301	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2302		zerofrom = curpos & ~PAGE_MASK;
		/* not block aligned: round *bytes up to the next block */
2303		if (zerofrom & (blocksize-1)) {
2304			*bytes |= (blocksize-1);
2305			(*bytes)++;
2306		}
2307		len = PAGE_SIZE - zerofrom;
2308
2309		err = pagecache_write_begin(file, mapping, curpos, len,
2310						AOP_FLAG_UNINTERRUPTIBLE,
2311						&page, &fsdata);
2312		if (err)
2313			goto out;
2314		zero_user(page, zerofrom, len);
2315		err = pagecache_write_end(file, mapping, curpos, len, len,
2316						page, fsdata);
2317		if (err < 0)
2318			goto out;
		/* a short write of the zeroes is treated as a bug */
2319		BUG_ON(err != len);
2320		err = 0;
2321
2322		balance_dirty_pages_ratelimited(mapping);
2323
2324		if (unlikely(fatal_signal_pending(current))) {
2325			err = -EINTR;
2326			goto out;
2327		}
2328	}
2329
2330	/* page covers the boundary, find the boundary offset */
2331	if (index == curidx) {
2332		zerofrom = curpos & ~PAGE_MASK;
2333		/* if we will expand the thing last block will be filled */
2334		if (offset <= zerofrom) {
2335			goto out;
2336		}
2337		if (zerofrom & (blocksize-1)) {
2338			*bytes |= (blocksize-1);
2339			(*bytes)++;
2340		}
		/* zero only the gap between curpos and pos within this page */
2341		len = offset - zerofrom;
2342
2343		err = pagecache_write_begin(file, mapping, curpos, len,
2344						AOP_FLAG_UNINTERRUPTIBLE,
2345						&page, &fsdata);
2346		if (err)
2347			goto out;
2348		zero_user(page, zerofrom, len);
2349		err = pagecache_write_end(file, mapping, curpos, len, len,
2350						page, fsdata);
2351		if (err < 0)
2352			goto out;
2353		BUG_ON(err != len);
2354		err = 0;
2355	}
2356out:
2357	return err;
2358}
2359
2360/*
2361 * For moronic filesystems that do not allow holes in file.
2362 * We may have to extend the file.
2363 */
2364int cont_write_begin(struct file *file, struct address_space *mapping,
2365			loff_t pos, unsigned len, unsigned flags,
2366			struct page **pagep, void **fsdata,
2367			get_block_t *get_block, loff_t *bytes)
2368{
2369	struct inode *inode = mapping->host;
2370	unsigned blocksize = 1 << inode->i_blkbits;
2371	unsigned zerofrom;
2372	int err;
2373
2374	err = cont_expand_zero(file, mapping, pos, bytes);
2375	if (err)
2376		return err;
2377
2378	zerofrom = *bytes & ~PAGE_MASK;
2379	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2380		*bytes |= (blocksize-1);
2381		(*bytes)++;
2382	}
2383
2384	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2385}
2386EXPORT_SYMBOL(cont_write_begin);
2387
2388int block_commit_write(struct page *page, unsigned from, unsigned to)
2389{
2390	struct inode *inode = page->mapping->host;
2391	__block_commit_write(inode,page,from,to);
2392	return 0;
2393}
2394EXPORT_SYMBOL(block_commit_write);
2395
2396/*
2397 * block_page_mkwrite() is not allowed to change the file size as it gets
2398 * called from a page fault handler when a page is first dirtied. Hence we must
2399 * be careful to check for EOF conditions here. We set the page up correctly
2400 * for a written page which means we get ENOSPC checking when writing into
2401 * holes and correct delalloc and unwritten extent mapping on filesystems that
2402 * support these features.
2403 *
2404 * We are not allowed to take the i_mutex here so we have to play games to
2405 * protect against truncate races as the page could now be beyond EOF.  Because
2406 * truncate writes the inode size before removing pages, once we have the
2407 * page lock we can determine safely if the page is beyond EOF. If it is not
2408 * beyond EOF, then the page is guaranteed safe against truncation until we
2409 * unlock the page.
2410 *
2411 * Direct callers of this function should protect against filesystem freezing
2412 * using sb_start_pagefault() - sb_end_pagefault() functions.
2413 */
2414int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2415			 get_block_t get_block)
2416{
2417	struct page *page = vmf->page;
2418	struct inode *inode = file_inode(vma->vm_file);
2419	unsigned long end;
2420	loff_t size;
2421	int ret;
2422
2423	lock_page(page);
2424	size = i_size_read(inode);
2425	if ((page->mapping != inode->i_mapping) ||
2426	    (page_offset(page) > size)) {
2427		/* We overload EFAULT to mean page got truncated */
2428		ret = -EFAULT;
2429		goto out_unlock;
2430	}
2431
2432	/* page is wholly or partially inside EOF */
2433	if (((page->index + 1) << PAGE_SHIFT) > size)
2434		end = size & ~PAGE_MASK;
2435	else
2436		end = PAGE_SIZE;
2437
2438	ret = __block_write_begin(page, 0, end, get_block);
2439	if (!ret)
2440		ret = block_commit_write(page, 0, end);
2441
2442	if (unlikely(ret < 0))
2443		goto out_unlock;
 
 
 
 
 
 
 
2444	set_page_dirty(page);
2445	wait_for_stable_page(page);
 
 
 
 
2446	return 0;
2447out_unlock:
2448	unlock_page(page);
2449	return ret;
2450}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2451EXPORT_SYMBOL(block_page_mkwrite);
2452
/*
 * Completion handler for the prereads issued by nobh_write_begin().  Those
 * buffer_heads are freed immediately, while still under the page lock, so
 * the handler must not touch the bh once it has been unlocked; the work is
 * delegated to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2462
2463/*
2464 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2465 * the page (converting it to circular linked list and taking care of page
2466 * dirty races).
2467 */
2468static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2469{
2470	struct buffer_head *bh;
2471
2472	BUG_ON(!PageLocked(page));
2473
2474	spin_lock(&page->mapping->private_lock);
2475	bh = head;
2476	do {
2477		if (PageDirty(page))
2478			set_buffer_dirty(bh);
2479		if (!bh->b_this_page)
2480			bh->b_this_page = head;
2481		bh = bh->b_this_page;
2482	} while (bh != head);
2483	attach_page_buffers(page, head);
2484	spin_unlock(&page->mapping->private_lock);
2485}
2486
2487/*
2488 * On entry, the page is fully not uptodate.
2489 * On exit the page is fully uptodate in the areas outside (from,to)
2490 * The filesystem needs to handle block truncation upon failure.
 *
 * @mapping:   address space being written to
 * @pos, @len: file range of the write; (from,to) above are the matching
 *             offsets inside the page
 * @flags:     passed to grab_cache_page_write_begin()
 * @pagep:     receives the page on success, NULL on the out_release path
 * @fsdata:    receives the temporary, unattached buffer list when one was
 *             allocated here (freed by nobh_write_end()), otherwise NULL
 * @get_block: filesystem block mapping callback
 *
 * Returns 0 on success or a negative errno.  When no page could be grabbed,
 * -ENOMEM is returned without touching *@pagep or *@fsdata.
2491 */
2492int nobh_write_begin(struct address_space *mapping,
2493			loff_t pos, unsigned len, unsigned flags,
2494			struct page **pagep, void **fsdata,
2495			get_block_t *get_block)
2496{
2497	struct inode *inode = mapping->host;
2498	const unsigned blkbits = inode->i_blkbits;
2499	const unsigned blocksize = 1 << blkbits;
2500	struct buffer_head *head, *bh;
2501	struct page *page;
2502	pgoff_t index;
2503	unsigned from, to;
2504	unsigned block_in_page;
2505	unsigned block_start, block_end;
2506	sector_t block_in_file;
2507	int nr_reads = 0;
2508	int ret = 0;
2509	int is_mapped_to_disk = 1;
2510
2511	index = pos >> PAGE_SHIFT;
2512	from = pos & (PAGE_SIZE - 1);
2513	to = from + len;
2514
2515	page = grab_cache_page_write_begin(mapping, index, flags);
2516	if (!page)
2517		return -ENOMEM;
2518	*pagep = page;
2519	*fsdata = NULL;
2520
	/* Page already carries buffers: use the regular buffer-based path. */
2521	if (page_has_buffers(page)) {
2522		ret = __block_write_begin(page, pos, len, get_block);
2523		if (unlikely(ret))
2524			goto out_release;
2525		return ret;
2526	}
2527
	/* Nothing to set up when the page is flagged as mapped to disk. */
2528	if (PageMappedToDisk(page))
2529		return 0;
2530
2531	/*
2532	 * Allocate buffers so that we can keep track of state, and potentially
2533	 * attach them to the page if an error occurs. In the common case of
2534	 * no error, they will just be freed again without ever being attached
2535	 * to the page (which is all OK, because we're under the page lock).
2536	 *
2537	 * Be careful: the buffer linked list is a NULL terminated one, rather
2538	 * than the circular one we're used to.
2539	 */
2540	head = alloc_page_buffers(page, blocksize, 0);
2541	if (!head) {
2542		ret = -ENOMEM;
2543		goto out_release;
2544	}
2545
2546	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2547
2548	/*
2549	 * We loop across all blocks in the page, whether or not they are
2550	 * part of the affected region.  This is so we can discover if the
2551	 * page is fully mapped-to-disk.
2552	 */
2553	for (block_start = 0, block_in_page = 0, bh = head;
2554		  block_start < PAGE_SIZE;
2555		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2556		int create;
2557
2558		block_end = block_start + blocksize;
2559		bh->b_state = 0;
		/* ask for allocation only for blocks that start before 'to' */
2560		create = 1;
2561		if (block_start >= to)
2562			create = 0;
2563		ret = get_block(inode, block_in_file + block_in_page,
2564					bh, create);
2565		if (ret)
2566			goto failed;
2567		if (!buffer_mapped(bh))
2568			is_mapped_to_disk = 0;
2569		if (buffer_new(bh))
2570			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2571		if (PageUptodate(page)) {
2572			set_buffer_uptodate(bh);
2573			continue;
2574		}
		/*
		 * New or unmapped block: zero the parts of it that lie
		 * outside (from,to) instead of reading them.
		 */
2575		if (buffer_new(bh) || !buffer_mapped(bh)) {
2576			zero_user_segments(page, block_start, from,
2577							to, block_end);
2578			continue;
2579		}
2580		if (buffer_uptodate(bh))
2581			continue;	/* reiserfs does this */
		/* block is not completely overwritten: read it in first */
2582		if (block_start < from || block_end > to) {
2583			lock_buffer(bh);
2584			bh->b_end_io = end_buffer_read_nobh;
2585			submit_bh(READ, bh);
2586			nr_reads++;
2587		}
2588	}
2589
2590	if (nr_reads) {
2591		/*
2592		 * The page is locked, so these buffers are protected from
2593		 * any VM or truncate activity.  Hence we don't need to care
2594		 * for the buffer_head refcounts.
2595		 */
2596		for (bh = head; bh; bh = bh->b_this_page) {
2597			wait_on_buffer(bh);
2598			if (!buffer_uptodate(bh))
2599				ret = -EIO;
2600		}
2601		if (ret)
2602			goto failed;
2603	}
2604
2605	if (is_mapped_to_disk)
2606		SetPageMappedToDisk(page);
2607
2608	*fsdata = head; /* to be released by nobh_write_end */
2609
2610	return 0;
2611
2612failed:
2613	BUG_ON(!ret);
2614	/*
2615	 * Error recovery is a bit difficult. We need to zero out blocks that
2616	 * were newly allocated, and dirty them to ensure they get written out.
2617	 * Buffers need to be attached to the page at this point, otherwise
2618	 * the handling of potential IO errors during writeout would be hard
2619	 * (could try doing synchronous writeout, but what if that fails too?)
2620	 */
2621	attach_nobh_buffers(page, head);
2622	page_zero_new_buffers(page, from, to);
2623
	/* Common failure exit: drop the page and report no page to the caller. */
2624out_release:
2625	unlock_page(page);
2626	put_page(page);
2627	*pagep = NULL;
2628
2629	return ret;
2630}
2631EXPORT_SYMBOL(nobh_write_begin);
2632
2633int nobh_write_end(struct file *file, struct address_space *mapping,
2634			loff_t pos, unsigned len, unsigned copied,
2635			struct page *page, void *fsdata)
2636{
2637	struct inode *inode = page->mapping->host;
2638	struct buffer_head *head = fsdata;
2639	struct buffer_head *bh;
2640	BUG_ON(fsdata != NULL && page_has_buffers(page));
2641
2642	if (unlikely(copied < len) && head)
2643		attach_nobh_buffers(page, head);
2644	if (page_has_buffers(page))
2645		return generic_write_end(file, mapping, pos, len,
2646					copied, page, fsdata);
2647
2648	SetPageUptodate(page);
2649	set_page_dirty(page);
2650	if (pos+copied > inode->i_size) {
2651		i_size_write(inode, pos+copied);
2652		mark_inode_dirty(inode);
2653	}
2654
2655	unlock_page(page);
2656	put_page(page);
2657
2658	while (head) {
2659		bh = head;
2660		head = head->b_this_page;
2661		free_buffer_head(bh);
2662	}
2663
2664	return copied;
2665}
2666EXPORT_SYMBOL(nobh_write_end);
2667
2668/*
2669 * nobh_writepage() - based on block_full_write_page() except
2670 * that it tries to operate without attaching bufferheads to
2671 * the page.
2672 */
2673int nobh_writepage(struct page *page, get_block_t *get_block,
2674			struct writeback_control *wbc)
2675{
2676	struct inode * const inode = page->mapping->host;
2677	loff_t i_size = i_size_read(inode);
2678	const pgoff_t end_index = i_size >> PAGE_SHIFT;
2679	unsigned offset;
2680	int ret;
2681
2682	/* Is the page fully inside i_size? */
2683	if (page->index < end_index)
2684		goto out;
2685
2686	/* Is the page fully outside i_size? (truncate in progress) */
2687	offset = i_size & (PAGE_SIZE-1);
2688	if (page->index >= end_index+1 || !offset) {
2689		/*
2690		 * The page may have dirty, unmapped buffers.  For example,
2691		 * they may have been added in ext3_writepage().  Make them
2692		 * freeable here, so the page does not leak.
2693		 */
2694#if 0
2695		/* Not really sure about this  - do we need this ? */
2696		if (page->mapping->a_ops->invalidatepage)
2697			page->mapping->a_ops->invalidatepage(page, offset);
2698#endif
2699		unlock_page(page);
2700		return 0; /* don't care */
2701	}
2702
2703	/*
2704	 * The page straddles i_size.  It must be zeroed out on each and every
2705	 * writepage invocation because it may be mmapped.  "A file is mapped
2706	 * in multiples of the page size.  For a file that is not a multiple of
2707	 * the  page size, the remaining memory is zeroed when mapped, and
2708	 * writes to that region are not written out to the file."
2709	 */
2710	zero_user_segment(page, offset, PAGE_SIZE);
2711out:
2712	ret = mpage_writepage(page, get_block, wbc);
2713	if (ret == -EAGAIN)
2714		ret = __block_write_full_page(inode, page, get_block, wbc,
2715					      end_buffer_async_write);
2716	return ret;
2717}
2718EXPORT_SYMBOL(nobh_writepage);
2719
2720int nobh_truncate_page(struct address_space *mapping,
2721			loff_t from, get_block_t *get_block)
2722{
2723	pgoff_t index = from >> PAGE_SHIFT;
2724	unsigned offset = from & (PAGE_SIZE-1);
2725	unsigned blocksize;
2726	sector_t iblock;
2727	unsigned length, pos;
2728	struct inode *inode = mapping->host;
2729	struct page *page;
2730	struct buffer_head map_bh;
2731	int err;
2732
2733	blocksize = 1 << inode->i_blkbits;
2734	length = offset & (blocksize - 1);
2735
2736	/* Block boundary? Nothing to do */
2737	if (!length)
2738		return 0;
2739
2740	length = blocksize - length;
2741	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2742
2743	page = grab_cache_page(mapping, index);
2744	err = -ENOMEM;
2745	if (!page)
2746		goto out;
2747
2748	if (page_has_buffers(page)) {
2749has_buffers:
2750		unlock_page(page);
2751		put_page(page);
2752		return block_truncate_page(mapping, from, get_block);
2753	}
2754
2755	/* Find the buffer that contains "offset" */
2756	pos = blocksize;
2757	while (offset >= pos) {
2758		iblock++;
2759		pos += blocksize;
2760	}
2761
2762	map_bh.b_size = blocksize;
2763	map_bh.b_state = 0;
2764	err = get_block(inode, iblock, &map_bh, 0);
2765	if (err)
2766		goto unlock;
2767	/* unmapped? It's a hole - nothing to do */
2768	if (!buffer_mapped(&map_bh))
2769		goto unlock;
2770
2771	/* Ok, it's mapped. Make sure it's up-to-date */
2772	if (!PageUptodate(page)) {
2773		err = mapping->a_ops->readpage(NULL, page);
2774		if (err) {
2775			put_page(page);
2776			goto out;
2777		}
2778		lock_page(page);
2779		if (!PageUptodate(page)) {
2780			err = -EIO;
2781			goto unlock;
2782		}
2783		if (page_has_buffers(page))
2784			goto has_buffers;
2785	}
2786	zero_user(page, offset, length);
2787	set_page_dirty(page);
2788	err = 0;
2789
2790unlock:
2791	unlock_page(page);
2792	put_page(page);
2793out:
2794	return err;
2795}
2796EXPORT_SYMBOL(nobh_truncate_page);
2797
2798int block_truncate_page(struct address_space *mapping,
2799			loff_t from, get_block_t *get_block)
2800{
2801	pgoff_t index = from >> PAGE_SHIFT;
2802	unsigned offset = from & (PAGE_SIZE-1);
2803	unsigned blocksize;
2804	sector_t iblock;
2805	unsigned length, pos;
2806	struct inode *inode = mapping->host;
2807	struct page *page;
2808	struct buffer_head *bh;
2809	int err;
2810
2811	blocksize = 1 << inode->i_blkbits;
2812	length = offset & (blocksize - 1);
2813
2814	/* Block boundary? Nothing to do */
2815	if (!length)
2816		return 0;
2817
2818	length = blocksize - length;
2819	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2820	
2821	page = grab_cache_page(mapping, index);
2822	err = -ENOMEM;
2823	if (!page)
2824		goto out;
2825
2826	if (!page_has_buffers(page))
2827		create_empty_buffers(page, blocksize, 0);
2828
2829	/* Find the buffer that contains "offset" */
2830	bh = page_buffers(page);
2831	pos = blocksize;
2832	while (offset >= pos) {
2833		bh = bh->b_this_page;
2834		iblock++;
2835		pos += blocksize;
2836	}
2837
2838	err = 0;
2839	if (!buffer_mapped(bh)) {
2840		WARN_ON(bh->b_size != blocksize);
2841		err = get_block(inode, iblock, bh, 0);
2842		if (err)
2843			goto unlock;
2844		/* unmapped? It's a hole - nothing to do */
2845		if (!buffer_mapped(bh))
2846			goto unlock;
2847	}
2848
2849	/* Ok, it's mapped. Make sure it's up-to-date */
2850	if (PageUptodate(page))
2851		set_buffer_uptodate(bh);
2852
2853	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2854		err = -EIO;
2855		ll_rw_block(READ, 1, &bh);
2856		wait_on_buffer(bh);
2857		/* Uhhuh. Read error. Complain and punt. */
2858		if (!buffer_uptodate(bh))
2859			goto unlock;
2860	}
2861
2862	zero_user(page, offset, length);
2863	mark_buffer_dirty(bh);
2864	err = 0;
2865
2866unlock:
2867	unlock_page(page);
2868	put_page(page);
2869out:
2870	return err;
2871}
2872EXPORT_SYMBOL(block_truncate_page);
2873
2874/*
2875 * The generic ->writepage function for buffer-backed address_spaces
 
2876 */
2877int block_write_full_page(struct page *page, get_block_t *get_block,
2878			struct writeback_control *wbc)
2879{
2880	struct inode * const inode = page->mapping->host;
2881	loff_t i_size = i_size_read(inode);
2882	const pgoff_t end_index = i_size >> PAGE_SHIFT;
2883	unsigned offset;
2884
2885	/* Is the page fully inside i_size? */
2886	if (page->index < end_index)
2887		return __block_write_full_page(inode, page, get_block, wbc,
2888					       end_buffer_async_write);
2889
2890	/* Is the page fully outside i_size? (truncate in progress) */
2891	offset = i_size & (PAGE_SIZE-1);
2892	if (page->index >= end_index+1 || !offset) {
2893		/*
2894		 * The page may have dirty, unmapped buffers.  For example,
2895		 * they may have been added in ext3_writepage().  Make them
2896		 * freeable here, so the page does not leak.
2897		 */
2898		do_invalidatepage(page, 0, PAGE_SIZE);
2899		unlock_page(page);
2900		return 0; /* don't care */
2901	}
2902
2903	/*
2904	 * The page straddles i_size.  It must be zeroed out on each and every
2905	 * writepage invocation because it may be mmapped.  "A file is mapped
2906	 * in multiples of the page size.  For a file that is not a multiple of
2907	 * the  page size, the remaining memory is zeroed when mapped, and
2908	 * writes to that region are not written out to the file."
2909	 */
2910	zero_user_segment(page, offset, PAGE_SIZE);
2911	return __block_write_full_page(inode, page, get_block, wbc,
2912							end_buffer_async_write);
 
 
 
 
 
 
 
 
 
 
2913}
2914EXPORT_SYMBOL(block_write_full_page);
2915
2916sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2917			    get_block_t *get_block)
2918{
2919	struct buffer_head tmp;
2920	struct inode *inode = mapping->host;
2921	tmp.b_state = 0;
2922	tmp.b_blocknr = 0;
2923	tmp.b_size = 1 << inode->i_blkbits;
2924	get_block(inode, block, &tmp, 0);
2925	return tmp.b_blocknr;
2926}
2927EXPORT_SYMBOL(generic_block_bmap);
2928
2929static void end_bio_bh_io_sync(struct bio *bio)
2930{
2931	struct buffer_head *bh = bio->bi_private;
2932
2933	if (unlikely(bio_flagged(bio, BIO_QUIET)))
 
 
 
 
2934		set_bit(BH_Quiet, &bh->b_state);
2935
2936	bh->b_end_io(bh, !bio->bi_error);
2937	bio_put(bio);
2938}
2939
2940/*
2941 * This allows us to do IO even on the odd last sectors
2942 * of a device, even if the block size is some multiple
2943 * of the physical sector size.
2944 *
2945 * We'll just truncate the bio to the size of the device,
2946 * and clear the end of the buffer head manually.
2947 *
2948 * Truly out-of-range accesses will turn into actual IO
2949 * errors, this only handles the "we need to be able to
2950 * do IO at the final sector" case.
2951 */
2952void guard_bio_eod(int rw, struct bio *bio)
2953{
2954	sector_t maxsector;
2955	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2956	unsigned truncated_bytes;
2957
2958	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2959	if (!maxsector)
2960		return;
2961
2962	/*
2963	 * If the *whole* IO is past the end of the device,
2964	 * let it through, and the IO layer will turn it into
2965	 * an EIO.
2966	 */
2967	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2968		return;
2969
2970	maxsector -= bio->bi_iter.bi_sector;
2971	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2972		return;
2973
2974	/* Uhhuh. We've got a bio that straddles the device size! */
2975	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2976
2977	/* Truncate the bio.. */
2978	bio->bi_iter.bi_size -= truncated_bytes;
2979	bvec->bv_len -= truncated_bytes;
2980
2981	/* ..and clear the end of the buffer for reads */
2982	if ((rw & RW_MASK) == READ) {
2983		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2984				truncated_bytes);
2985	}
2986}
2987
/*
 * submit_bh_wbc - wrap one buffer_head in a single-segment bio and submit it
 *
 * @rw:        request type/flags handed to submit_bio(); REQ_META and
 *             REQ_PRIO are added for buffers marked meta/prio
 * @bh:        the buffer; it must be locked and mapped, have b_end_io set,
 *             and be neither delay nor unwritten (all enforced by BUG_ON)
 * @bio_flags: OR-ed into bio->bi_flags
 * @wbc:       optional writeback control; when non-NULL the bio is
 *             initialised from it and the IO is accounted to it
 *
 * Completion is reported through bh->b_end_io, called from
 * end_bio_bh_io_sync().  Always returns 0.
 */
2988static int submit_bh_wbc(int rw, struct buffer_head *bh,
2989			 unsigned long bio_flags, struct writeback_control *wbc)
2990{
2991	struct bio *bio;
 
2992
2993	BUG_ON(!buffer_locked(bh));
2994	BUG_ON(!buffer_mapped(bh));
2995	BUG_ON(!bh->b_end_io);
2996	BUG_ON(buffer_delay(bh));
2997	BUG_ON(buffer_unwritten(bh));
2998
2999	/*
3000	 * Only clear out a write error when rewriting
3001	 */
3002	if (test_set_buffer_req(bh) && (rw & WRITE))
3003		clear_buffer_write_io_error(bh);
3004
3005	/*
3006	 * from here on down, it's all bio -- do the initial mapping,
3007	 * submit_bio -> generic_make_request may further map this bio around
3008	 */
	/*
	 * NOTE(review): the bio_alloc() result is used without a NULL check;
	 * presumably this allocation cannot fail with GFP_NOIO -- confirm.
	 */
3009	bio = bio_alloc(GFP_NOIO, 1);
3010
3011	if (wbc) {
3012		wbc_init_bio(wbc, bio);
3013		wbc_account_io(wbc, bh->b_page, bh->b_size);
3014	}
3015
	/* b_blocknr counts b_size-sized blocks; bi_sector counts 512-byte sectors */
3016	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3017	bio->bi_bdev = bh->b_bdev;
3018
3019	bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3020	BUG_ON(bio->bi_iter.bi_size != bh->b_size);
 
 
 
 
3021
3022	bio->bi_end_io = end_bio_bh_io_sync;
3023	bio->bi_private = bh;
3024	bio->bi_flags |= bio_flags;
3025
3026	/* Take care of bh's that straddle the end of the device */
3027	guard_bio_eod(rw, bio);
3028
3029	if (buffer_meta(bh))
3030		rw |= REQ_META;
3031	if (buffer_prio(bh))
3032		rw |= REQ_PRIO;
3033
 
3034	submit_bio(rw, bio);
3035	return 0;
3036}
3037
3038int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3039{
3040	return submit_bh_wbc(rw, bh, bio_flags, NULL);
3041}
3042EXPORT_SYMBOL_GPL(_submit_bh);
3043
3044int submit_bh(int rw, struct buffer_head *bh)
3045{
3046	return submit_bh_wbc(rw, bh, 0, NULL);
3047}
3048EXPORT_SYMBOL(submit_bh);
3049
3050/**
3051 * ll_rw_block: low-level access to block devices (DEPRECATED)
3052 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3053 * @nr: number of &struct buffer_heads in the array
3054 * @bhs: array of pointers to &struct buffer_head
3055 *
3056 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3057 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3058 * %READA option is described in the documentation for generic_make_request()
3059 * which ll_rw_block() calls.
3060 *
3061 * This function drops any buffer that it cannot get a lock on (with the
3062 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3063 * request, and any buffer that appears to be up-to-date when doing read
3064 * request.  Further it marks as clean buffers that are processed for
3065 * writing (the buffer cache won't assume that they are actually clean
3066 * until the buffer gets unlocked).
3067 *
3068 * ll_rw_block sets b_end_io to simple completion handler that marks
3069 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3070 * any waiters. 
3071 *
3072 * All of the buffers must be for the same device, and must also be a
3073 * multiple of the current approved size for the device.
3074 */
3075void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3076{
3077	int i;
3078
3079	for (i = 0; i < nr; i++) {
3080		struct buffer_head *bh = bhs[i];
3081
3082		if (!trylock_buffer(bh))
3083			continue;
3084		if (rw == WRITE) {
3085			if (test_clear_buffer_dirty(bh)) {
3086				bh->b_end_io = end_buffer_write_sync;
3087				get_bh(bh);
3088				submit_bh(WRITE, bh);
3089				continue;
3090			}
3091		} else {
3092			if (!buffer_uptodate(bh)) {
3093				bh->b_end_io = end_buffer_read_sync;
3094				get_bh(bh);
3095				submit_bh(rw, bh);
3096				continue;
3097			}
3098		}
3099		unlock_buffer(bh);
3100	}
3101}
3102EXPORT_SYMBOL(ll_rw_block);
3103
3104void write_dirty_buffer(struct buffer_head *bh, int rw)
3105{
3106	lock_buffer(bh);
3107	if (!test_clear_buffer_dirty(bh)) {
3108		unlock_buffer(bh);
3109		return;
3110	}
3111	bh->b_end_io = end_buffer_write_sync;
3112	get_bh(bh);
3113	submit_bh(rw, bh);
3114}
3115EXPORT_SYMBOL(write_dirty_buffer);
3116
3117/*
3118 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3119 * and then start new I/O and then wait upon it.  The caller must have a ref on
3120 * the buffer_head.
3121 */
3122int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3123{
3124	int ret = 0;
3125
3126	WARN_ON(atomic_read(&bh->b_count) < 1);
3127	lock_buffer(bh);
3128	if (test_clear_buffer_dirty(bh)) {
3129		get_bh(bh);
3130		bh->b_end_io = end_buffer_write_sync;
3131		ret = submit_bh(rw, bh);
3132		wait_on_buffer(bh);
3133		if (!ret && !buffer_uptodate(bh))
3134			ret = -EIO;
3135	} else {
3136		unlock_buffer(bh);
3137	}
3138	return ret;
3139}
3140EXPORT_SYMBOL(__sync_dirty_buffer);
3141
3142int sync_dirty_buffer(struct buffer_head *bh)
3143{
3144	return __sync_dirty_buffer(bh, WRITE_SYNC);
3145}
3146EXPORT_SYMBOL(sync_dirty_buffer);
3147
3148/*
3149 * try_to_free_buffers() checks if all the buffers on this particular page
3150 * are unused, and releases them if so.
3151 *
3152 * Exclusion against try_to_free_buffers may be obtained by either
3153 * locking the page or by holding its mapping's private_lock.
3154 *
3155 * If the page is dirty but all the buffers are clean then we need to
3156 * be sure to mark the page clean as well.  This is because the page
3157 * may be against a block device, and a later reattachment of buffers
3158 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3159 * filesystem data on the same device.
3160 *
3161 * The same applies to regular filesystem pages: if all the buffers are
3162 * clean then we set the page clean and proceed.  To do that, we require
3163 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3164 * private_lock.
3165 *
3166 * try_to_free_buffers() is non-blocking.
3167 */
3168static inline int buffer_busy(struct buffer_head *bh)
3169{
3170	return atomic_read(&bh->b_count) |
3171		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3172}
3173
3174static int
3175drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3176{
3177	struct buffer_head *head = page_buffers(page);
3178	struct buffer_head *bh;
3179
3180	bh = head;
3181	do {
3182		if (buffer_write_io_error(bh) && page->mapping)
3183			set_bit(AS_EIO, &page->mapping->flags);
3184		if (buffer_busy(bh))
3185			goto failed;
3186		bh = bh->b_this_page;
3187	} while (bh != head);
3188
3189	do {
3190		struct buffer_head *next = bh->b_this_page;
3191
3192		if (bh->b_assoc_map)
3193			__remove_assoc_queue(bh);
3194		bh = next;
3195	} while (bh != head);
3196	*buffers_to_free = head;
3197	__clear_page_buffers(page);
3198	return 1;
3199failed:
3200	return 0;
3201}
3202
3203int try_to_free_buffers(struct page *page)
3204{
3205	struct address_space * const mapping = page->mapping;
3206	struct buffer_head *buffers_to_free = NULL;
3207	int ret = 0;
3208
3209	BUG_ON(!PageLocked(page));
3210	if (PageWriteback(page))
3211		return 0;
3212
3213	if (mapping == NULL) {		/* can this still happen? */
3214		ret = drop_buffers(page, &buffers_to_free);
3215		goto out;
3216	}
3217
3218	spin_lock(&mapping->private_lock);
3219	ret = drop_buffers(page, &buffers_to_free);
3220
3221	/*
3222	 * If the filesystem writes its buffers by hand (eg ext3)
3223	 * then we can have clean buffers against a dirty page.  We
3224	 * clean the page here; otherwise the VM will never notice
3225	 * that the filesystem did any IO at all.
3226	 *
3227	 * Also, during truncate, discard_buffer will have marked all
3228	 * the page's buffers clean.  We discover that here and clean
3229	 * the page also.
3230	 *
3231	 * private_lock must be held over this entire operation in order
3232	 * to synchronise against __set_page_dirty_buffers and prevent the
3233	 * dirty bit from being lost.
3234	 */
3235	if (ret)
3236		cancel_dirty_page(page);
3237	spin_unlock(&mapping->private_lock);
3238out:
3239	if (buffers_to_free) {
3240		struct buffer_head *bh = buffers_to_free;
3241
3242		do {
3243			struct buffer_head *next = bh->b_this_page;
3244			free_buffer_head(bh);
3245			bh = next;
3246		} while (bh != buffers_to_free);
3247	}
3248	return ret;
3249}
3250EXPORT_SYMBOL(try_to_free_buffers);
3251
3252/*
3253 * There are no bdflush tunables left.  But distributions are
3254 * still running obsolete flush daemons, so we terminate them here.
3255 *
3256 * Use of bdflush() is deprecated and will be removed in a future kernel.
3257 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3258 */
3259SYSCALL_DEFINE2(bdflush, int, func, long, data)
3260{
3261	static int msg_count;
3262
3263	if (!capable(CAP_SYS_ADMIN))
3264		return -EPERM;
3265
3266	if (msg_count < 5) {
3267		msg_count++;
3268		printk(KERN_INFO
3269			"warning: process `%s' used the obsolete bdflush"
3270			" system call\n", current->comm);
3271		printk(KERN_INFO "Fix your initscripts?\n");
3272	}
3273
3274	if (func == 1)
3275		do_exit(0);
3276	return 0;
3277}
3278
3279/*
3280 * Buffer-head allocation
3281 */
3282static struct kmem_cache *bh_cachep __read_mostly;
3283
3284/*
3285 * Once the number of bh's in the machine exceeds this level, we start
3286 * stripping them in writeback.
3287 */
3288static unsigned long max_buffer_heads;
3289
3290int buffer_heads_over_limit;
3291
/* Per-CPU buffer_head bookkeeping (see recalc_bh_state()). */
struct bh_accounting {
	/* Number of live bh's */
	int nr;
	/* Call counter used to limit cacheline bouncing */
	int ratelimit;
};
3296
3297static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3298
3299static void recalc_bh_state(void)
3300{
3301	int i;
3302	int tot = 0;
3303
3304	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3305		return;
3306	__this_cpu_write(bh_accounting.ratelimit, 0);
3307	for_each_online_cpu(i)
3308		tot += per_cpu(bh_accounting, i).nr;
3309	buffer_heads_over_limit = (tot > max_buffer_heads);
3310}
3311
3312struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3313{
3314	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3315	if (ret) {
3316		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3317		preempt_disable();
3318		__this_cpu_inc(bh_accounting.nr);
3319		recalc_bh_state();
3320		preempt_enable();
3321	}
3322	return ret;
3323}
3324EXPORT_SYMBOL(alloc_buffer_head);
3325
3326void free_buffer_head(struct buffer_head *bh)
3327{
3328	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3329	kmem_cache_free(bh_cachep, bh);
3330	preempt_disable();
3331	__this_cpu_dec(bh_accounting.nr);
3332	recalc_bh_state();
3333	preempt_enable();
3334}
3335EXPORT_SYMBOL(free_buffer_head);
3336
3337static void buffer_exit_cpu(int cpu)
3338{
3339	int i;
3340	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3341
3342	for (i = 0; i < BH_LRU_SIZE; i++) {
3343		brelse(b->bhs[i]);
3344		b->bhs[i] = NULL;
3345	}
3346	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3347	per_cpu(bh_accounting, cpu).nr = 0;
3348}
3349
3350static int buffer_cpu_notify(struct notifier_block *self,
3351			      unsigned long action, void *hcpu)
3352{
3353	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3354		buffer_exit_cpu((unsigned long)hcpu);
3355	return NOTIFY_OK;
3356}
3357
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date (the buffer is left unlocked)
 * and false, with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no need to take the lock at all. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Re-check under the lock; keep it held if still not up-to-date. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3376
3377/**
3378 * bh_submit_read - Submit a locked buffer for reading
3379 * @bh: struct buffer_head
3380 *
3381 * Returns zero on success and -EIO on error.
3382 */
3383int bh_submit_read(struct buffer_head *bh)
3384{
3385	BUG_ON(!buffer_locked(bh));
3386
3387	if (buffer_uptodate(bh)) {
3388		unlock_buffer(bh);
3389		return 0;
3390	}
3391
3392	get_bh(bh);
3393	bh->b_end_io = end_buffer_read_sync;
3394	submit_bh(READ, bh);
3395	wait_on_buffer(bh);
3396	if (buffer_uptodate(bh))
3397		return 0;
3398	return -EIO;
3399}
3400EXPORT_SYMBOL(bh_submit_read);
3401
3402void __init buffer_init(void)
3403{
3404	unsigned long nrpages;
3405
3406	bh_cachep = kmem_cache_create("buffer_head",
3407			sizeof(struct buffer_head), 0,
3408				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3409				SLAB_MEM_SPREAD),
3410				NULL);
3411
3412	/*
3413	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3414	 */
3415	nrpages = (nr_free_buffer_pages() * 10) / 100;
3416	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3417	hotcpu_notifier(buffer_cpu_notify, 0);
3418}

   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/module.h>
 
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <linux/bitops.h>
  42#include <linux/mpage.h>
  43#include <linux/bit_spinlock.h>
  44#include <linux/cleancache.h>
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 
 
  47
  48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50inline void
  51init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  52{
  53	bh->b_end_io = handler;
  54	bh->b_private = private;
  55}
  56EXPORT_SYMBOL(init_buffer);
  57
  58static int sleep_on_buffer(void *word)
  59{
  60	io_schedule();
  61	return 0;
  62}
 
  63
  64void __lock_buffer(struct buffer_head *bh)
  65{
  66	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
  67							TASK_UNINTERRUPTIBLE);
  68}
  69EXPORT_SYMBOL(__lock_buffer);
  70
  71void unlock_buffer(struct buffer_head *bh)
  72{
  73	clear_bit_unlock(BH_Lock, &bh->b_state);
  74	smp_mb__after_clear_bit();
  75	wake_up_bit(&bh->b_state, BH_Lock);
  76}
  77EXPORT_SYMBOL(unlock_buffer);
  78
  79/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  80 * Block until a buffer comes unlocked.  This doesn't stop it
  81 * from becoming locked again - you have to lock it yourself
  82 * if you want to preserve its state.
  83 */
  84void __wait_on_buffer(struct buffer_head * bh)
  85{
  86	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
  87}
  88EXPORT_SYMBOL(__wait_on_buffer);
  89
  90static void
  91__clear_page_buffers(struct page *page)
  92{
  93	ClearPagePrivate(page);
  94	set_page_private(page, 0);
  95	page_cache_release(page);
  96}
  97
  98
  99static int quiet_error(struct buffer_head *bh)
 100{
 101	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
 102		return 0;
 103	return 1;
 104}
 105
 106
 107static void buffer_io_error(struct buffer_head *bh)
 108{
 109	char b[BDEVNAME_SIZE];
 110	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 111			bdevname(bh->b_bdev, b),
 112			(unsigned long long)bh->b_blocknr);
 113}
 114
 115/*
 116 * End-of-IO handler helper function which does not touch the bh after
 117 * unlocking it.
 118 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 119 * a race there is benign: unlock_buffer() only use the bh's address for
 120 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 121 * itself.
 122 */
 123static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 124{
 125	if (uptodate) {
 126		set_buffer_uptodate(bh);
 127	} else {
 128		/* This happens, due to failed READA attempts. */
 129		clear_buffer_uptodate(bh);
 130	}
 131	unlock_buffer(bh);
 132}
 133
 134/*
 135 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 136 * unlock the buffer. This is what ll_rw_block uses too.
 137 */
 138void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 139{
 140	__end_buffer_read_notouch(bh, uptodate);
 141	put_bh(bh);
 142}
 143EXPORT_SYMBOL(end_buffer_read_sync);
 144
 145void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 146{
 147	char b[BDEVNAME_SIZE];
 148
 149	if (uptodate) {
 150		set_buffer_uptodate(bh);
 151	} else {
 152		if (!quiet_error(bh)) {
 153			buffer_io_error(bh);
 154			printk(KERN_WARNING "lost page write due to "
 155					"I/O error on %s\n",
 156				       bdevname(bh->b_bdev, b));
 157		}
 158		set_buffer_write_io_error(bh);
 159		clear_buffer_uptodate(bh);
 160	}
 161	unlock_buffer(bh);
 162	put_bh(bh);
 163}
 164EXPORT_SYMBOL(end_buffer_write_sync);
 165
 166/*
 167 * Various filesystems appear to want __find_get_block to be non-blocking.
 168 * But it's the page lock which protects the buffers.  To get around this,
 169 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 170 * private_lock.
 171 *
 172 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 173 * may be quite high.  This code could TryLock the page, and if that
 174 * succeeds, there is no need to take private_lock. (But if
 175 * private_lock is contended then so is mapping->tree_lock).
 176 */
 177static struct buffer_head *
 178__find_get_block_slow(struct block_device *bdev, sector_t block)
 179{
 180	struct inode *bd_inode = bdev->bd_inode;
 181	struct address_space *bd_mapping = bd_inode->i_mapping;
 182	struct buffer_head *ret = NULL;
 183	pgoff_t index;
 184	struct buffer_head *bh;
 185	struct buffer_head *head;
 186	struct page *page;
 187	int all_mapped = 1;
 188
 189	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 190	page = find_get_page(bd_mapping, index);
 191	if (!page)
 192		goto out;
 193
 194	spin_lock(&bd_mapping->private_lock);
 195	if (!page_has_buffers(page))
 196		goto out_unlock;
 197	head = page_buffers(page);
 198	bh = head;
 199	do {
 200		if (!buffer_mapped(bh))
 201			all_mapped = 0;
 202		else if (bh->b_blocknr == block) {
 203			ret = bh;
 204			get_bh(bh);
 205			goto out_unlock;
 206		}
 207		bh = bh->b_this_page;
 208	} while (bh != head);
 209
 210	/* we might be here because some of the buffers on this page are
 211	 * not mapped.  This is due to various races between
 212	 * file io on the block device and getblk.  It gets dealt with
 213	 * elsewhere, don't buffer_error if we had some unmapped buffers
 214	 */
 215	if (all_mapped) {
 216		printk("__find_get_block_slow() failed. "
 217			"block=%llu, b_blocknr=%llu\n",
 218			(unsigned long long)block,
 219			(unsigned long long)bh->b_blocknr);
 220		printk("b_state=0x%08lx, b_size=%zu\n",
 221			bh->b_state, bh->b_size);
 222		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 
 223	}
 224out_unlock:
 225	spin_unlock(&bd_mapping->private_lock);
 226	page_cache_release(page);
 227out:
 228	return ret;
 229}
 230
 231/* If invalidate_buffers() will trash dirty buffers, it means some kind
 232   of fs corruption is going on. Trashing dirty data always imply losing
 233   information that was supposed to be just stored on the physical layer
 234   by the user.
 235
 236   Thus invalidate_buffers in general usage is not allowed to trash
 237   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 238   be preserved.  These buffers are simply skipped.
 239  
 240   We also skip buffers which are still in use.  For example this can
 241   happen if a userspace program is reading the block device.
 242
 243   NOTE: In the case where the user removed a removable-media-disk even if
 244   there's still dirty data not synced on disk (due a bug in the device driver
 245   or due an error of the user), by not destroying the dirty buffers we could
 246   generate corruption also on the next media inserted, thus a parameter is
 247   necessary to handle this case in the most safe way possible (trying
 248   to not corrupt also the new disk inserted with the data belonging to
 249   the old now corrupted disk). Also for the ramdisk the natural thing
 250   to do in order to release the ramdisk memory is to destroy dirty buffers.
 251
 252   These are two special cases. Normal usage imply the device driver
 253   to issue a sync on the device (without waiting I/O completion) and
 254   then an invalidate_buffers call that doesn't trash dirty buffers.
 255
 256   For handling cache coherency with the blkdev pagecache the 'update' case
 257   has been introduced. It is needed to re-read from disk any pinned
 258   buffer. NOTE: re-reading from disk is destructive so we can do it only
 259   when we assume nobody is changing the buffercache under our I/O and when
 260   we think the disk contains more recent information than the buffercache.
 261   The update == 1 pass marks the buffers we need to update, the update == 2
 262   pass does the actual I/O. */
 263void invalidate_bdev(struct block_device *bdev)
 264{
 265	struct address_space *mapping = bdev->bd_inode->i_mapping;
 266
 267	if (mapping->nrpages == 0)
 268		return;
 269
 270	invalidate_bh_lrus();
 271	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 272	invalidate_mapping_pages(mapping, 0, -1);
 273	/* 99% of the time, we don't need to flush the cleancache on the bdev.
 274	 * But, for the strange corners, lets be cautious
 275	 */
 276	cleancache_flush_inode(mapping);
 277}
 278EXPORT_SYMBOL(invalidate_bdev);
 279
 280/*
 281 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 282 */
 283static void free_more_memory(void)
 284{
 285	struct zone *zone;
 286	int nid;
 287
 288	wakeup_flusher_threads(1024);
 289	yield();
 290
 291	for_each_online_node(nid) {
 292		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 293						gfp_zone(GFP_NOFS), NULL,
 294						&zone);
 295		if (zone)
 296			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 297						GFP_NOFS, NULL);
 298	}
 299}
 300
 301/*
 302 * I/O completion handler for block_read_full_page() - pages
 303 * which come unlocked at the end of I/O.
 304 */
 305static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 306{
 307	unsigned long flags;
 308	struct buffer_head *first;
 309	struct buffer_head *tmp;
 310	struct page *page;
 311	int page_uptodate = 1;
 312
 313	BUG_ON(!buffer_async_read(bh));
 314
 315	page = bh->b_page;
 316	if (uptodate) {
 317		set_buffer_uptodate(bh);
 318	} else {
 319		clear_buffer_uptodate(bh);
 320		if (!quiet_error(bh))
 321			buffer_io_error(bh);
 322		SetPageError(page);
 323	}
 324
 325	/*
 326	 * Be _very_ careful from here on. Bad things can happen if
 327	 * two buffer heads end IO at almost the same time and both
 328	 * decide that the page is now completely done.
 329	 */
 330	first = page_buffers(page);
 331	local_irq_save(flags);
 332	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 333	clear_buffer_async_read(bh);
 334	unlock_buffer(bh);
 335	tmp = bh;
 336	do {
 337		if (!buffer_uptodate(tmp))
 338			page_uptodate = 0;
 339		if (buffer_async_read(tmp)) {
 340			BUG_ON(!buffer_locked(tmp));
 341			goto still_busy;
 342		}
 343		tmp = tmp->b_this_page;
 344	} while (tmp != bh);
 345	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 346	local_irq_restore(flags);
 347
 348	/*
 349	 * If none of the buffers had errors and they are all
 350	 * uptodate then we can set the page uptodate.
 351	 */
 352	if (page_uptodate && !PageError(page))
 353		SetPageUptodate(page);
 354	unlock_page(page);
 355	return;
 356
 357still_busy:
 358	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 359	local_irq_restore(flags);
 360	return;
 361}
 362
 363/*
 364 * Completion handler for block_write_full_page() - pages which are unlocked
 365 * during I/O, and which have PageWriteback cleared upon I/O completion.
 366 */
 367void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 368{
 369	char b[BDEVNAME_SIZE];
 370	unsigned long flags;
 371	struct buffer_head *first;
 372	struct buffer_head *tmp;
 373	struct page *page;
 374
 375	BUG_ON(!buffer_async_write(bh));
 376
 377	page = bh->b_page;
 378	if (uptodate) {
 379		set_buffer_uptodate(bh);
 380	} else {
 381		if (!quiet_error(bh)) {
 382			buffer_io_error(bh);
 383			printk(KERN_WARNING "lost page write due to "
 384					"I/O error on %s\n",
 385			       bdevname(bh->b_bdev, b));
 386		}
 387		set_bit(AS_EIO, &page->mapping->flags);
 388		set_buffer_write_io_error(bh);
 389		clear_buffer_uptodate(bh);
 390		SetPageError(page);
 391	}
 392
 393	first = page_buffers(page);
 394	local_irq_save(flags);
 395	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 396
 397	clear_buffer_async_write(bh);
 398	unlock_buffer(bh);
 399	tmp = bh->b_this_page;
 400	while (tmp != bh) {
 401		if (buffer_async_write(tmp)) {
 402			BUG_ON(!buffer_locked(tmp));
 403			goto still_busy;
 404		}
 405		tmp = tmp->b_this_page;
 406	}
 407	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 408	local_irq_restore(flags);
 409	end_page_writeback(page);
 410	return;
 411
 412still_busy:
 413	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 414	local_irq_restore(flags);
 415	return;
 416}
 417EXPORT_SYMBOL(end_buffer_async_write);
 418
 419/*
 420 * If a page's buffers are under async readin (end_buffer_async_read
 421 * completion) then there is a possibility that another thread of
 422 * control could lock one of the buffers after it has completed
 423 * but while some of the other buffers have not completed.  This
 424 * locked buffer would confuse end_buffer_async_read() into not unlocking
 425 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 426 * that this buffer is not under async I/O.
 427 *
 428 * The page comes unlocked when it has no locked buffer_async buffers
 429 * left.
 430 *
 431 * PageLocked prevents anyone starting new async I/O reads against any of
 432 * the buffers.
 433 *
 434 * PageWriteback is used to prevent simultaneous writeout of the same
 435 * page.
 436 *
 437 * PageLocked prevents anyone from starting writeback of a page which is
 438 * under read I/O (PageWriteback is only ever set against a locked page).
 439 */
 440static void mark_buffer_async_read(struct buffer_head *bh)
 441{
 442	bh->b_end_io = end_buffer_async_read;
 443	set_buffer_async_read(bh);
 444}
 445
 446static void mark_buffer_async_write_endio(struct buffer_head *bh,
 447					  bh_end_io_t *handler)
 448{
 449	bh->b_end_io = handler;
 450	set_buffer_async_write(bh);
 451}
 452
 453void mark_buffer_async_write(struct buffer_head *bh)
 454{
 455	mark_buffer_async_write_endio(bh, end_buffer_async_write);
 456}
 457EXPORT_SYMBOL(mark_buffer_async_write);
 458
 459
 460/*
 461 * fs/buffer.c contains helper functions for buffer-backed address space's
 462 * fsync functions.  A common requirement for buffer-based filesystems is
 463 * that certain data from the backing blockdev needs to be written out for
 464 * a successful fsync().  For example, ext2 indirect blocks need to be
 465 * written back and waited upon before fsync() returns.
 466 *
 467 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 468 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 469 * management of a list of dependent buffers at ->i_mapping->private_list.
 470 *
 471 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 472 * from their controlling inode's queue when they are being freed.  But
 473 * try_to_free_buffers() will be operating against the *blockdev* mapping
 474 * at the time, not against the S_ISREG file which depends on those buffers.
 475 * So the locking for private_list is via the private_lock in the address_space
 476 * which backs the buffers.  Which is different from the address_space 
 477 * against which the buffers are listed.  So for a particular address_space,
 478 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 479 * mapping->private_list will always be protected by the backing blockdev's
 480 * ->private_lock.
 481 *
 482 * Which introduces a requirement: all buffers on an address_space's
 483 * ->private_list must be from the same address_space: the blockdev's.
 484 *
 485 * address_spaces which do not place buffers at ->private_list via these
 486 * utility functions are free to use private_lock and private_list for
 487 * whatever they want.  The only requirement is that list_empty(private_list)
 488 * be true at clear_inode() time.
 489 *
 490 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 491 * filesystems should do that.  invalidate_inode_buffers() should just go
 492 * BUG_ON(!list_empty).
 493 *
 494 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 495 * take an address_space, not an inode.  And it should be called
 496 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 497 * queued up.
 498 *
 499 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 500 * list if it is already on a list.  Because if the buffer is on a list,
 501 * it *must* already be on the right one.  If not, the filesystem is being
 502 * silly.  This will save a ton of locking.  But first we have to ensure
 503 * that buffers are taken *off* the old inode's list when they are freed
 504 * (presumably in truncate).  That requires careful auditing of all
 505 * filesystems (do it inside bforget()).  It could also be done by bringing
 506 * b_inode back.
 507 */
 508
 509/*
 510 * The buffer's backing address_space's private_lock must be held
 511 */
 512static void __remove_assoc_queue(struct buffer_head *bh)
 513{
 514	list_del_init(&bh->b_assoc_buffers);
 515	WARN_ON(!bh->b_assoc_map);
 516	if (buffer_write_io_error(bh))
 517		set_bit(AS_EIO, &bh->b_assoc_map->flags);
 518	bh->b_assoc_map = NULL;
 519}
 520
 521int inode_has_buffers(struct inode *inode)
 522{
 523	return !list_empty(&inode->i_data.private_list);
 524}
 525
 526/*
 527 * osync is designed to support O_SYNC io.  It waits synchronously for
 528 * all already-submitted IO to complete, but does not queue any new
 529 * writes to the disk.
 530 *
 531 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 532 * you dirty the buffers, and then use osync_inode_buffers to wait for
 533 * completion.  Any other dirty buffers which are not yet queued for
 534 * write will not be flushed to disk by the osync.
 535 */
 536static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 537{
 538	struct buffer_head *bh;
 539	struct list_head *p;
 540	int err = 0;
 541
 542	spin_lock(lock);
 543repeat:
 544	list_for_each_prev(p, list) {
 545		bh = BH_ENTRY(p);
 546		if (buffer_locked(bh)) {
 547			get_bh(bh);
 548			spin_unlock(lock);
 549			wait_on_buffer(bh);
 550			if (!buffer_uptodate(bh))
 551				err = -EIO;
 552			brelse(bh);
 553			spin_lock(lock);
 554			goto repeat;
 555		}
 556	}
 557	spin_unlock(lock);
 558	return err;
 559}
 560
 561static void do_thaw_one(struct super_block *sb, void *unused)
 562{
 563	char b[BDEVNAME_SIZE];
 564	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 565		printk(KERN_WARNING "Emergency Thaw on %s\n",
 566		       bdevname(sb->s_bdev, b));
 567}
 568
 569static void do_thaw_all(struct work_struct *work)
 570{
 571	iterate_supers(do_thaw_one, NULL);
 572	kfree(work);
 573	printk(KERN_WARNING "Emergency Thaw complete\n");
 574}
 575
 576/**
 577 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 578 *
 579 * Used for emergency unfreeze of all filesystems via SysRq
 580 */
 581void emergency_thaw_all(void)
 582{
 583	struct work_struct *work;
 584
 585	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 586	if (work) {
 587		INIT_WORK(work, do_thaw_all);
 588		schedule_work(work);
 589	}
 590}
 591
 592/**
 593 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 594 * @mapping: the mapping which wants those buffers written
 595 *
 596 * Starts I/O against the buffers at mapping->private_list, and waits upon
 597 * that I/O.
 598 *
 599 * Basically, this is a convenience function for fsync().
 600 * @mapping is a file or directory which needs those buffers to be written for
 601 * a successful fsync().
 602 */
 603int sync_mapping_buffers(struct address_space *mapping)
 604{
 605	struct address_space *buffer_mapping = mapping->assoc_mapping;
 606
 607	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 608		return 0;
 609
 610	return fsync_buffers_list(&buffer_mapping->private_lock,
 611					&mapping->private_list);
 612}
 613EXPORT_SYMBOL(sync_mapping_buffers);
 614
 615/*
 616 * Called when we've recently written block `bblock', and it is known that
 617 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 618 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 619 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 620 */
 621void write_boundary_block(struct block_device *bdev,
 622			sector_t bblock, unsigned blocksize)
 623{
 624	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 625	if (bh) {
 626		if (buffer_dirty(bh))
 627			ll_rw_block(WRITE, 1, &bh);
 628		put_bh(bh);
 629	}
 630}
 631
 632void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 633{
 634	struct address_space *mapping = inode->i_mapping;
 635	struct address_space *buffer_mapping = bh->b_page->mapping;
 636
 637	mark_buffer_dirty(bh);
 638	if (!mapping->assoc_mapping) {
 639		mapping->assoc_mapping = buffer_mapping;
 640	} else {
 641		BUG_ON(mapping->assoc_mapping != buffer_mapping);
 642	}
 643	if (!bh->b_assoc_map) {
 644		spin_lock(&buffer_mapping->private_lock);
 645		list_move_tail(&bh->b_assoc_buffers,
 646				&mapping->private_list);
 647		bh->b_assoc_map = mapping;
 648		spin_unlock(&buffer_mapping->private_lock);
 649	}
 650}
 651EXPORT_SYMBOL(mark_buffer_dirty_inode);
 652
 653/*
 654 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 655 * dirty.
 656 *
 657 * If warn is true, then emit a warning if the page is not uptodate and has
 658 * not been truncated.
 
 
 659 */
 660static void __set_page_dirty(struct page *page,
 661		struct address_space *mapping, int warn)
 662{
 663	spin_lock_irq(&mapping->tree_lock);
 
 
 664	if (page->mapping) {	/* Race with truncate? */
 665		WARN_ON_ONCE(warn && !PageUptodate(page));
 666		account_page_dirtied(page, mapping);
 667		radix_tree_tag_set(&mapping->page_tree,
 668				page_index(page), PAGECACHE_TAG_DIRTY);
 669	}
 670	spin_unlock_irq(&mapping->tree_lock);
 671	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 672}
 673
 674/*
 675 * Add a page to the dirty page list.
 676 *
 677 * It is a sad fact of life that this function is called from several places
 678 * deeply under spinlocking.  It may not sleep.
 679 *
 680 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 681 * dirty-state coherency between the page and the buffers.  If the page does
 682 * not have buffers then when they are later attached they will all be set
 683 * dirty.
 684 *
 685 * The buffers are dirtied before the page is dirtied.  There's a small race
 686 * window in which a writepage caller may see the page cleanness but not the
 687 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 688 * before the buffers, a concurrent writepage caller could clear the page dirty
 689 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 690 * page on the dirty page list.
 691 *
 692 * We use private_lock to lock against try_to_free_buffers while using the
 693 * page's buffer list.  Also use this to protect against clean buffers being
 694 * added to the page after it was set dirty.
 695 *
 696 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 697 * address_space though.
 698 */
 699int __set_page_dirty_buffers(struct page *page)
 700{
 701	int newly_dirty;
 702	struct address_space *mapping = page_mapping(page);
 703
 704	if (unlikely(!mapping))
 705		return !TestSetPageDirty(page);
 706
 707	spin_lock(&mapping->private_lock);
 708	if (page_has_buffers(page)) {
 709		struct buffer_head *head = page_buffers(page);
 710		struct buffer_head *bh = head;
 711
 712		do {
 713			set_buffer_dirty(bh);
 714			bh = bh->b_this_page;
 715		} while (bh != head);
 716	}
 
 
 
 
 
 717	newly_dirty = !TestSetPageDirty(page);
 718	spin_unlock(&mapping->private_lock);
 719
 720	if (newly_dirty)
 721		__set_page_dirty(page, mapping, 1);
 
 
 
 
 
 
 722	return newly_dirty;
 723}
 724EXPORT_SYMBOL(__set_page_dirty_buffers);
 725
 726/*
 727 * Write out and wait upon a list of buffers.
 728 *
 729 * We have conflicting pressures: we want to make sure that all
 730 * initially dirty buffers get waited on, but that any subsequently
 731 * dirtied buffers don't.  After all, we don't want fsync to last
 732 * forever if somebody is actively writing to the file.
 733 *
 734 * Do this in two main stages: first we copy dirty buffers to a
 735 * temporary inode list, queueing the writes as we go.  Then we clean
 736 * up, waiting for those writes to complete.
 737 * 
 738 * During this second stage, any subsequent updates to the file may end
 739 * up refiling the buffer on the original inode's dirty list again, so
 740 * there is a chance we will end up with a buffer queued for write but
 741 * not yet completed on that list.  So, as a final cleanup we go through
 742 * the osync code to catch these locked, dirty buffers without requeuing
 743 * any newly dirty buffers for write.
 744 */
 745static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 746{
 747	struct buffer_head *bh;
 748	struct list_head tmp;
 749	struct address_space *mapping;
 750	int err = 0, err2;
 751	struct blk_plug plug;
 752
 753	INIT_LIST_HEAD(&tmp);
 754	blk_start_plug(&plug);
 755
 756	spin_lock(lock);
 757	while (!list_empty(list)) {
 758		bh = BH_ENTRY(list->next);
 759		mapping = bh->b_assoc_map;
 760		__remove_assoc_queue(bh);
 761		/* Avoid race with mark_buffer_dirty_inode() which does
 762		 * a lockless check and we rely on seeing the dirty bit */
 763		smp_mb();
 764		if (buffer_dirty(bh) || buffer_locked(bh)) {
 765			list_add(&bh->b_assoc_buffers, &tmp);
 766			bh->b_assoc_map = mapping;
 767			if (buffer_dirty(bh)) {
 768				get_bh(bh);
 769				spin_unlock(lock);
 770				/*
 771				 * Ensure any pending I/O completes so that
 772				 * write_dirty_buffer() actually writes the
 773				 * current contents - it is a noop if I/O is
 774				 * still in flight on potentially older
 775				 * contents.
 776				 */
 777				write_dirty_buffer(bh, WRITE_SYNC);
 778
 779				/*
 780				 * Kick off IO for the previous mapping. Note
 781				 * that we will not run the very last mapping,
 782				 * wait_on_buffer() will do that for us
 783				 * through sync_buffer().
 784				 */
 785				brelse(bh);
 786				spin_lock(lock);
 787			}
 788		}
 789	}
 790
 791	spin_unlock(lock);
 792	blk_finish_plug(&plug);
 793	spin_lock(lock);
 794
 795	while (!list_empty(&tmp)) {
 796		bh = BH_ENTRY(tmp.prev);
 797		get_bh(bh);
 798		mapping = bh->b_assoc_map;
 799		__remove_assoc_queue(bh);
 800		/* Avoid race with mark_buffer_dirty_inode() which does
 801		 * a lockless check and we rely on seeing the dirty bit */
 802		smp_mb();
 803		if (buffer_dirty(bh)) {
 804			list_add(&bh->b_assoc_buffers,
 805				 &mapping->private_list);
 806			bh->b_assoc_map = mapping;
 807		}
 808		spin_unlock(lock);
 809		wait_on_buffer(bh);
 810		if (!buffer_uptodate(bh))
 811			err = -EIO;
 812		brelse(bh);
 813		spin_lock(lock);
 814	}
 815	
 816	spin_unlock(lock);
 817	err2 = osync_buffers_list(lock, list);
 818	if (err)
 819		return err;
 820	else
 821		return err2;
 822}
 823
 824/*
 825 * Invalidate any and all dirty buffers on a given inode.  We are
 826 * probably unmounting the fs, but that doesn't mean we have already
 827 * done a sync().  Just drop the buffers from the inode list.
 828 *
 829 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 830 * assumes that all the buffers are against the blockdev.  Not true
 831 * for reiserfs.
 832 */
 833void invalidate_inode_buffers(struct inode *inode)
 834{
 835	if (inode_has_buffers(inode)) {
 836		struct address_space *mapping = &inode->i_data;
 837		struct list_head *list = &mapping->private_list;
 838		struct address_space *buffer_mapping = mapping->assoc_mapping;
 839
 840		spin_lock(&buffer_mapping->private_lock);
 841		while (!list_empty(list))
 842			__remove_assoc_queue(BH_ENTRY(list->next));
 843		spin_unlock(&buffer_mapping->private_lock);
 844	}
 845}
 846EXPORT_SYMBOL(invalidate_inode_buffers);
 847
 848/*
 849 * Remove any clean buffers from the inode's buffer list.  This is called
 850 * when we're trying to free the inode itself.  Those buffers can pin it.
 851 *
 852 * Returns true if all buffers were removed.
 853 */
 854int remove_inode_buffers(struct inode *inode)
 855{
 856	int ret = 1;
 857
 858	if (inode_has_buffers(inode)) {
 859		struct address_space *mapping = &inode->i_data;
 860		struct list_head *list = &mapping->private_list;
 861		struct address_space *buffer_mapping = mapping->assoc_mapping;
 862
 863		spin_lock(&buffer_mapping->private_lock);
 864		while (!list_empty(list)) {
 865			struct buffer_head *bh = BH_ENTRY(list->next);
 866			if (buffer_dirty(bh)) {
 867				ret = 0;
 868				break;
 869			}
 870			__remove_assoc_queue(bh);
 871		}
 872		spin_unlock(&buffer_mapping->private_lock);
 873	}
 874	return ret;
 875}
 876
 877/*
 878 * Create the appropriate buffers when given a page for data area and
 879 * the size of each buffer.. Use the bh->b_this_page linked list to
 880 * follow the buffers created.  Return NULL if unable to create more
 881 * buffers.
 882 *
 883 * The retry flag is used to differentiate async IO (paging, swapping)
 884 * which may not fail from ordinary buffer allocations.
 885 */
 886struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 887		int retry)
 888{
 889	struct buffer_head *bh, *head;
 890	long offset;
 891
 892try_again:
 893	head = NULL;
 894	offset = PAGE_SIZE;
 895	while ((offset -= size) >= 0) {
 896		bh = alloc_buffer_head(GFP_NOFS);
 897		if (!bh)
 898			goto no_grow;
 899
 900		bh->b_bdev = NULL;
 901		bh->b_this_page = head;
 902		bh->b_blocknr = -1;
 903		head = bh;
 904
 905		bh->b_state = 0;
 906		atomic_set(&bh->b_count, 0);
 907		bh->b_size = size;
 908
 909		/* Link the buffer to its page */
 910		set_bh_page(bh, page, offset);
 911
 912		init_buffer(bh, NULL, NULL);
 913	}
 914	return head;
 915/*
 916 * In case anything failed, we just free everything we got.
 917 */
 918no_grow:
 919	if (head) {
 920		do {
 921			bh = head;
 922			head = head->b_this_page;
 923			free_buffer_head(bh);
 924		} while (head);
 925	}
 926
 927	/*
 928	 * Return failure for non-async IO requests.  Async IO requests
 929	 * are not allowed to fail, so we have to wait until buffer heads
 930	 * become available.  But we don't want tasks sleeping with 
 931	 * partially complete buffers, so all were released above.
 932	 */
 933	if (!retry)
 934		return NULL;
 935
 936	/* We're _really_ low on memory. Now we just
 937	 * wait for old buffer heads to become free due to
 938	 * finishing IO.  Since this is an async request and
 939	 * the reserve list is empty, we're sure there are 
 940	 * async buffer heads in use.
 941	 */
 942	free_more_memory();
 943	goto try_again;
 944}
 945EXPORT_SYMBOL_GPL(alloc_page_buffers);
 946
 947static inline void
 948link_dev_buffers(struct page *page, struct buffer_head *head)
 949{
 950	struct buffer_head *bh, *tail;
 951
 952	bh = head;
 953	do {
 954		tail = bh;
 955		bh = bh->b_this_page;
 956	} while (bh);
 957	tail->b_this_page = head;
 958	attach_page_buffers(page, head);
 959}
 960
 
 
 
 
 
 
 
 
 
 
 
 
 961/*
 962 * Initialise the state of a blockdev page's buffers.
 963 */ 
 964static void
 965init_page_buffers(struct page *page, struct block_device *bdev,
 966			sector_t block, int size)
 967{
 968	struct buffer_head *head = page_buffers(page);
 969	struct buffer_head *bh = head;
 970	int uptodate = PageUptodate(page);
 
 971
 972	do {
 973		if (!buffer_mapped(bh)) {
 974			init_buffer(bh, NULL, NULL);
 975			bh->b_bdev = bdev;
 976			bh->b_blocknr = block;
 977			if (uptodate)
 978				set_buffer_uptodate(bh);
 979			set_buffer_mapped(bh);
 
 980		}
 981		block++;
 982		bh = bh->b_this_page;
 983	} while (bh != head);
 
 
 
 
 
 984}
 985
 986/*
 987 * Create the page-cache page that contains the requested block.
 988 *
 989 * This is used purely for blockdev mappings.
 990 */
 991static struct page *
 992grow_dev_page(struct block_device *bdev, sector_t block,
 993		pgoff_t index, int size)
 994{
 995	struct inode *inode = bdev->bd_inode;
 996	struct page *page;
 997	struct buffer_head *bh;
 
 
 
 998
 999	page = find_or_create_page(inode->i_mapping, index,
1000		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 
 
 
 
 
 
 
 
 
1001	if (!page)
1002		return NULL;
1003
1004	BUG_ON(!PageLocked(page));
1005
1006	if (page_has_buffers(page)) {
1007		bh = page_buffers(page);
1008		if (bh->b_size == size) {
1009			init_page_buffers(page, bdev, block, size);
1010			return page;
 
 
1011		}
1012		if (!try_to_free_buffers(page))
1013			goto failed;
1014	}
1015
1016	/*
1017	 * Allocate some buffers for this page
1018	 */
1019	bh = alloc_page_buffers(page, size, 0);
1020	if (!bh)
1021		goto failed;
1022
1023	/*
1024	 * Link the page to the buffers and initialise them.  Take the
1025	 * lock to be atomic wrt __find_get_block(), which does not
1026	 * run under the page lock.
1027	 */
1028	spin_lock(&inode->i_mapping->private_lock);
1029	link_dev_buffers(page, bh);
1030	init_page_buffers(page, bdev, block, size);
 
1031	spin_unlock(&inode->i_mapping->private_lock);
1032	return page;
1033
1034failed:
1035	BUG();
1036	unlock_page(page);
1037	page_cache_release(page);
1038	return NULL;
1039}
1040
1041/*
1042 * Create buffers for the specified block device block's page.  If
1043 * that page was dirty, the buffers are set dirty also.
1044 */
1045static int
1046grow_buffers(struct block_device *bdev, sector_t block, int size)
1047{
1048	struct page *page;
1049	pgoff_t index;
1050	int sizebits;
1051
1052	sizebits = -1;
1053	do {
1054		sizebits++;
1055	} while ((size << sizebits) < PAGE_SIZE);
1056
1057	index = block >> sizebits;
1058
1059	/*
1060	 * Check for a block which wants to lie outside our maximum possible
1061	 * pagecache index.  (this comparison is done using sector_t types).
1062	 */
1063	if (unlikely(index != block >> sizebits)) {
1064		char b[BDEVNAME_SIZE];
1065
1066		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1067			"device %s\n",
1068			__func__, (unsigned long long)block,
1069			bdevname(bdev, b));
1070		return -EIO;
1071	}
1072	block = index << sizebits;
1073	/* Create a page with the proper size buffers.. */
1074	page = grow_dev_page(bdev, block, index, size);
1075	if (!page)
1076		return 0;
1077	unlock_page(page);
1078	page_cache_release(page);
1079	return 1;
1080}
1081
1082static struct buffer_head *
1083__getblk_slow(struct block_device *bdev, sector_t block, int size)
 
1084{
1085	/* Size must be multiple of hard sectorsize */
1086	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087			(size < 512 || size > PAGE_SIZE))) {
1088		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089					size);
1090		printk(KERN_ERR "logical block size: %d\n",
1091					bdev_logical_block_size(bdev));
1092
1093		dump_stack();
1094		return NULL;
1095	}
1096
1097	for (;;) {
1098		struct buffer_head * bh;
1099		int ret;
1100
1101		bh = __find_get_block(bdev, block, size);
1102		if (bh)
1103			return bh;
1104
1105		ret = grow_buffers(bdev, block, size);
1106		if (ret < 0)
1107			return NULL;
1108		if (ret == 0)
1109			free_more_memory();
1110	}
1111}
 
1112
1113/*
1114 * The relationship between dirty buffers and dirty pages:
1115 *
1116 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117 * the page is tagged dirty in its radix tree.
1118 *
1119 * At all times, the dirtiness of the buffers represents the dirtiness of
1120 * subsections of the page.  If the page has buffers, the page dirty bit is
1121 * merely a hint about the true dirty state.
1122 *
1123 * When a page is set dirty in its entirety, all its buffers are marked dirty
1124 * (if the page has buffers).
1125 *
1126 * When a buffer is marked dirty, its page is dirtied, but the page's other
1127 * buffers are not.
1128 *
1129 * Also.  When blockdev buffers are explicitly read with bread(), they
1130 * individually become uptodate.  But their backing page remains not
1131 * uptodate - even if all of its buffers are uptodate.  A subsequent
1132 * block_read_full_page() against that page will discover all the uptodate
1133 * buffers, will set the page uptodate and will perform no I/O.
1134 */
1135
1136/**
1137 * mark_buffer_dirty - mark a buffer_head as needing writeout
1138 * @bh: the buffer_head to mark dirty
1139 *
1140 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141 * backing page dirty, then tag the page as dirty in its address_space's radix
1142 * tree and then attach the address_space's inode to its superblock's dirty
1143 * inode list.
1144 *
1145 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1146 * mapping->tree_lock and mapping->host->i_lock.
1147 */
1148void mark_buffer_dirty(struct buffer_head *bh)
1149{
1150	WARN_ON_ONCE(!buffer_uptodate(bh));
1151
 
 
1152	/*
1153	 * Very *carefully* optimize the it-is-already-dirty case.
1154	 *
1155	 * Don't let the final "is it dirty" escape to before we
1156	 * perhaps modified the buffer.
1157	 */
1158	if (buffer_dirty(bh)) {
1159		smp_mb();
1160		if (buffer_dirty(bh))
1161			return;
1162	}
1163
1164	if (!test_set_buffer_dirty(bh)) {
1165		struct page *page = bh->b_page;
 
 
 
1166		if (!TestSetPageDirty(page)) {
1167			struct address_space *mapping = page_mapping(page);
1168			if (mapping)
1169				__set_page_dirty(page, mapping, 0);
1170		}
 
 
 
1171	}
1172}
1173EXPORT_SYMBOL(mark_buffer_dirty);
1174
1175/*
1176 * Decrement a buffer_head's reference count.  If all buffers against a page
1177 * have zero reference count, are clean and unlocked, and if the page is clean
1178 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1179 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1180 * a page but it ends up not being freed, and buffers may later be reattached).
1181 */
1182void __brelse(struct buffer_head * buf)
1183{
1184	if (atomic_read(&buf->b_count)) {
1185		put_bh(buf);
1186		return;
1187	}
1188	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1189}
1190EXPORT_SYMBOL(__brelse);
1191
1192/*
1193 * bforget() is like brelse(), except it discards any
1194 * potentially dirty data.
1195 */
1196void __bforget(struct buffer_head *bh)
1197{
1198	clear_buffer_dirty(bh);
1199	if (bh->b_assoc_map) {
1200		struct address_space *buffer_mapping = bh->b_page->mapping;
1201
1202		spin_lock(&buffer_mapping->private_lock);
1203		list_del_init(&bh->b_assoc_buffers);
1204		bh->b_assoc_map = NULL;
1205		spin_unlock(&buffer_mapping->private_lock);
1206	}
1207	__brelse(bh);
1208}
1209EXPORT_SYMBOL(__bforget);
1210
1211static struct buffer_head *__bread_slow(struct buffer_head *bh)
1212{
1213	lock_buffer(bh);
1214	if (buffer_uptodate(bh)) {
1215		unlock_buffer(bh);
1216		return bh;
1217	} else {
1218		get_bh(bh);
1219		bh->b_end_io = end_buffer_read_sync;
1220		submit_bh(READ, bh);
1221		wait_on_buffer(bh);
1222		if (buffer_uptodate(bh))
1223			return bh;
1224	}
1225	brelse(bh);
1226	return NULL;
1227}
1228
1229/*
1230 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1231 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1232 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1233 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1234 * CPU's LRUs at the same time.
1235 *
1236 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1237 * sb_find_get_block().
1238 *
1239 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1240 * a local interrupt disable for that.
1241 */
1242
1243#define BH_LRU_SIZE	8
1244
1245struct bh_lru {
1246	struct buffer_head *bhs[BH_LRU_SIZE];
1247};
1248
1249static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1250
1251#ifdef CONFIG_SMP
1252#define bh_lru_lock()	local_irq_disable()
1253#define bh_lru_unlock()	local_irq_enable()
1254#else
1255#define bh_lru_lock()	preempt_disable()
1256#define bh_lru_unlock()	preempt_enable()
1257#endif
1258
1259static inline void check_irqs_on(void)
1260{
1261#ifdef irqs_disabled
1262	BUG_ON(irqs_disabled());
1263#endif
1264}
1265
1266/*
1267 * The LRU management algorithm is dopey-but-simple.  Sorry.
1268 */
1269static void bh_lru_install(struct buffer_head *bh)
1270{
1271	struct buffer_head *evictee = NULL;
1272
1273	check_irqs_on();
1274	bh_lru_lock();
1275	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1276		struct buffer_head *bhs[BH_LRU_SIZE];
1277		int in;
1278		int out = 0;
1279
1280		get_bh(bh);
1281		bhs[out++] = bh;
1282		for (in = 0; in < BH_LRU_SIZE; in++) {
1283			struct buffer_head *bh2 =
1284				__this_cpu_read(bh_lrus.bhs[in]);
1285
1286			if (bh2 == bh) {
1287				__brelse(bh2);
1288			} else {
1289				if (out >= BH_LRU_SIZE) {
1290					BUG_ON(evictee != NULL);
1291					evictee = bh2;
1292				} else {
1293					bhs[out++] = bh2;
1294				}
1295			}
1296		}
1297		while (out < BH_LRU_SIZE)
1298			bhs[out++] = NULL;
1299		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1300	}
1301	bh_lru_unlock();
1302
1303	if (evictee)
1304		__brelse(evictee);
1305}
1306
1307/*
1308 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1309 */
1310static struct buffer_head *
1311lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1312{
1313	struct buffer_head *ret = NULL;
1314	unsigned int i;
1315
1316	check_irqs_on();
1317	bh_lru_lock();
1318	for (i = 0; i < BH_LRU_SIZE; i++) {
1319		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1320
1321		if (bh && bh->b_bdev == bdev &&
1322				bh->b_blocknr == block && bh->b_size == size) {
1323			if (i) {
1324				while (i) {
1325					__this_cpu_write(bh_lrus.bhs[i],
1326						__this_cpu_read(bh_lrus.bhs[i - 1]));
1327					i--;
1328				}
1329				__this_cpu_write(bh_lrus.bhs[0], bh);
1330			}
1331			get_bh(bh);
1332			ret = bh;
1333			break;
1334		}
1335	}
1336	bh_lru_unlock();
1337	return ret;
1338}
1339
1340/*
1341 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1342 * it in the LRU and mark it as accessed.  If it is not present then return
1343 * NULL
1344 */
1345struct buffer_head *
1346__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1347{
1348	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1349
1350	if (bh == NULL) {
 
1351		bh = __find_get_block_slow(bdev, block);
1352		if (bh)
1353			bh_lru_install(bh);
1354	}
1355	if (bh)
1356		touch_buffer(bh);
 
1357	return bh;
1358}
1359EXPORT_SYMBOL(__find_get_block);
1360
1361/*
1362 * __getblk will locate (and, if necessary, create) the buffer_head
1363 * which corresponds to the passed block_device, block and size. The
1364 * returned buffer has its reference count incremented.
1365 *
1366 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1367 * illegal block number, __getblk() will happily return a buffer_head
1368 * which represents the non-existent block.  Very weird.
1369 *
1370 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1371 * attempt is failing.  FIXME, perhaps?
1372 */
1373struct buffer_head *
1374__getblk(struct block_device *bdev, sector_t block, unsigned size)
 
1375{
1376	struct buffer_head *bh = __find_get_block(bdev, block, size);
1377
1378	might_sleep();
1379	if (bh == NULL)
1380		bh = __getblk_slow(bdev, block, size);
1381	return bh;
1382}
1383EXPORT_SYMBOL(__getblk);
1384
1385/*
1386 * Do async read-ahead on a buffer..
1387 */
1388void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1389{
1390	struct buffer_head *bh = __getblk(bdev, block, size);
1391	if (likely(bh)) {
1392		ll_rw_block(READA, 1, &bh);
1393		brelse(bh);
1394	}
1395}
1396EXPORT_SYMBOL(__breadahead);
1397
1398/**
1399 *  __bread() - reads a specified block and returns the bh
1400 *  @bdev: the block_device to read from
1401 *  @block: number of block
1402 *  @size: size (in bytes) to read
1403 * 
 
1404 *  Reads a specified block, and returns buffer head that contains it.
 
 
1405 *  It returns NULL if the block was unreadable.
1406 */
1407struct buffer_head *
1408__bread(struct block_device *bdev, sector_t block, unsigned size)
 
1409{
1410	struct buffer_head *bh = __getblk(bdev, block, size);
1411
1412	if (likely(bh) && !buffer_uptodate(bh))
1413		bh = __bread_slow(bh);
1414	return bh;
1415}
1416EXPORT_SYMBOL(__bread);
1417
1418/*
1419 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1420 * This doesn't race because it runs in each cpu either in irq
1421 * or with preempt disabled.
1422 */
1423static void invalidate_bh_lru(void *arg)
1424{
1425	struct bh_lru *b = &get_cpu_var(bh_lrus);
1426	int i;
1427
1428	for (i = 0; i < BH_LRU_SIZE; i++) {
1429		brelse(b->bhs[i]);
1430		b->bhs[i] = NULL;
1431	}
1432	put_cpu_var(bh_lrus);
1433}
 
 
 
 
 
1434	
 
 
 
 
 
 
 
 
1435void invalidate_bh_lrus(void)
1436{
1437	on_each_cpu(invalidate_bh_lru, NULL, 1);
1438}
1439EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1440
1441void set_bh_page(struct buffer_head *bh,
1442		struct page *page, unsigned long offset)
1443{
1444	bh->b_page = page;
1445	BUG_ON(offset >= PAGE_SIZE);
1446	if (PageHighMem(page))
1447		/*
1448		 * This catches illegal uses and preserves the offset:
1449		 */
1450		bh->b_data = (char *)(0 + offset);
1451	else
1452		bh->b_data = page_address(page) + offset;
1453}
1454EXPORT_SYMBOL(set_bh_page);
1455
1456/*
1457 * Called when truncating a buffer on a page completely.
1458 */
 
 
 
 
 
 
1459static void discard_buffer(struct buffer_head * bh)
1460{
 
 
1461	lock_buffer(bh);
1462	clear_buffer_dirty(bh);
1463	bh->b_bdev = NULL;
1464	clear_buffer_mapped(bh);
1465	clear_buffer_req(bh);
1466	clear_buffer_new(bh);
1467	clear_buffer_delay(bh);
1468	clear_buffer_unwritten(bh);
 
 
 
1469	unlock_buffer(bh);
1470}
1471
1472/**
1473 * block_invalidatepage - invalidate part or all of a buffer-backed page
1474 *
1475 * @page: the page which is affected
1476 * @offset: the index of the truncation point
 
1477 *
1478 * block_invalidatepage() is called when all or part of the page has become
1479 * invalidated by a truncate operation.
1480 *
1481 * block_invalidatepage() does not have to release all buffers, but it must
1482 * ensure that no dirty buffer is left outside @offset and that no I/O
1483 * is underway against any of the blocks which are outside the truncation
1484 * point.  Because the caller is about to free (and possibly reuse) those
1485 * blocks on-disk.
1486 */
1487void block_invalidatepage(struct page *page, unsigned long offset)
 
1488{
1489	struct buffer_head *head, *bh, *next;
1490	unsigned int curr_off = 0;
 
1491
1492	BUG_ON(!PageLocked(page));
1493	if (!page_has_buffers(page))
1494		goto out;
1495
 
 
 
 
 
1496	head = page_buffers(page);
1497	bh = head;
1498	do {
1499		unsigned int next_off = curr_off + bh->b_size;
1500		next = bh->b_this_page;
1501
1502		/*
 
 
 
 
 
 
1503		 * is this block fully invalidated?
1504		 */
1505		if (offset <= curr_off)
1506			discard_buffer(bh);
1507		curr_off = next_off;
1508		bh = next;
1509	} while (bh != head);
1510
1511	/*
1512	 * We release buffers only if the entire page is being invalidated.
1513	 * The get_block cached value has been unconditionally invalidated,
1514	 * so real IO is not possible anymore.
1515	 */
1516	if (offset == 0)
1517		try_to_release_page(page, 0);
1518out:
1519	return;
1520}
1521EXPORT_SYMBOL(block_invalidatepage);
1522
 
1523/*
1524 * We attach and possibly dirty the buffers atomically wrt
1525 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1526 * is already excluded via the page lock.
1527 */
1528void create_empty_buffers(struct page *page,
1529			unsigned long blocksize, unsigned long b_state)
1530{
1531	struct buffer_head *bh, *head, *tail;
1532
1533	head = alloc_page_buffers(page, blocksize, 1);
1534	bh = head;
1535	do {
1536		bh->b_state |= b_state;
1537		tail = bh;
1538		bh = bh->b_this_page;
1539	} while (bh);
1540	tail->b_this_page = head;
1541
1542	spin_lock(&page->mapping->private_lock);
1543	if (PageUptodate(page) || PageDirty(page)) {
1544		bh = head;
1545		do {
1546			if (PageDirty(page))
1547				set_buffer_dirty(bh);
1548			if (PageUptodate(page))
1549				set_buffer_uptodate(bh);
1550			bh = bh->b_this_page;
1551		} while (bh != head);
1552	}
1553	attach_page_buffers(page, head);
1554	spin_unlock(&page->mapping->private_lock);
1555}
1556EXPORT_SYMBOL(create_empty_buffers);
1557
1558/*
1559 * We are taking a block for data and we don't want any output from any
1560 * buffer-cache aliases starting from return from that function and
1561 * until the moment when something will explicitly mark the buffer
1562 * dirty (hopefully that will not happen until we will free that block ;-)
1563 * We don't even need to mark it not-uptodate - nobody can expect
1564 * anything from a newly allocated buffer anyway. We used to use
1565 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1566 * don't want to mark the alias unmapped, for example - it would confuse
1567 * anyone who might pick it with bread() afterwards...
1568 *
1569 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1570 * be writeout I/O going on against recently-freed buffers.  We don't
1571 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1572 * only if we really need to.  That happens here.
1573 */
1574void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1575{
1576	struct buffer_head *old_bh;
1577
1578	might_sleep();
1579
1580	old_bh = __find_get_block_slow(bdev, block);
1581	if (old_bh) {
1582		clear_buffer_dirty(old_bh);
1583		wait_on_buffer(old_bh);
1584		clear_buffer_req(old_bh);
1585		__brelse(old_bh);
1586	}
1587}
1588EXPORT_SYMBOL(unmap_underlying_metadata);
1589
1590/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1591 * NOTE! All mapped/uptodate combinations are valid:
1592 *
1593 *	Mapped	Uptodate	Meaning
1594 *
1595 *	No	No		"unknown" - must do get_block()
1596 *	No	Yes		"hole" - zero-filled
1597 *	Yes	No		"allocated" - allocated on disk, not read in
1598 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1599 *
1600 * "Dirty" is valid only with the last case (mapped+uptodate).
1601 */
1602
1603/*
1604 * While block_write_full_page is writing back the dirty buffers under
1605 * the page lock, whoever dirtied the buffers may decide to clean them
1606 * again at any time.  We handle that by only looking at the buffer
1607 * state inside lock_buffer().
1608 *
1609 * If block_write_full_page() is called for regular writeback
1610 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1611 * locked buffer.   This only can happen if someone has written the buffer
1612 * directly, with submit_bh().  At the address_space level PageWriteback
1613 * prevents this contention from occurring.
1614 *
1615 * If block_write_full_page() is called with wbc->sync_mode ==
1616 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1617 * causes the writes to be flagged as synchronous writes.
1618 */
1619static int __block_write_full_page(struct inode *inode, struct page *page,
1620			get_block_t *get_block, struct writeback_control *wbc,
1621			bh_end_io_t *handler)
1622{
1623	int err;
1624	sector_t block;
1625	sector_t last_block;
1626	struct buffer_head *bh, *head;
1627	const unsigned blocksize = 1 << inode->i_blkbits;
1628	int nr_underway = 0;
1629	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1630			WRITE_SYNC : WRITE);
1631
1632	BUG_ON(!PageLocked(page));
1633
1634	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1635
1636	if (!page_has_buffers(page)) {
1637		create_empty_buffers(page, blocksize,
1638					(1 << BH_Dirty)|(1 << BH_Uptodate));
1639	}
1640
1641	/*
1642	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1643	 * here, and the (potentially unmapped) buffers may become dirty at
1644	 * any time.  If a buffer becomes dirty here after we've inspected it
1645	 * then we just miss that fact, and the page stays dirty.
1646	 *
1647	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1648	 * handle that here by just cleaning them.
1649	 */
1650
1651	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1652	head = page_buffers(page);
1653	bh = head;
 
 
 
 
 
1654
1655	/*
1656	 * Get all the dirty buffers mapped to disk addresses and
1657	 * handle any aliases from the underlying blockdev's mapping.
1658	 */
1659	do {
1660		if (block > last_block) {
1661			/*
1662			 * mapped buffers outside i_size will occur, because
1663			 * this page can be outside i_size when there is a
1664			 * truncate in progress.
1665			 */
1666			/*
1667			 * The buffer was zeroed by block_write_full_page()
1668			 */
1669			clear_buffer_dirty(bh);
1670			set_buffer_uptodate(bh);
1671		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1672			   buffer_dirty(bh)) {
1673			WARN_ON(bh->b_size != blocksize);
1674			err = get_block(inode, block, bh, 1);
1675			if (err)
1676				goto recover;
1677			clear_buffer_delay(bh);
1678			if (buffer_new(bh)) {
1679				/* blockdev mappings never come here */
1680				clear_buffer_new(bh);
1681				unmap_underlying_metadata(bh->b_bdev,
1682							bh->b_blocknr);
1683			}
1684		}
1685		bh = bh->b_this_page;
1686		block++;
1687	} while (bh != head);
1688
1689	do {
1690		if (!buffer_mapped(bh))
1691			continue;
1692		/*
1693		 * If it's a fully non-blocking write attempt and we cannot
1694		 * lock the buffer then redirty the page.  Note that this can
1695		 * potentially cause a busy-wait loop from writeback threads
1696		 * and kswapd activity, but those code paths have their own
1697		 * higher-level throttling.
1698		 */
1699		if (wbc->sync_mode != WB_SYNC_NONE) {
1700			lock_buffer(bh);
1701		} else if (!trylock_buffer(bh)) {
1702			redirty_page_for_writepage(wbc, page);
1703			continue;
1704		}
1705		if (test_clear_buffer_dirty(bh)) {
1706			mark_buffer_async_write_endio(bh, handler);
1707		} else {
1708			unlock_buffer(bh);
1709		}
1710	} while ((bh = bh->b_this_page) != head);
1711
1712	/*
1713	 * The page and its buffers are protected by PageWriteback(), so we can
1714	 * drop the bh refcounts early.
1715	 */
1716	BUG_ON(PageWriteback(page));
1717	set_page_writeback(page);
1718
1719	do {
1720		struct buffer_head *next = bh->b_this_page;
1721		if (buffer_async_write(bh)) {
1722			submit_bh(write_op, bh);
1723			nr_underway++;
1724		}
1725		bh = next;
1726	} while (bh != head);
1727	unlock_page(page);
1728
1729	err = 0;
1730done:
1731	if (nr_underway == 0) {
1732		/*
1733		 * The page was marked dirty, but the buffers were
1734		 * clean.  Someone wrote them back by hand with
1735		 * ll_rw_block/submit_bh.  A rare case.
1736		 */
1737		end_page_writeback(page);
1738
1739		/*
1740		 * The page and buffer_heads can be released at any time from
1741		 * here on.
1742		 */
1743	}
1744	return err;
1745
1746recover:
1747	/*
1748	 * ENOSPC, or some other error.  We may already have added some
1749	 * blocks to the file, so we need to write these out to avoid
1750	 * exposing stale data.
1751	 * The page is currently locked and not marked for writeback
1752	 */
1753	bh = head;
1754	/* Recovery: lock and submit the mapped buffers */
1755	do {
1756		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1757		    !buffer_delay(bh)) {
1758			lock_buffer(bh);
1759			mark_buffer_async_write_endio(bh, handler);
1760		} else {
1761			/*
1762			 * The buffer may have been set dirty during
1763			 * attachment to a dirty page.
1764			 */
1765			clear_buffer_dirty(bh);
1766		}
1767	} while ((bh = bh->b_this_page) != head);
1768	SetPageError(page);
1769	BUG_ON(PageWriteback(page));
1770	mapping_set_error(page->mapping, err);
1771	set_page_writeback(page);
1772	do {
1773		struct buffer_head *next = bh->b_this_page;
1774		if (buffer_async_write(bh)) {
1775			clear_buffer_dirty(bh);
1776			submit_bh(write_op, bh);
1777			nr_underway++;
1778		}
1779		bh = next;
1780	} while (bh != head);
1781	unlock_page(page);
1782	goto done;
1783}
1784
1785/*
1786 * If a page has any new buffers, zero them out here, and mark them uptodate
1787 * and dirty so they'll be written out (in order to prevent uninitialised
1788 * block data from leaking). And clear the new bit.
1789 */
1790void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1791{
1792	unsigned int block_start, block_end;
1793	struct buffer_head *head, *bh;
1794
1795	BUG_ON(!PageLocked(page));
1796	if (!page_has_buffers(page))
1797		return;
1798
1799	bh = head = page_buffers(page);
1800	block_start = 0;
1801	do {
1802		block_end = block_start + bh->b_size;
1803
1804		if (buffer_new(bh)) {
1805			if (block_end > from && block_start < to) {
1806				if (!PageUptodate(page)) {
1807					unsigned start, size;
1808
1809					start = max(from, block_start);
1810					size = min(to, block_end) - start;
1811
1812					zero_user(page, start, size);
1813					set_buffer_uptodate(bh);
1814				}
1815
1816				clear_buffer_new(bh);
1817				mark_buffer_dirty(bh);
1818			}
1819		}
1820
1821		block_start = block_end;
1822		bh = bh->b_this_page;
1823	} while (bh != head);
1824}
1825EXPORT_SYMBOL(page_zero_new_buffers);
1826
1827int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1828		get_block_t *get_block)
1829{
1830	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1831	unsigned to = from + len;
1832	struct inode *inode = page->mapping->host;
1833	unsigned block_start, block_end;
1834	sector_t block;
1835	int err = 0;
1836	unsigned blocksize, bbits;
1837	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1838
1839	BUG_ON(!PageLocked(page));
1840	BUG_ON(from > PAGE_CACHE_SIZE);
1841	BUG_ON(to > PAGE_CACHE_SIZE);
1842	BUG_ON(from > to);
1843
1844	blocksize = 1 << inode->i_blkbits;
1845	if (!page_has_buffers(page))
1846		create_empty_buffers(page, blocksize, 0);
1847	head = page_buffers(page);
1848
1849	bbits = inode->i_blkbits;
1850	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1851
1852	for(bh = head, block_start = 0; bh != head || !block_start;
1853	    block++, block_start=block_end, bh = bh->b_this_page) {
1854		block_end = block_start + blocksize;
1855		if (block_end <= from || block_start >= to) {
1856			if (PageUptodate(page)) {
1857				if (!buffer_uptodate(bh))
1858					set_buffer_uptodate(bh);
1859			}
1860			continue;
1861		}
1862		if (buffer_new(bh))
1863			clear_buffer_new(bh);
1864		if (!buffer_mapped(bh)) {
1865			WARN_ON(bh->b_size != blocksize);
1866			err = get_block(inode, block, bh, 1);
1867			if (err)
1868				break;
1869			if (buffer_new(bh)) {
1870				unmap_underlying_metadata(bh->b_bdev,
1871							bh->b_blocknr);
1872				if (PageUptodate(page)) {
1873					clear_buffer_new(bh);
1874					set_buffer_uptodate(bh);
1875					mark_buffer_dirty(bh);
1876					continue;
1877				}
1878				if (block_end > to || block_start < from)
1879					zero_user_segments(page,
1880						to, block_end,
1881						block_start, from);
1882				continue;
1883			}
1884		}
1885		if (PageUptodate(page)) {
1886			if (!buffer_uptodate(bh))
1887				set_buffer_uptodate(bh);
1888			continue; 
1889		}
1890		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1891		    !buffer_unwritten(bh) &&
1892		     (block_start < from || block_end > to)) {
1893			ll_rw_block(READ, 1, &bh);
1894			*wait_bh++=bh;
1895		}
1896	}
1897	/*
1898	 * If we issued read requests - let them complete.
1899	 */
1900	while(wait_bh > wait) {
1901		wait_on_buffer(*--wait_bh);
1902		if (!buffer_uptodate(*wait_bh))
1903			err = -EIO;
1904	}
1905	if (unlikely(err))
1906		page_zero_new_buffers(page, from, to);
1907	return err;
1908}
1909EXPORT_SYMBOL(__block_write_begin);
1910
1911static int __block_commit_write(struct inode *inode, struct page *page,
1912		unsigned from, unsigned to)
1913{
1914	unsigned block_start, block_end;
1915	int partial = 0;
1916	unsigned blocksize;
1917	struct buffer_head *bh, *head;
1918
1919	blocksize = 1 << inode->i_blkbits;
 
1920
1921	for(bh = head = page_buffers(page), block_start = 0;
1922	    bh != head || !block_start;
1923	    block_start=block_end, bh = bh->b_this_page) {
1924		block_end = block_start + blocksize;
1925		if (block_end <= from || block_start >= to) {
1926			if (!buffer_uptodate(bh))
1927				partial = 1;
1928		} else {
1929			set_buffer_uptodate(bh);
1930			mark_buffer_dirty(bh);
1931		}
1932		clear_buffer_new(bh);
1933	}
 
 
 
1934
1935	/*
1936	 * If this is a partial write which happened to make all buffers
1937	 * uptodate then we can optimize away a bogus readpage() for
1938	 * the next read(). Here we 'discover' whether the page went
1939	 * uptodate as a result of this (potentially partial) write.
1940	 */
1941	if (!partial)
1942		SetPageUptodate(page);
1943	return 0;
1944}
1945
1946/*
1947 * block_write_begin takes care of the basic task of block allocation and
1948 * bringing partial write blocks uptodate first.
1949 *
1950 * The filesystem needs to handle block truncation upon failure.
1951 */
1952int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1953		unsigned flags, struct page **pagep, get_block_t *get_block)
1954{
1955	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1956	struct page *page;
1957	int status;
1958
1959	page = grab_cache_page_write_begin(mapping, index, flags);
1960	if (!page)
1961		return -ENOMEM;
1962
1963	status = __block_write_begin(page, pos, len, get_block);
1964	if (unlikely(status)) {
1965		unlock_page(page);
1966		page_cache_release(page);
1967		page = NULL;
1968	}
1969
1970	*pagep = page;
1971	return status;
1972}
1973EXPORT_SYMBOL(block_write_begin);
1974
1975int block_write_end(struct file *file, struct address_space *mapping,
1976			loff_t pos, unsigned len, unsigned copied,
1977			struct page *page, void *fsdata)
1978{
1979	struct inode *inode = mapping->host;
1980	unsigned start;
1981
1982	start = pos & (PAGE_CACHE_SIZE - 1);
1983
1984	if (unlikely(copied < len)) {
1985		/*
1986		 * The buffers that were written will now be uptodate, so we
1987		 * don't have to worry about a readpage reading them and
1988		 * overwriting a partial write. However if we have encountered
1989		 * a short write and only partially written into a buffer, it
1990		 * will not be marked uptodate, so a readpage might come in and
1991		 * destroy our partial write.
1992		 *
1993		 * Do the simplest thing, and just treat any short write to a
1994		 * non uptodate page as a zero-length write, and force the
1995		 * caller to redo the whole thing.
1996		 */
1997		if (!PageUptodate(page))
1998			copied = 0;
1999
2000		page_zero_new_buffers(page, start+copied, start+len);
2001	}
2002	flush_dcache_page(page);
2003
2004	/* This could be a short (even 0-length) commit */
2005	__block_commit_write(inode, page, start, start+copied);
2006
2007	return copied;
2008}
2009EXPORT_SYMBOL(block_write_end);
2010
2011int generic_write_end(struct file *file, struct address_space *mapping,
2012			loff_t pos, unsigned len, unsigned copied,
2013			struct page *page, void *fsdata)
2014{
2015	struct inode *inode = mapping->host;
 
2016	int i_size_changed = 0;
2017
2018	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2019
2020	/*
2021	 * No need to use i_size_read() here, the i_size
2022	 * cannot change under us because we hold i_mutex.
2023	 *
2024	 * But it's important to update i_size while still holding page lock:
2025	 * page writeout could otherwise come in and zero beyond i_size.
2026	 */
2027	if (pos+copied > inode->i_size) {
2028		i_size_write(inode, pos+copied);
2029		i_size_changed = 1;
2030	}
2031
2032	unlock_page(page);
2033	page_cache_release(page);
2034
 
 
2035	/*
2036	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2037	 * makes the holding time of page lock longer. Second, it forces lock
2038	 * ordering of page lock and transaction start for journaling
2039	 * filesystems.
2040	 */
2041	if (i_size_changed)
2042		mark_inode_dirty(inode);
2043
2044	return copied;
2045}
2046EXPORT_SYMBOL(generic_write_end);
2047
2048/*
2049 * block_is_partially_uptodate checks whether buffers within a page are
2050 * uptodate or not.
2051 *
2052 * Returns true if all buffers which correspond to a file portion
2053 * we want to read are uptodate.
2054 */
2055int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2056					unsigned long from)
2057{
2058	struct inode *inode = page->mapping->host;
2059	unsigned block_start, block_end, blocksize;
2060	unsigned to;
2061	struct buffer_head *bh, *head;
2062	int ret = 1;
2063
2064	if (!page_has_buffers(page))
2065		return 0;
2066
2067	blocksize = 1 << inode->i_blkbits;
2068	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
 
2069	to = from + to;
2070	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2071		return 0;
2072
2073	head = page_buffers(page);
2074	bh = head;
2075	block_start = 0;
2076	do {
2077		block_end = block_start + blocksize;
2078		if (block_end > from && block_start < to) {
2079			if (!buffer_uptodate(bh)) {
2080				ret = 0;
2081				break;
2082			}
2083			if (block_end >= to)
2084				break;
2085		}
2086		block_start = block_end;
2087		bh = bh->b_this_page;
2088	} while (bh != head);
2089
2090	return ret;
2091}
2092EXPORT_SYMBOL(block_is_partially_uptodate);
2093
2094/*
2095 * Generic "read page" function for block devices that have the normal
2096 * get_block functionality. This is most of the block device filesystems.
2097 * Reads the page asynchronously --- the unlock_buffer() and
2098 * set/clear_buffer_uptodate() functions propagate buffer state into the
2099 * page struct once IO has completed.
2100 */
2101int block_read_full_page(struct page *page, get_block_t *get_block)
2102{
2103	struct inode *inode = page->mapping->host;
2104	sector_t iblock, lblock;
2105	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2106	unsigned int blocksize;
2107	int nr, i;
2108	int fully_mapped = 1;
2109
2110	BUG_ON(!PageLocked(page));
2111	blocksize = 1 << inode->i_blkbits;
2112	if (!page_has_buffers(page))
2113		create_empty_buffers(page, blocksize, 0);
2114	head = page_buffers(page);
2115
2116	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2117	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2118	bh = head;
2119	nr = 0;
2120	i = 0;
2121
2122	do {
2123		if (buffer_uptodate(bh))
2124			continue;
2125
2126		if (!buffer_mapped(bh)) {
2127			int err = 0;
2128
2129			fully_mapped = 0;
2130			if (iblock < lblock) {
2131				WARN_ON(bh->b_size != blocksize);
2132				err = get_block(inode, iblock, bh, 0);
2133				if (err)
2134					SetPageError(page);
2135			}
2136			if (!buffer_mapped(bh)) {
2137				zero_user(page, i * blocksize, blocksize);
2138				if (!err)
2139					set_buffer_uptodate(bh);
2140				continue;
2141			}
2142			/*
2143			 * get_block() might have updated the buffer
2144			 * synchronously
2145			 */
2146			if (buffer_uptodate(bh))
2147				continue;
2148		}
2149		arr[nr++] = bh;
2150	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2151
2152	if (fully_mapped)
2153		SetPageMappedToDisk(page);
2154
2155	if (!nr) {
2156		/*
2157		 * All buffers are uptodate - we can set the page uptodate
2158		 * as well. But not if get_block() returned an error.
2159		 */
2160		if (!PageError(page))
2161			SetPageUptodate(page);
2162		unlock_page(page);
2163		return 0;
2164	}
2165
2166	/* Stage two: lock the buffers */
2167	for (i = 0; i < nr; i++) {
2168		bh = arr[i];
2169		lock_buffer(bh);
2170		mark_buffer_async_read(bh);
2171	}
2172
2173	/*
2174	 * Stage 3: start the IO.  Check for uptodateness
2175	 * inside the buffer lock in case another process reading
2176	 * the underlying blockdev brought it uptodate (the sct fix).
2177	 */
2178	for (i = 0; i < nr; i++) {
2179		bh = arr[i];
2180		if (buffer_uptodate(bh))
2181			end_buffer_async_read(bh, 1);
2182		else
2183			submit_bh(READ, bh);
2184	}
2185	return 0;
2186}
2187EXPORT_SYMBOL(block_read_full_page);
2188
2189/* utility function for filesystems that need to do work on expanding
2190 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2191 * deal with the hole.  
2192 */
2193int generic_cont_expand_simple(struct inode *inode, loff_t size)
2194{
2195	struct address_space *mapping = inode->i_mapping;
2196	struct page *page;
2197	void *fsdata;
2198	int err;
2199
2200	err = inode_newsize_ok(inode, size);
2201	if (err)
2202		goto out;
2203
2204	err = pagecache_write_begin(NULL, mapping, size, 0,
2205				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2206				&page, &fsdata);
2207	if (err)
2208		goto out;
2209
2210	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2211	BUG_ON(err > 0);
2212
2213out:
2214	return err;
2215}
2216EXPORT_SYMBOL(generic_cont_expand_simple);
2217
2218static int cont_expand_zero(struct file *file, struct address_space *mapping,
2219			    loff_t pos, loff_t *bytes)
2220{
2221	struct inode *inode = mapping->host;
2222	unsigned blocksize = 1 << inode->i_blkbits;
2223	struct page *page;
2224	void *fsdata;
2225	pgoff_t index, curidx;
2226	loff_t curpos;
2227	unsigned zerofrom, offset, len;
2228	int err = 0;
2229
2230	index = pos >> PAGE_CACHE_SHIFT;
2231	offset = pos & ~PAGE_CACHE_MASK;
2232
2233	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2234		zerofrom = curpos & ~PAGE_CACHE_MASK;
2235		if (zerofrom & (blocksize-1)) {
2236			*bytes |= (blocksize-1);
2237			(*bytes)++;
2238		}
2239		len = PAGE_CACHE_SIZE - zerofrom;
2240
2241		err = pagecache_write_begin(file, mapping, curpos, len,
2242						AOP_FLAG_UNINTERRUPTIBLE,
2243						&page, &fsdata);
2244		if (err)
2245			goto out;
2246		zero_user(page, zerofrom, len);
2247		err = pagecache_write_end(file, mapping, curpos, len, len,
2248						page, fsdata);
2249		if (err < 0)
2250			goto out;
2251		BUG_ON(err != len);
2252		err = 0;
2253
2254		balance_dirty_pages_ratelimited(mapping);
 
 
 
 
 
2255	}
2256
2257	/* page covers the boundary, find the boundary offset */
2258	if (index == curidx) {
2259		zerofrom = curpos & ~PAGE_CACHE_MASK;
2260		/* if we will expand the thing last block will be filled */
2261		if (offset <= zerofrom) {
2262			goto out;
2263		}
2264		if (zerofrom & (blocksize-1)) {
2265			*bytes |= (blocksize-1);
2266			(*bytes)++;
2267		}
2268		len = offset - zerofrom;
2269
2270		err = pagecache_write_begin(file, mapping, curpos, len,
2271						AOP_FLAG_UNINTERRUPTIBLE,
2272						&page, &fsdata);
2273		if (err)
2274			goto out;
2275		zero_user(page, zerofrom, len);
2276		err = pagecache_write_end(file, mapping, curpos, len, len,
2277						page, fsdata);
2278		if (err < 0)
2279			goto out;
2280		BUG_ON(err != len);
2281		err = 0;
2282	}
2283out:
2284	return err;
2285}
2286
2287/*
2288 * For moronic filesystems that do not allow holes in file.
2289 * We may have to extend the file.
2290 */
2291int cont_write_begin(struct file *file, struct address_space *mapping,
2292			loff_t pos, unsigned len, unsigned flags,
2293			struct page **pagep, void **fsdata,
2294			get_block_t *get_block, loff_t *bytes)
2295{
2296	struct inode *inode = mapping->host;
2297	unsigned blocksize = 1 << inode->i_blkbits;
2298	unsigned zerofrom;
2299	int err;
2300
2301	err = cont_expand_zero(file, mapping, pos, bytes);
2302	if (err)
2303		return err;
2304
2305	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2306	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2307		*bytes |= (blocksize-1);
2308		(*bytes)++;
2309	}
2310
2311	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2312}
2313EXPORT_SYMBOL(cont_write_begin);
2314
2315int block_commit_write(struct page *page, unsigned from, unsigned to)
2316{
2317	struct inode *inode = page->mapping->host;
2318	__block_commit_write(inode,page,from,to);
2319	return 0;
2320}
2321EXPORT_SYMBOL(block_commit_write);
2322
2323/*
2324 * block_page_mkwrite() is not allowed to change the file size as it gets
2325 * called from a page fault handler when a page is first dirtied. Hence we must
2326 * be careful to check for EOF conditions here. We set the page up correctly
2327 * for a written page which means we get ENOSPC checking when writing into
2328 * holes and correct delalloc and unwritten extent mapping on filesystems that
2329 * support these features.
2330 *
2331 * We are not allowed to take the i_mutex here so we have to play games to
2332 * protect against truncate races as the page could now be beyond EOF.  Because
2333 * truncate writes the inode size before removing pages, once we have the
2334 * page lock we can determine safely if the page is beyond EOF. If it is not
2335 * beyond EOF, then the page is guaranteed safe against truncation until we
2336 * unlock the page.
2337 *
2338 * Direct callers of this function should call vfs_check_frozen() so that page
2339 * fault does not busyloop until the fs is thawed.
2340 */
2341int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2342			 get_block_t get_block)
2343{
2344	struct page *page = vmf->page;
2345	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2346	unsigned long end;
2347	loff_t size;
2348	int ret;
2349
2350	lock_page(page);
2351	size = i_size_read(inode);
2352	if ((page->mapping != inode->i_mapping) ||
2353	    (page_offset(page) > size)) {
2354		/* We overload EFAULT to mean page got truncated */
2355		ret = -EFAULT;
2356		goto out_unlock;
2357	}
2358
2359	/* page is wholly or partially inside EOF */
2360	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2361		end = size & ~PAGE_CACHE_MASK;
2362	else
2363		end = PAGE_CACHE_SIZE;
2364
2365	ret = __block_write_begin(page, 0, end, get_block);
2366	if (!ret)
2367		ret = block_commit_write(page, 0, end);
2368
2369	if (unlikely(ret < 0))
2370		goto out_unlock;
2371	/*
2372	 * Freezing in progress? We check after the page is marked dirty and
2373	 * with page lock held so if the test here fails, we are sure freezing
2374	 * code will wait during syncing until the page fault is done - at that
2375	 * point page will be dirty and unlocked so freezing code will write it
2376	 * and writeprotect it again.
2377	 */
2378	set_page_dirty(page);
2379	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2380		ret = -EAGAIN;
2381		goto out_unlock;
2382	}
2383	wait_on_page_writeback(page);
2384	return 0;
2385out_unlock:
2386	unlock_page(page);
2387	return ret;
2388}
2389EXPORT_SYMBOL(__block_page_mkwrite);
2390
2391int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2392		   get_block_t get_block)
2393{
2394	int ret;
2395	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2396
2397	/*
2398	 * This check is racy but catches the common case. The check in
2399	 * __block_page_mkwrite() is reliable.
2400	 */
2401	vfs_check_frozen(sb, SB_FREEZE_WRITE);
2402	ret = __block_page_mkwrite(vma, vmf, get_block);
2403	return block_page_mkwrite_return(ret);
2404}
2405EXPORT_SYMBOL(block_page_mkwrite);
2406
/*
 * IO completion handler for the prereads issued by nobh_write_begin().
 *
 * Those buffer_heads are freed immediately, while still under the page lock,
 * so the handler must not touch the bh once it has been unlocked.  It simply
 * defers to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2416
2417/*
2418 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2419 * the page (converting it to circular linked list and taking care of page
2420 * dirty races).
2421 */
2422static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2423{
2424	struct buffer_head *bh;
2425
2426	BUG_ON(!PageLocked(page));
2427
2428	spin_lock(&page->mapping->private_lock);
2429	bh = head;
2430	do {
2431		if (PageDirty(page))
2432			set_buffer_dirty(bh);
2433		if (!bh->b_this_page)
2434			bh->b_this_page = head;
2435		bh = bh->b_this_page;
2436	} while (bh != head);
2437	attach_page_buffers(page, head);
2438	spin_unlock(&page->mapping->private_lock);
2439}
2440
2441/*
2442 * On entry, the page is fully not uptodate.
2443 * On exit the page is fully uptodate in the areas outside (from,to)
2444 * The filesystem needs to handle block truncation upon failure.
2445 */
2446int nobh_write_begin(struct address_space *mapping,
2447			loff_t pos, unsigned len, unsigned flags,
2448			struct page **pagep, void **fsdata,
2449			get_block_t *get_block)
2450{
2451	struct inode *inode = mapping->host;
2452	const unsigned blkbits = inode->i_blkbits;
2453	const unsigned blocksize = 1 << blkbits;
2454	struct buffer_head *head, *bh;
2455	struct page *page;
2456	pgoff_t index;
2457	unsigned from, to;
2458	unsigned block_in_page;
2459	unsigned block_start, block_end;
2460	sector_t block_in_file;
2461	int nr_reads = 0;
2462	int ret = 0;
2463	int is_mapped_to_disk = 1;
2464
2465	index = pos >> PAGE_CACHE_SHIFT;
2466	from = pos & (PAGE_CACHE_SIZE - 1);
2467	to = from + len;
2468
2469	page = grab_cache_page_write_begin(mapping, index, flags);
2470	if (!page)
2471		return -ENOMEM;
2472	*pagep = page;
2473	*fsdata = NULL;
2474
2475	if (page_has_buffers(page)) {
2476		ret = __block_write_begin(page, pos, len, get_block);
2477		if (unlikely(ret))
2478			goto out_release;
2479		return ret;
2480	}
2481
2482	if (PageMappedToDisk(page))
2483		return 0;
2484
2485	/*
2486	 * Allocate buffers so that we can keep track of state, and potentially
2487	 * attach them to the page if an error occurs. In the common case of
2488	 * no error, they will just be freed again without ever being attached
2489	 * to the page (which is all OK, because we're under the page lock).
2490	 *
2491	 * Be careful: the buffer linked list is a NULL terminated one, rather
2492	 * than the circular one we're used to.
2493	 */
2494	head = alloc_page_buffers(page, blocksize, 0);
2495	if (!head) {
2496		ret = -ENOMEM;
2497		goto out_release;
2498	}
2499
2500	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2501
2502	/*
2503	 * We loop across all blocks in the page, whether or not they are
2504	 * part of the affected region.  This is so we can discover if the
2505	 * page is fully mapped-to-disk.
2506	 */
2507	for (block_start = 0, block_in_page = 0, bh = head;
2508		  block_start < PAGE_CACHE_SIZE;
2509		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2510		int create;
2511
2512		block_end = block_start + blocksize;
2513		bh->b_state = 0;
2514		create = 1;
2515		if (block_start >= to)
2516			create = 0;
2517		ret = get_block(inode, block_in_file + block_in_page,
2518					bh, create);
2519		if (ret)
2520			goto failed;
2521		if (!buffer_mapped(bh))
2522			is_mapped_to_disk = 0;
2523		if (buffer_new(bh))
2524			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2525		if (PageUptodate(page)) {
2526			set_buffer_uptodate(bh);
2527			continue;
2528		}
2529		if (buffer_new(bh) || !buffer_mapped(bh)) {
2530			zero_user_segments(page, block_start, from,
2531							to, block_end);
2532			continue;
2533		}
2534		if (buffer_uptodate(bh))
2535			continue;	/* reiserfs does this */
2536		if (block_start < from || block_end > to) {
2537			lock_buffer(bh);
2538			bh->b_end_io = end_buffer_read_nobh;
2539			submit_bh(READ, bh);
2540			nr_reads++;
2541		}
2542	}
2543
2544	if (nr_reads) {
2545		/*
2546		 * The page is locked, so these buffers are protected from
2547		 * any VM or truncate activity.  Hence we don't need to care
2548		 * for the buffer_head refcounts.
2549		 */
2550		for (bh = head; bh; bh = bh->b_this_page) {
2551			wait_on_buffer(bh);
2552			if (!buffer_uptodate(bh))
2553				ret = -EIO;
2554		}
2555		if (ret)
2556			goto failed;
2557	}
2558
2559	if (is_mapped_to_disk)
2560		SetPageMappedToDisk(page);
2561
2562	*fsdata = head; /* to be released by nobh_write_end */
2563
2564	return 0;
2565
2566failed:
2567	BUG_ON(!ret);
2568	/*
2569	 * Error recovery is a bit difficult. We need to zero out blocks that
2570	 * were newly allocated, and dirty them to ensure they get written out.
2571	 * Buffers need to be attached to the page at this point, otherwise
2572	 * the handling of potential IO errors during writeout would be hard
2573	 * (could try doing synchronous writeout, but what if that fails too?)
2574	 */
2575	attach_nobh_buffers(page, head);
2576	page_zero_new_buffers(page, from, to);
2577
2578out_release:
2579	unlock_page(page);
2580	page_cache_release(page);
2581	*pagep = NULL;
2582
2583	return ret;
2584}
2585EXPORT_SYMBOL(nobh_write_begin);
2586
2587int nobh_write_end(struct file *file, struct address_space *mapping,
2588			loff_t pos, unsigned len, unsigned copied,
2589			struct page *page, void *fsdata)
2590{
2591	struct inode *inode = page->mapping->host;
2592	struct buffer_head *head = fsdata;
2593	struct buffer_head *bh;
2594	BUG_ON(fsdata != NULL && page_has_buffers(page));
2595
2596	if (unlikely(copied < len) && head)
2597		attach_nobh_buffers(page, head);
2598	if (page_has_buffers(page))
2599		return generic_write_end(file, mapping, pos, len,
2600					copied, page, fsdata);
2601
2602	SetPageUptodate(page);
2603	set_page_dirty(page);
2604	if (pos+copied > inode->i_size) {
2605		i_size_write(inode, pos+copied);
2606		mark_inode_dirty(inode);
2607	}
2608
2609	unlock_page(page);
2610	page_cache_release(page);
2611
2612	while (head) {
2613		bh = head;
2614		head = head->b_this_page;
2615		free_buffer_head(bh);
2616	}
2617
2618	return copied;
2619}
2620EXPORT_SYMBOL(nobh_write_end);
2621
2622/*
2623 * nobh_writepage() - based on block_full_write_page() except
2624 * that it tries to operate without attaching bufferheads to
2625 * the page.
2626 */
2627int nobh_writepage(struct page *page, get_block_t *get_block,
2628			struct writeback_control *wbc)
2629{
2630	struct inode * const inode = page->mapping->host;
2631	loff_t i_size = i_size_read(inode);
2632	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2633	unsigned offset;
2634	int ret;
2635
2636	/* Is the page fully inside i_size? */
2637	if (page->index < end_index)
2638		goto out;
2639
2640	/* Is the page fully outside i_size? (truncate in progress) */
2641	offset = i_size & (PAGE_CACHE_SIZE-1);
2642	if (page->index >= end_index+1 || !offset) {
2643		/*
2644		 * The page may have dirty, unmapped buffers.  For example,
2645		 * they may have been added in ext3_writepage().  Make them
2646		 * freeable here, so the page does not leak.
2647		 */
2648#if 0
2649		/* Not really sure about this  - do we need this ? */
2650		if (page->mapping->a_ops->invalidatepage)
2651			page->mapping->a_ops->invalidatepage(page, offset);
2652#endif
2653		unlock_page(page);
2654		return 0; /* don't care */
2655	}
2656
2657	/*
2658	 * The page straddles i_size.  It must be zeroed out on each and every
2659	 * writepage invocation because it may be mmapped.  "A file is mapped
2660	 * in multiples of the page size.  For a file that is not a multiple of
2661	 * the  page size, the remaining memory is zeroed when mapped, and
2662	 * writes to that region are not written out to the file."
2663	 */
2664	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2665out:
2666	ret = mpage_writepage(page, get_block, wbc);
2667	if (ret == -EAGAIN)
2668		ret = __block_write_full_page(inode, page, get_block, wbc,
2669					      end_buffer_async_write);
2670	return ret;
2671}
2672EXPORT_SYMBOL(nobh_writepage);
2673
2674int nobh_truncate_page(struct address_space *mapping,
2675			loff_t from, get_block_t *get_block)
2676{
2677	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2678	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2679	unsigned blocksize;
2680	sector_t iblock;
2681	unsigned length, pos;
2682	struct inode *inode = mapping->host;
2683	struct page *page;
2684	struct buffer_head map_bh;
2685	int err;
2686
2687	blocksize = 1 << inode->i_blkbits;
2688	length = offset & (blocksize - 1);
2689
2690	/* Block boundary? Nothing to do */
2691	if (!length)
2692		return 0;
2693
2694	length = blocksize - length;
2695	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2696
2697	page = grab_cache_page(mapping, index);
2698	err = -ENOMEM;
2699	if (!page)
2700		goto out;
2701
2702	if (page_has_buffers(page)) {
2703has_buffers:
2704		unlock_page(page);
2705		page_cache_release(page);
2706		return block_truncate_page(mapping, from, get_block);
2707	}
2708
2709	/* Find the buffer that contains "offset" */
2710	pos = blocksize;
2711	while (offset >= pos) {
2712		iblock++;
2713		pos += blocksize;
2714	}
2715
2716	map_bh.b_size = blocksize;
2717	map_bh.b_state = 0;
2718	err = get_block(inode, iblock, &map_bh, 0);
2719	if (err)
2720		goto unlock;
2721	/* unmapped? It's a hole - nothing to do */
2722	if (!buffer_mapped(&map_bh))
2723		goto unlock;
2724
2725	/* Ok, it's mapped. Make sure it's up-to-date */
2726	if (!PageUptodate(page)) {
2727		err = mapping->a_ops->readpage(NULL, page);
2728		if (err) {
2729			page_cache_release(page);
2730			goto out;
2731		}
2732		lock_page(page);
2733		if (!PageUptodate(page)) {
2734			err = -EIO;
2735			goto unlock;
2736		}
2737		if (page_has_buffers(page))
2738			goto has_buffers;
2739	}
2740	zero_user(page, offset, length);
2741	set_page_dirty(page);
2742	err = 0;
2743
2744unlock:
2745	unlock_page(page);
2746	page_cache_release(page);
2747out:
2748	return err;
2749}
2750EXPORT_SYMBOL(nobh_truncate_page);
2751
2752int block_truncate_page(struct address_space *mapping,
2753			loff_t from, get_block_t *get_block)
2754{
2755	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2756	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2757	unsigned blocksize;
2758	sector_t iblock;
2759	unsigned length, pos;
2760	struct inode *inode = mapping->host;
2761	struct page *page;
2762	struct buffer_head *bh;
2763	int err;
2764
2765	blocksize = 1 << inode->i_blkbits;
2766	length = offset & (blocksize - 1);
2767
2768	/* Block boundary? Nothing to do */
2769	if (!length)
2770		return 0;
2771
2772	length = blocksize - length;
2773	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2774	
2775	page = grab_cache_page(mapping, index);
2776	err = -ENOMEM;
2777	if (!page)
2778		goto out;
2779
2780	if (!page_has_buffers(page))
2781		create_empty_buffers(page, blocksize, 0);
2782
2783	/* Find the buffer that contains "offset" */
2784	bh = page_buffers(page);
2785	pos = blocksize;
2786	while (offset >= pos) {
2787		bh = bh->b_this_page;
2788		iblock++;
2789		pos += blocksize;
2790	}
2791
2792	err = 0;
2793	if (!buffer_mapped(bh)) {
2794		WARN_ON(bh->b_size != blocksize);
2795		err = get_block(inode, iblock, bh, 0);
2796		if (err)
2797			goto unlock;
2798		/* unmapped? It's a hole - nothing to do */
2799		if (!buffer_mapped(bh))
2800			goto unlock;
2801	}
2802
2803	/* Ok, it's mapped. Make sure it's up-to-date */
2804	if (PageUptodate(page))
2805		set_buffer_uptodate(bh);
2806
2807	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2808		err = -EIO;
2809		ll_rw_block(READ, 1, &bh);
2810		wait_on_buffer(bh);
2811		/* Uhhuh. Read error. Complain and punt. */
2812		if (!buffer_uptodate(bh))
2813			goto unlock;
2814	}
2815
2816	zero_user(page, offset, length);
2817	mark_buffer_dirty(bh);
2818	err = 0;
2819
2820unlock:
2821	unlock_page(page);
2822	page_cache_release(page);
2823out:
2824	return err;
2825}
2826EXPORT_SYMBOL(block_truncate_page);
2827
2828/*
2829 * The generic ->writepage function for buffer-backed address_spaces
2830 * this form passes in the end_io handler used to finish the IO.
2831 */
2832int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2833			struct writeback_control *wbc, bh_end_io_t *handler)
2834{
2835	struct inode * const inode = page->mapping->host;
2836	loff_t i_size = i_size_read(inode);
2837	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2838	unsigned offset;
2839
2840	/* Is the page fully inside i_size? */
2841	if (page->index < end_index)
2842		return __block_write_full_page(inode, page, get_block, wbc,
2843					       handler);
2844
2845	/* Is the page fully outside i_size? (truncate in progress) */
2846	offset = i_size & (PAGE_CACHE_SIZE-1);
2847	if (page->index >= end_index+1 || !offset) {
2848		/*
2849		 * The page may have dirty, unmapped buffers.  For example,
2850		 * they may have been added in ext3_writepage().  Make them
2851		 * freeable here, so the page does not leak.
2852		 */
2853		do_invalidatepage(page, 0);
2854		unlock_page(page);
2855		return 0; /* don't care */
2856	}
2857
2858	/*
2859	 * The page straddles i_size.  It must be zeroed out on each and every
2860	 * writepage invocation because it may be mmapped.  "A file is mapped
2861	 * in multiples of the page size.  For a file that is not a multiple of
2862	 * the  page size, the remaining memory is zeroed when mapped, and
2863	 * writes to that region are not written out to the file."
2864	 */
2865	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2866	return __block_write_full_page(inode, page, get_block, wbc, handler);
2867}
2868EXPORT_SYMBOL(block_write_full_page_endio);
2869
2870/*
2871 * The generic ->writepage function for buffer-backed address_spaces
2872 */
2873int block_write_full_page(struct page *page, get_block_t *get_block,
2874			struct writeback_control *wbc)
2875{
2876	return block_write_full_page_endio(page, get_block, wbc,
2877					   end_buffer_async_write);
2878}
2879EXPORT_SYMBOL(block_write_full_page);
2880
2881sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2882			    get_block_t *get_block)
2883{
2884	struct buffer_head tmp;
2885	struct inode *inode = mapping->host;
2886	tmp.b_state = 0;
2887	tmp.b_blocknr = 0;
2888	tmp.b_size = 1 << inode->i_blkbits;
2889	get_block(inode, block, &tmp, 0);
2890	return tmp.b_blocknr;
2891}
2892EXPORT_SYMBOL(generic_block_bmap);
2893
2894static void end_bio_bh_io_sync(struct bio *bio, int err)
2895{
2896	struct buffer_head *bh = bio->bi_private;
2897
2898	if (err == -EOPNOTSUPP) {
2899		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2900	}
2901
2902	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2903		set_bit(BH_Quiet, &bh->b_state);
2904
2905	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2906	bio_put(bio);
2907}
2908
2909int submit_bh(int rw, struct buffer_head * bh)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2910{
2911	struct bio *bio;
2912	int ret = 0;
2913
2914	BUG_ON(!buffer_locked(bh));
2915	BUG_ON(!buffer_mapped(bh));
2916	BUG_ON(!bh->b_end_io);
2917	BUG_ON(buffer_delay(bh));
2918	BUG_ON(buffer_unwritten(bh));
2919
2920	/*
2921	 * Only clear out a write error when rewriting
2922	 */
2923	if (test_set_buffer_req(bh) && (rw & WRITE))
2924		clear_buffer_write_io_error(bh);
2925
2926	/*
2927	 * from here on down, it's all bio -- do the initial mapping,
2928	 * submit_bio -> generic_make_request may further map this bio around
2929	 */
2930	bio = bio_alloc(GFP_NOIO, 1);
2931
2932	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 
 
 
 
 
2933	bio->bi_bdev = bh->b_bdev;
2934	bio->bi_io_vec[0].bv_page = bh->b_page;
2935	bio->bi_io_vec[0].bv_len = bh->b_size;
2936	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2937
2938	bio->bi_vcnt = 1;
2939	bio->bi_idx = 0;
2940	bio->bi_size = bh->b_size;
2941
2942	bio->bi_end_io = end_bio_bh_io_sync;
2943	bio->bi_private = bh;
 
 
 
 
 
 
 
 
 
2944
2945	bio_get(bio);
2946	submit_bio(rw, bio);
 
 
2947
2948	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2949		ret = -EOPNOTSUPP;
 
 
 
2950
2951	bio_put(bio);
2952	return ret;
 
2953}
2954EXPORT_SYMBOL(submit_bh);
2955
2956/**
2957 * ll_rw_block: low-level access to block devices (DEPRECATED)
2958 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2959 * @nr: number of &struct buffer_heads in the array
2960 * @bhs: array of pointers to &struct buffer_head
2961 *
2962 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2963 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2964 * %READA option is described in the documentation for generic_make_request()
2965 * which ll_rw_block() calls.
2966 *
2967 * This function drops any buffer that it cannot get a lock on (with the
2968 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2969 * request, and any buffer that appears to be up-to-date when doing read
2970 * request.  Further it marks as clean buffers that are processed for
2971 * writing (the buffer cache won't assume that they are actually clean
2972 * until the buffer gets unlocked).
2973 *
2974 * ll_rw_block sets b_end_io to simple completion handler that marks
2975 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2976 * any waiters. 
2977 *
2978 * All of the buffers must be for the same device, and must also be a
2979 * multiple of the current approved size for the device.
2980 */
2981void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2982{
2983	int i;
2984
2985	for (i = 0; i < nr; i++) {
2986		struct buffer_head *bh = bhs[i];
2987
2988		if (!trylock_buffer(bh))
2989			continue;
2990		if (rw == WRITE) {
2991			if (test_clear_buffer_dirty(bh)) {
2992				bh->b_end_io = end_buffer_write_sync;
2993				get_bh(bh);
2994				submit_bh(WRITE, bh);
2995				continue;
2996			}
2997		} else {
2998			if (!buffer_uptodate(bh)) {
2999				bh->b_end_io = end_buffer_read_sync;
3000				get_bh(bh);
3001				submit_bh(rw, bh);
3002				continue;
3003			}
3004		}
3005		unlock_buffer(bh);
3006	}
3007}
3008EXPORT_SYMBOL(ll_rw_block);
3009
3010void write_dirty_buffer(struct buffer_head *bh, int rw)
3011{
3012	lock_buffer(bh);
3013	if (!test_clear_buffer_dirty(bh)) {
3014		unlock_buffer(bh);
3015		return;
3016	}
3017	bh->b_end_io = end_buffer_write_sync;
3018	get_bh(bh);
3019	submit_bh(rw, bh);
3020}
3021EXPORT_SYMBOL(write_dirty_buffer);
3022
3023/*
3024 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3025 * and then start new I/O and then wait upon it.  The caller must have a ref on
3026 * the buffer_head.
3027 */
3028int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3029{
3030	int ret = 0;
3031
3032	WARN_ON(atomic_read(&bh->b_count) < 1);
3033	lock_buffer(bh);
3034	if (test_clear_buffer_dirty(bh)) {
3035		get_bh(bh);
3036		bh->b_end_io = end_buffer_write_sync;
3037		ret = submit_bh(rw, bh);
3038		wait_on_buffer(bh);
3039		if (!ret && !buffer_uptodate(bh))
3040			ret = -EIO;
3041	} else {
3042		unlock_buffer(bh);
3043	}
3044	return ret;
3045}
3046EXPORT_SYMBOL(__sync_dirty_buffer);
3047
3048int sync_dirty_buffer(struct buffer_head *bh)
3049{
3050	return __sync_dirty_buffer(bh, WRITE_SYNC);
3051}
3052EXPORT_SYMBOL(sync_dirty_buffer);
3053
3054/*
3055 * try_to_free_buffers() checks if all the buffers on this particular page
3056 * are unused, and releases them if so.
3057 *
3058 * Exclusion against try_to_free_buffers may be obtained by either
3059 * locking the page or by holding its mapping's private_lock.
3060 *
3061 * If the page is dirty but all the buffers are clean then we need to
3062 * be sure to mark the page clean as well.  This is because the page
3063 * may be against a block device, and a later reattachment of buffers
3064 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3065 * filesystem data on the same device.
3066 *
3067 * The same applies to regular filesystem pages: if all the buffers are
3068 * clean then we set the page clean and proceed.  To do that, we require
3069 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3070 * private_lock.
3071 *
3072 * try_to_free_buffers() is non-blocking.
3073 */
3074static inline int buffer_busy(struct buffer_head *bh)
3075{
3076	return atomic_read(&bh->b_count) |
3077		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3078}
3079
3080static int
3081drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3082{
3083	struct buffer_head *head = page_buffers(page);
3084	struct buffer_head *bh;
3085
3086	bh = head;
3087	do {
3088		if (buffer_write_io_error(bh) && page->mapping)
3089			set_bit(AS_EIO, &page->mapping->flags);
3090		if (buffer_busy(bh))
3091			goto failed;
3092		bh = bh->b_this_page;
3093	} while (bh != head);
3094
3095	do {
3096		struct buffer_head *next = bh->b_this_page;
3097
3098		if (bh->b_assoc_map)
3099			__remove_assoc_queue(bh);
3100		bh = next;
3101	} while (bh != head);
3102	*buffers_to_free = head;
3103	__clear_page_buffers(page);
3104	return 1;
3105failed:
3106	return 0;
3107}
3108
3109int try_to_free_buffers(struct page *page)
3110{
3111	struct address_space * const mapping = page->mapping;
3112	struct buffer_head *buffers_to_free = NULL;
3113	int ret = 0;
3114
3115	BUG_ON(!PageLocked(page));
3116	if (PageWriteback(page))
3117		return 0;
3118
3119	if (mapping == NULL) {		/* can this still happen? */
3120		ret = drop_buffers(page, &buffers_to_free);
3121		goto out;
3122	}
3123
3124	spin_lock(&mapping->private_lock);
3125	ret = drop_buffers(page, &buffers_to_free);
3126
3127	/*
3128	 * If the filesystem writes its buffers by hand (eg ext3)
3129	 * then we can have clean buffers against a dirty page.  We
3130	 * clean the page here; otherwise the VM will never notice
3131	 * that the filesystem did any IO at all.
3132	 *
3133	 * Also, during truncate, discard_buffer will have marked all
3134	 * the page's buffers clean.  We discover that here and clean
3135	 * the page also.
3136	 *
3137	 * private_lock must be held over this entire operation in order
3138	 * to synchronise against __set_page_dirty_buffers and prevent the
3139	 * dirty bit from being lost.
3140	 */
3141	if (ret)
3142		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3143	spin_unlock(&mapping->private_lock);
3144out:
3145	if (buffers_to_free) {
3146		struct buffer_head *bh = buffers_to_free;
3147
3148		do {
3149			struct buffer_head *next = bh->b_this_page;
3150			free_buffer_head(bh);
3151			bh = next;
3152		} while (bh != buffers_to_free);
3153	}
3154	return ret;
3155}
3156EXPORT_SYMBOL(try_to_free_buffers);
3157
3158/*
3159 * There are no bdflush tunables left.  But distributions are
3160 * still running obsolete flush daemons, so we terminate them here.
3161 *
3162 * Use of bdflush() is deprecated and will be removed in a future kernel.
3163 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3164 */
3165SYSCALL_DEFINE2(bdflush, int, func, long, data)
3166{
3167	static int msg_count;
3168
3169	if (!capable(CAP_SYS_ADMIN))
3170		return -EPERM;
3171
3172	if (msg_count < 5) {
3173		msg_count++;
3174		printk(KERN_INFO
3175			"warning: process `%s' used the obsolete bdflush"
3176			" system call\n", current->comm);
3177		printk(KERN_INFO "Fix your initscripts?\n");
3178	}
3179
3180	if (func == 1)
3181		do_exit(0);
3182	return 0;
3183}
3184
3185/*
3186 * Buffer-head allocation
3187 */
3188static struct kmem_cache *bh_cachep;
3189
3190/*
3191 * Once the number of bh's in the machine exceeds this level, we start
3192 * stripping them in writeback.
3193 */
3194static int max_buffer_heads;
3195
3196int buffer_heads_over_limit;
3197
3198struct bh_accounting {
3199	int nr;			/* Number of live bh's */
3200	int ratelimit;		/* Limit cacheline bouncing */
3201};
3202
3203static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3204
3205static void recalc_bh_state(void)
3206{
3207	int i;
3208	int tot = 0;
3209
3210	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3211		return;
3212	__this_cpu_write(bh_accounting.ratelimit, 0);
3213	for_each_online_cpu(i)
3214		tot += per_cpu(bh_accounting, i).nr;
3215	buffer_heads_over_limit = (tot > max_buffer_heads);
3216}
3217
3218struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3219{
3220	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3221	if (ret) {
3222		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3223		preempt_disable();
3224		__this_cpu_inc(bh_accounting.nr);
3225		recalc_bh_state();
3226		preempt_enable();
3227	}
3228	return ret;
3229}
3230EXPORT_SYMBOL(alloc_buffer_head);
3231
3232void free_buffer_head(struct buffer_head *bh)
3233{
3234	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3235	kmem_cache_free(bh_cachep, bh);
3236	preempt_disable();
3237	__this_cpu_dec(bh_accounting.nr);
3238	recalc_bh_state();
3239	preempt_enable();
3240}
3241EXPORT_SYMBOL(free_buffer_head);
3242
3243static void buffer_exit_cpu(int cpu)
3244{
3245	int i;
3246	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3247
3248	for (i = 0; i < BH_LRU_SIZE; i++) {
3249		brelse(b->bhs[i]);
3250		b->bhs[i] = NULL;
3251	}
3252	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3253	per_cpu(bh_accounting, cpu).nr = 0;
3254}
3255
3256static int buffer_cpu_notify(struct notifier_block *self,
3257			      unsigned long action, void *hcpu)
3258{
3259	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3260		buffer_exit_cpu((unsigned long)hcpu);
3261	return NOTIFY_OK;
3262}
3263
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date (the buffer is left unlocked),
 * and false, with the buffer locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no need to take the lock at all. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Check again now that the lock is held. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3282
3283/**
3284 * bh_submit_read - Submit a locked buffer for reading
3285 * @bh: struct buffer_head
3286 *
3287 * Returns zero on success and -EIO on error.
3288 */
3289int bh_submit_read(struct buffer_head *bh)
3290{
3291	BUG_ON(!buffer_locked(bh));
3292
3293	if (buffer_uptodate(bh)) {
3294		unlock_buffer(bh);
3295		return 0;
3296	}
3297
3298	get_bh(bh);
3299	bh->b_end_io = end_buffer_read_sync;
3300	submit_bh(READ, bh);
3301	wait_on_buffer(bh);
3302	if (buffer_uptodate(bh))
3303		return 0;
3304	return -EIO;
3305}
3306EXPORT_SYMBOL(bh_submit_read);
3307
3308void __init buffer_init(void)
3309{
3310	int nrpages;
3311
3312	bh_cachep = kmem_cache_create("buffer_head",
3313			sizeof(struct buffer_head), 0,
3314				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3315				SLAB_MEM_SPREAD),
3316				NULL);
3317
3318	/*
3319	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3320	 */
3321	nrpages = (nr_free_buffer_pages() * 10) / 100;
3322	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3323	hotcpu_notifier(buffer_cpu_notify, 0);
3324}