1/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/iomap.h>
25#include <linux/mm.h>
26#include <linux/percpu.h>
27#include <linux/slab.h>
28#include <linux/capability.h>
29#include <linux/blkdev.h>
30#include <linux/file.h>
31#include <linux/quotaops.h>
32#include <linux/highmem.h>
33#include <linux/export.h>
34#include <linux/backing-dev.h>
35#include <linux/writeback.h>
36#include <linux/hash.h>
37#include <linux/suspend.h>
38#include <linux/buffer_head.h>
39#include <linux/task_io_accounting_ops.h>
40#include <linux/bio.h>
41#include <linux/notifier.h>
42#include <linux/cpu.h>
43#include <linux/bitops.h>
44#include <linux/mpage.h>
45#include <linux/bit_spinlock.h>
46#include <linux/pagevec.h>
47#include <trace/events/block.h>
48
49static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
50static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
51 unsigned long bio_flags,
52 struct writeback_control *wbc);
53
54#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
55
56void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
57{
58 bh->b_end_io = handler;
59 bh->b_private = private;
60}
61EXPORT_SYMBOL(init_buffer);
62
63inline void touch_buffer(struct buffer_head *bh)
64{
65 trace_block_touch_buffer(bh);
66 mark_page_accessed(bh->b_page);
67}
68EXPORT_SYMBOL(touch_buffer);
69
70void __lock_buffer(struct buffer_head *bh)
71{
72 wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
73}
74EXPORT_SYMBOL(__lock_buffer);
75
76void unlock_buffer(struct buffer_head *bh)
77{
78 clear_bit_unlock(BH_Lock, &bh->b_state);
79 smp_mb__after_atomic();
80 wake_up_bit(&bh->b_state, BH_Lock);
81}
82EXPORT_SYMBOL(unlock_buffer);
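
/*
 * Illustrative sketch (not part of the original source): a caller that has
 * submitted I/O on a buffer, or expects someone else to, sleeps in
 * wait_on_buffer() until unlock_buffer() above wakes it. The function name
 * below is hypothetical.
 *
 *	static int example_read_and_wait(struct buffer_head *bh)
 *	{
 *		if (!buffer_uptodate(bh)) {
 *			// trylocks bh and submits the read if still needed
 *			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
 *			wait_on_buffer(bh);	// woken by unlock_buffer()
 *		}
 *		return buffer_uptodate(bh) ? 0 : -EIO;
 *	}
 */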
83
84/*
85 * Reports, via @dirty and @writeback, whether the page has dirty or writeback
86 * buffers. If all the buffers are unlocked and clean then the PageDirty
87 * information is stale. If any of the buffers are locked, it is assumed they are locked for IO.
88 */
89void buffer_check_dirty_writeback(struct page *page,
90 bool *dirty, bool *writeback)
91{
92 struct buffer_head *head, *bh;
93 *dirty = false;
94 *writeback = false;
95
96 BUG_ON(!PageLocked(page));
97
98 if (!page_has_buffers(page))
99 return;
100
101 if (PageWriteback(page))
102 *writeback = true;
103
104 head = page_buffers(page);
105 bh = head;
106 do {
107 if (buffer_locked(bh))
108 *writeback = true;
109
110 if (buffer_dirty(bh))
111 *dirty = true;
112
113 bh = bh->b_this_page;
114 } while (bh != head);
115}
116EXPORT_SYMBOL(buffer_check_dirty_writeback);
117
118/*
119 * Block until a buffer comes unlocked. This doesn't stop it
120 * from becoming locked again - you have to lock it yourself
121 * if you want to preserve its state.
122 */
123void __wait_on_buffer(struct buffer_head * bh)
124{
125 wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
126}
127EXPORT_SYMBOL(__wait_on_buffer);
128
129static void
130__clear_page_buffers(struct page *page)
131{
132 ClearPagePrivate(page);
133 set_page_private(page, 0);
134 put_page(page);
135}
136
137static void buffer_io_error(struct buffer_head *bh, char *msg)
138{
139 if (!test_bit(BH_Quiet, &bh->b_state))
140 printk_ratelimited(KERN_ERR
141 "Buffer I/O error on dev %pg, logical block %llu%s\n",
142 bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
143}
144
145/*
146 * End-of-IO handler helper function which does not touch the bh after
147 * unlocking it.
148 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
149 * a race there is benign: unlock_buffer() only uses the bh's address for
150 * hashing after unlocking the buffer, so it doesn't actually touch the bh
151 * itself.
152 */
153static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
154{
155 if (uptodate) {
156 set_buffer_uptodate(bh);
157 } else {
158 /* This happens, due to failed read-ahead attempts. */
159 clear_buffer_uptodate(bh);
160 }
161 unlock_buffer(bh);
162}
163
164/*
165 * Default synchronous end-of-IO handler. Just mark it up-to-date and
166 * unlock the buffer. This is what ll_rw_block uses too.
167 */
168void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
169{
170 __end_buffer_read_notouch(bh, uptodate);
171 put_bh(bh);
172}
173EXPORT_SYMBOL(end_buffer_read_sync);
174
175void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
176{
177 if (uptodate) {
178 set_buffer_uptodate(bh);
179 } else {
180 buffer_io_error(bh, ", lost sync page write");
181 set_buffer_write_io_error(bh);
182 clear_buffer_uptodate(bh);
183 }
184 unlock_buffer(bh);
185 put_bh(bh);
186}
187EXPORT_SYMBOL(end_buffer_write_sync);
188
189/*
190 * Various filesystems appear to want __find_get_block to be non-blocking.
191 * But it's the page lock which protects the buffers. To get around this,
192 * we get exclusion from try_to_free_buffers with the blockdev mapping's
193 * private_lock.
194 *
195 * Hack idea: for the blockdev mapping, private_lock contention
196 * may be quite high. This code could TryLock the page, and if that
197 * succeeds, there is no need to take private_lock. (But if
198 * private_lock is contended then so is mapping->tree_lock).
199 */
200static struct buffer_head *
201__find_get_block_slow(struct block_device *bdev, sector_t block)
202{
203 struct inode *bd_inode = bdev->bd_inode;
204 struct address_space *bd_mapping = bd_inode->i_mapping;
205 struct buffer_head *ret = NULL;
206 pgoff_t index;
207 struct buffer_head *bh;
208 struct buffer_head *head;
209 struct page *page;
210 int all_mapped = 1;
211
212 index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
213 page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
214 if (!page)
215 goto out;
216
217 spin_lock(&bd_mapping->private_lock);
218 if (!page_has_buffers(page))
219 goto out_unlock;
220 head = page_buffers(page);
221 bh = head;
222 do {
223 if (!buffer_mapped(bh))
224 all_mapped = 0;
225 else if (bh->b_blocknr == block) {
226 ret = bh;
227 get_bh(bh);
228 goto out_unlock;
229 }
230 bh = bh->b_this_page;
231 } while (bh != head);
232
233 /* we might be here because some of the buffers on this page are
234 * not mapped. This is due to various races between
235 * file io on the block device and getblk. It gets dealt with
236 * elsewhere, don't buffer_error if we had some unmapped buffers
237 */
238 if (all_mapped) {
239 printk("__find_get_block_slow() failed. "
240 "block=%llu, b_blocknr=%llu\n",
241 (unsigned long long)block,
242 (unsigned long long)bh->b_blocknr);
243 printk("b_state=0x%08lx, b_size=%zu\n",
244 bh->b_state, bh->b_size);
245 printk("device %pg blocksize: %d\n", bdev,
246 1 << bd_inode->i_blkbits);
247 }
248out_unlock:
249 spin_unlock(&bd_mapping->private_lock);
250 put_page(page);
251out:
252 return ret;
253}
254
255/*
256 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
257 */
258static void free_more_memory(void)
259{
260 struct zoneref *z;
261 int nid;
262
263 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
264 yield();
265
266 for_each_online_node(nid) {
267
268 z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
269 gfp_zone(GFP_NOFS), NULL);
270 if (z->zone)
271 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
272 GFP_NOFS, NULL);
273 }
274}
275
276/*
277 * I/O completion handler for block_read_full_page() - pages
278 * which come unlocked at the end of I/O.
279 */
280static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
281{
282 unsigned long flags;
283 struct buffer_head *first;
284 struct buffer_head *tmp;
285 struct page *page;
286 int page_uptodate = 1;
287
288 BUG_ON(!buffer_async_read(bh));
289
290 page = bh->b_page;
291 if (uptodate) {
292 set_buffer_uptodate(bh);
293 } else {
294 clear_buffer_uptodate(bh);
295 buffer_io_error(bh, ", async page read");
296 SetPageError(page);
297 }
298
299 /*
300 * Be _very_ careful from here on. Bad things can happen if
301 * two buffer heads end IO at almost the same time and both
302 * decide that the page is now completely done.
303 */
304 first = page_buffers(page);
305 local_irq_save(flags);
306 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
307 clear_buffer_async_read(bh);
308 unlock_buffer(bh);
309 tmp = bh;
310 do {
311 if (!buffer_uptodate(tmp))
312 page_uptodate = 0;
313 if (buffer_async_read(tmp)) {
314 BUG_ON(!buffer_locked(tmp));
315 goto still_busy;
316 }
317 tmp = tmp->b_this_page;
318 } while (tmp != bh);
319 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
320 local_irq_restore(flags);
321
322 /*
323 * If none of the buffers had errors and they are all
324 * uptodate then we can set the page uptodate.
325 */
326 if (page_uptodate && !PageError(page))
327 SetPageUptodate(page);
328 unlock_page(page);
329 return;
330
331still_busy:
332 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
333 local_irq_restore(flags);
334 return;
335}
336
337/*
338 * Completion handler for block_write_full_page() - pages which are unlocked
339 * during I/O, and which have PageWriteback cleared upon I/O completion.
340 */
341void end_buffer_async_write(struct buffer_head *bh, int uptodate)
342{
343 unsigned long flags;
344 struct buffer_head *first;
345 struct buffer_head *tmp;
346 struct page *page;
347
348 BUG_ON(!buffer_async_write(bh));
349
350 page = bh->b_page;
351 if (uptodate) {
352 set_buffer_uptodate(bh);
353 } else {
354 buffer_io_error(bh, ", lost async page write");
355 mapping_set_error(page->mapping, -EIO);
356 set_buffer_write_io_error(bh);
357 clear_buffer_uptodate(bh);
358 SetPageError(page);
359 }
360
361 first = page_buffers(page);
362 local_irq_save(flags);
363 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
364
365 clear_buffer_async_write(bh);
366 unlock_buffer(bh);
367 tmp = bh->b_this_page;
368 while (tmp != bh) {
369 if (buffer_async_write(tmp)) {
370 BUG_ON(!buffer_locked(tmp));
371 goto still_busy;
372 }
373 tmp = tmp->b_this_page;
374 }
375 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
376 local_irq_restore(flags);
377 end_page_writeback(page);
378 return;
379
380still_busy:
381 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
382 local_irq_restore(flags);
383 return;
384}
385EXPORT_SYMBOL(end_buffer_async_write);
386
387/*
388 * If a page's buffers are under async read-in (end_buffer_async_read
389 * completion) then there is a possibility that another thread of
390 * control could lock one of the buffers after it has completed
391 * but while some of the other buffers have not completed. This
392 * locked buffer would confuse end_buffer_async_read() into not unlocking
393 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
394 * that this buffer is not under async I/O.
395 *
396 * The page comes unlocked when it has no locked buffer_async buffers
397 * left.
398 *
399 * PageLocked prevents anyone from starting new async I/O reads against any of
400 * the buffers.
401 *
402 * PageWriteback is used to prevent simultaneous writeout of the same
403 * page.
404 *
405 * PageLocked prevents anyone from starting writeback of a page which is
406 * under read I/O (PageWriteback is only ever set against a locked page).
407 */
408static void mark_buffer_async_read(struct buffer_head *bh)
409{
410 bh->b_end_io = end_buffer_async_read;
411 set_buffer_async_read(bh);
412}
413
414static void mark_buffer_async_write_endio(struct buffer_head *bh,
415 bh_end_io_t *handler)
416{
417 bh->b_end_io = handler;
418 set_buffer_async_write(bh);
419}
420
421void mark_buffer_async_write(struct buffer_head *bh)
422{
423 mark_buffer_async_write_endio(bh, end_buffer_async_write);
424}
425EXPORT_SYMBOL(mark_buffer_async_write);
426
427
428/*
429 * fs/buffer.c contains helper functions for buffer-backed address space's
430 * fsync functions. A common requirement for buffer-based filesystems is
431 * that certain data from the backing blockdev needs to be written out for
432 * a successful fsync(). For example, ext2 indirect blocks need to be
433 * written back and waited upon before fsync() returns.
434 *
435 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
436 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
437 * management of a list of dependent buffers at ->i_mapping->private_list.
438 *
439 * Locking is a little subtle: try_to_free_buffers() will remove buffers
440 * from their controlling inode's queue when they are being freed. But
441 * try_to_free_buffers() will be operating against the *blockdev* mapping
442 * at the time, not against the S_ISREG file which depends on those buffers.
443 * So the locking for private_list is via the private_lock in the address_space
444 * which backs the buffers. Which is different from the address_space
445 * against which the buffers are listed. So for a particular address_space,
446 * mapping->private_lock does *not* protect mapping->private_list! In fact,
447 * mapping->private_list will always be protected by the backing blockdev's
448 * ->private_lock.
449 *
450 * Which introduces a requirement: all buffers on an address_space's
451 * ->private_list must be from the same address_space: the blockdev's.
452 *
453 * address_spaces which do not place buffers at ->private_list via these
454 * utility functions are free to use private_lock and private_list for
455 * whatever they want. The only requirement is that list_empty(private_list)
456 * be true at clear_inode() time.
457 *
458 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
459 * filesystems should do that. invalidate_inode_buffers() should just go
460 * BUG_ON(!list_empty).
461 *
462 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
463 * take an address_space, not an inode. And it should be called
464 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
465 * queued up.
466 *
467 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
468 * list if it is already on a list. Because if the buffer is on a list,
469 * it *must* already be on the right one. If not, the filesystem is being
470 * silly. This will save a ton of locking. But first we have to ensure
471 * that buffers are taken *off* the old inode's list when they are freed
472 * (presumably in truncate). That requires careful auditing of all
473 * filesystems (do it inside bforget()). It could also be done by bringing
474 * b_inode back.
475 */
476
477/*
478 * The buffer's backing address_space's private_lock must be held
479 */
480static void __remove_assoc_queue(struct buffer_head *bh)
481{
482 list_del_init(&bh->b_assoc_buffers);
483 WARN_ON(!bh->b_assoc_map);
484 if (buffer_write_io_error(bh))
485 set_bit(AS_EIO, &bh->b_assoc_map->flags);
486 bh->b_assoc_map = NULL;
487}
488
489int inode_has_buffers(struct inode *inode)
490{
491 return !list_empty(&inode->i_data.private_list);
492}
493
494/*
495 * osync is designed to support O_SYNC io. It waits synchronously for
496 * all already-submitted IO to complete, but does not queue any new
497 * writes to the disk.
498 *
499 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
500 * you dirty the buffers, and then use osync_inode_buffers to wait for
501 * completion. Any other dirty buffers which are not yet queued for
502 * write will not be flushed to disk by the osync.
503 */
504static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
505{
506 struct buffer_head *bh;
507 struct list_head *p;
508 int err = 0;
509
510 spin_lock(lock);
511repeat:
512 list_for_each_prev(p, list) {
513 bh = BH_ENTRY(p);
514 if (buffer_locked(bh)) {
515 get_bh(bh);
516 spin_unlock(lock);
517 wait_on_buffer(bh);
518 if (!buffer_uptodate(bh))
519 err = -EIO;
520 brelse(bh);
521 spin_lock(lock);
522 goto repeat;
523 }
524 }
525 spin_unlock(lock);
526 return err;
527}
528
529static void do_thaw_one(struct super_block *sb, void *unused)
530{
531 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
532 printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
533}
534
535static void do_thaw_all(struct work_struct *work)
536{
537 iterate_supers(do_thaw_one, NULL);
538 kfree(work);
539 printk(KERN_WARNING "Emergency Thaw complete\n");
540}
541
542/**
543 * emergency_thaw_all -- forcibly thaw every frozen filesystem
544 *
545 * Used for emergency unfreeze of all filesystems via SysRq
546 */
547void emergency_thaw_all(void)
548{
549 struct work_struct *work;
550
551 work = kmalloc(sizeof(*work), GFP_ATOMIC);
552 if (work) {
553 INIT_WORK(work, do_thaw_all);
554 schedule_work(work);
555 }
556}
557
558/**
559 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
560 * @mapping: the mapping which wants those buffers written
561 *
562 * Starts I/O against the buffers at mapping->private_list, and waits upon
563 * that I/O.
564 *
565 * Basically, this is a convenience function for fsync().
566 * @mapping is a file or directory which needs those buffers to be written for
567 * a successful fsync().
568 */
569int sync_mapping_buffers(struct address_space *mapping)
570{
571 struct address_space *buffer_mapping = mapping->private_data;
572
573 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
574 return 0;
575
576 return fsync_buffers_list(&buffer_mapping->private_lock,
577 &mapping->private_list);
578}
579EXPORT_SYMBOL(sync_mapping_buffers);
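
/*
 * Illustrative sketch (not part of the original source): a simple ->fsync()
 * built on sync_mapping_buffers(), roughly the shape generic_file_fsync()
 * takes for filesystems such as ext2. The function name is hypothetical.
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err, ret;
 *
 *		err = filemap_write_and_wait_range(mapping, start, end);
 *
 *		// Write out and wait upon the metadata buffers (e.g. indirect
 *		// blocks) queued on mapping->private_list by
 *		// mark_buffer_dirty_inode().
 *		ret = sync_mapping_buffers(mapping);
 *		if (!err)
 *			err = ret;
 *		return err;
 *	}
 */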
580
581/*
582 * Called when we've recently written block `bblock', and it is known that
583 * `bblock' was for a buffer_boundary() buffer. This means that the block at
584 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
585 * dirty, schedule it for IO. So that indirects merge nicely with their data.
586 */
587void write_boundary_block(struct block_device *bdev,
588 sector_t bblock, unsigned blocksize)
589{
590 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
591 if (bh) {
592 if (buffer_dirty(bh))
593 ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
594 put_bh(bh);
595 }
596}
597
598void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
599{
600 struct address_space *mapping = inode->i_mapping;
601 struct address_space *buffer_mapping = bh->b_page->mapping;
602
603 mark_buffer_dirty(bh);
604 if (!mapping->private_data) {
605 mapping->private_data = buffer_mapping;
606 } else {
607 BUG_ON(mapping->private_data != buffer_mapping);
608 }
609 if (!bh->b_assoc_map) {
610 spin_lock(&buffer_mapping->private_lock);
611 list_move_tail(&bh->b_assoc_buffers,
612 &mapping->private_list);
613 bh->b_assoc_map = mapping;
614 spin_unlock(&buffer_mapping->private_lock);
615 }
616}
617EXPORT_SYMBOL(mark_buffer_dirty_inode);
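
/*
 * Illustrative sketch (not part of the original source): how a filesystem
 * queues a just-modified metadata buffer on the owning inode's private_list
 * so that a later fsync() (via sync_mapping_buffers()) writes and waits on
 * it. The function name and block layout are hypothetical.
 *
 *	static void example_update_indirect(struct inode *inode,
 *					    struct buffer_head *bh,
 *					    int slot, __le32 blocknr)
 *	{
 *		lock_buffer(bh);
 *		((__le32 *)bh->b_data)[slot] = blocknr;
 *		set_buffer_uptodate(bh);
 *		unlock_buffer(bh);
 *		mark_buffer_dirty_inode(bh, inode);
 *	}
 */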
618
619/*
620 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
621 * dirty.
622 *
623 * If warn is true, then emit a warning if the page is not uptodate and has
624 * not been truncated.
625 *
626 * The caller must hold lock_page_memcg().
627 */
628static void __set_page_dirty(struct page *page, struct address_space *mapping,
629 int warn)
630{
631 unsigned long flags;
632
633 spin_lock_irqsave(&mapping->tree_lock, flags);
634 if (page->mapping) { /* Race with truncate? */
635 WARN_ON_ONCE(warn && !PageUptodate(page));
636 account_page_dirtied(page, mapping);
637 radix_tree_tag_set(&mapping->page_tree,
638 page_index(page), PAGECACHE_TAG_DIRTY);
639 }
640 spin_unlock_irqrestore(&mapping->tree_lock, flags);
641}
642
643/*
644 * Add a page to the dirty page list.
645 *
646 * It is a sad fact of life that this function is called from several places
647 * deeply under spinlocking. It may not sleep.
648 *
649 * If the page has buffers, the uptodate buffers are set dirty, to preserve
650 * dirty-state coherency between the page and the buffers. If the page does
651 * not have buffers then when they are later attached they will all be set
652 * dirty.
653 *
654 * The buffers are dirtied before the page is dirtied. There's a small race
655 * window in which a writepage caller may see the page cleanness but not the
656 * buffer dirtiness. That's fine. If this code were to set the page dirty
657 * before the buffers, a concurrent writepage caller could clear the page dirty
658 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
659 * page on the dirty page list.
660 *
661 * We use private_lock to lock against try_to_free_buffers while using the
662 * page's buffer list. Also use this to protect against clean buffers being
663 * added to the page after it was set dirty.
664 *
665 * FIXME: may need to call ->reservepage here as well. That's rather up to the
666 * address_space though.
667 */
668int __set_page_dirty_buffers(struct page *page)
669{
670 int newly_dirty;
671 struct address_space *mapping = page_mapping(page);
672
673 if (unlikely(!mapping))
674 return !TestSetPageDirty(page);
675
676 spin_lock(&mapping->private_lock);
677 if (page_has_buffers(page)) {
678 struct buffer_head *head = page_buffers(page);
679 struct buffer_head *bh = head;
680
681 do {
682 set_buffer_dirty(bh);
683 bh = bh->b_this_page;
684 } while (bh != head);
685 }
686 /*
687 * Lock out page->mem_cgroup migration to keep PageDirty
688 * synchronized with per-memcg dirty page counters.
689 */
690 lock_page_memcg(page);
691 newly_dirty = !TestSetPageDirty(page);
692 spin_unlock(&mapping->private_lock);
693
694 if (newly_dirty)
695 __set_page_dirty(page, mapping, 1);
696
697 unlock_page_memcg(page);
698
699 if (newly_dirty)
700 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
701
702 return newly_dirty;
703}
704EXPORT_SYMBOL(__set_page_dirty_buffers);
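
/*
 * Illustrative note (not part of the original source): set_page_dirty() in
 * mm/page-writeback.c falls back to __set_page_dirty_buffers() whenever an
 * address_space does not supply its own ->set_page_dirty method, and a
 * buffer-backed filesystem may also name it explicitly, e.g. (hypothetical
 * aops, field subset shown):
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.set_page_dirty	= __set_page_dirty_buffers,
 *	};
 */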
705
706/*
707 * Write out and wait upon a list of buffers.
708 *
709 * We have conflicting pressures: we want to make sure that all
710 * initially dirty buffers get waited on, but that any subsequently
711 * dirtied buffers don't. After all, we don't want fsync to last
712 * forever if somebody is actively writing to the file.
713 *
714 * Do this in two main stages: first we copy dirty buffers to a
715 * temporary inode list, queueing the writes as we go. Then we clean
716 * up, waiting for those writes to complete.
717 *
718 * During this second stage, any subsequent updates to the file may end
719 * up refiling the buffer on the original inode's dirty list again, so
720 * there is a chance we will end up with a buffer queued for write but
721 * not yet completed on that list. So, as a final cleanup we go through
722 * the osync code to catch these locked, dirty buffers without requeuing
723 * any newly dirty buffers for write.
724 */
725static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
726{
727 struct buffer_head *bh;
728 struct list_head tmp;
729 struct address_space *mapping;
730 int err = 0, err2;
731 struct blk_plug plug;
732
733 INIT_LIST_HEAD(&tmp);
734 blk_start_plug(&plug);
735
736 spin_lock(lock);
737 while (!list_empty(list)) {
738 bh = BH_ENTRY(list->next);
739 mapping = bh->b_assoc_map;
740 __remove_assoc_queue(bh);
741 /* Avoid race with mark_buffer_dirty_inode() which does
742 * a lockless check and we rely on seeing the dirty bit */
743 smp_mb();
744 if (buffer_dirty(bh) || buffer_locked(bh)) {
745 list_add(&bh->b_assoc_buffers, &tmp);
746 bh->b_assoc_map = mapping;
747 if (buffer_dirty(bh)) {
748 get_bh(bh);
749 spin_unlock(lock);
750 /*
751 * Ensure any pending I/O completes so that
752 * write_dirty_buffer() actually writes the
753 * current contents - it is a noop if I/O is
754 * still in flight on potentially older
755 * contents.
756 */
757 write_dirty_buffer(bh, REQ_SYNC);
758
759 /*
760 * Kick off IO for the previous mapping. Note
761 * that we will not run the very last mapping,
762 * wait_on_buffer() will do that for us
763 * through sync_buffer().
764 */
765 brelse(bh);
766 spin_lock(lock);
767 }
768 }
769 }
770
771 spin_unlock(lock);
772 blk_finish_plug(&plug);
773 spin_lock(lock);
774
775 while (!list_empty(&tmp)) {
776 bh = BH_ENTRY(tmp.prev);
777 get_bh(bh);
778 mapping = bh->b_assoc_map;
779 __remove_assoc_queue(bh);
780 /* Avoid race with mark_buffer_dirty_inode() which does
781 * a lockless check and we rely on seeing the dirty bit */
782 smp_mb();
783 if (buffer_dirty(bh)) {
784 list_add(&bh->b_assoc_buffers,
785 &mapping->private_list);
786 bh->b_assoc_map = mapping;
787 }
788 spin_unlock(lock);
789 wait_on_buffer(bh);
790 if (!buffer_uptodate(bh))
791 err = -EIO;
792 brelse(bh);
793 spin_lock(lock);
794 }
795
796 spin_unlock(lock);
797 err2 = osync_buffers_list(lock, list);
798 if (err)
799 return err;
800 else
801 return err2;
802}
803
804/*
805 * Invalidate any and all dirty buffers on a given inode. We are
806 * probably unmounting the fs, but that doesn't mean we have already
807 * done a sync(). Just drop the buffers from the inode list.
808 *
809 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
810 * assumes that all the buffers are against the blockdev. Not true
811 * for reiserfs.
812 */
813void invalidate_inode_buffers(struct inode *inode)
814{
815 if (inode_has_buffers(inode)) {
816 struct address_space *mapping = &inode->i_data;
817 struct list_head *list = &mapping->private_list;
818 struct address_space *buffer_mapping = mapping->private_data;
819
820 spin_lock(&buffer_mapping->private_lock);
821 while (!list_empty(list))
822 __remove_assoc_queue(BH_ENTRY(list->next));
823 spin_unlock(&buffer_mapping->private_lock);
824 }
825}
826EXPORT_SYMBOL(invalidate_inode_buffers);
827
828/*
829 * Remove any clean buffers from the inode's buffer list. This is called
830 * when we're trying to free the inode itself. Those buffers can pin it.
831 *
832 * Returns true if all buffers were removed.
833 */
834int remove_inode_buffers(struct inode *inode)
835{
836 int ret = 1;
837
838 if (inode_has_buffers(inode)) {
839 struct address_space *mapping = &inode->i_data;
840 struct list_head *list = &mapping->private_list;
841 struct address_space *buffer_mapping = mapping->private_data;
842
843 spin_lock(&buffer_mapping->private_lock);
844 while (!list_empty(list)) {
845 struct buffer_head *bh = BH_ENTRY(list->next);
846 if (buffer_dirty(bh)) {
847 ret = 0;
848 break;
849 }
850 __remove_assoc_queue(bh);
851 }
852 spin_unlock(&buffer_mapping->private_lock);
853 }
854 return ret;
855}
856
857/*
858 * Create the appropriate buffers when given a page for the data area and
859 * the size of each buffer. Use the bh->b_this_page linked list to
860 * follow the buffers created. Return NULL if unable to create more
861 * buffers.
862 *
863 * The retry flag is used to differentiate async IO (paging, swapping),
864 * which may not fail, from ordinary buffer allocations.
865 */
866struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
867 int retry)
868{
869 struct buffer_head *bh, *head;
870 long offset;
871
872try_again:
873 head = NULL;
874 offset = PAGE_SIZE;
875 while ((offset -= size) >= 0) {
876 bh = alloc_buffer_head(GFP_NOFS);
877 if (!bh)
878 goto no_grow;
879
880 bh->b_this_page = head;
881 bh->b_blocknr = -1;
882 head = bh;
883
884 bh->b_size = size;
885
886 /* Link the buffer to its page */
887 set_bh_page(bh, page, offset);
888 }
889 return head;
890/*
891 * In case anything failed, we just free everything we got.
892 */
893no_grow:
894 if (head) {
895 do {
896 bh = head;
897 head = head->b_this_page;
898 free_buffer_head(bh);
899 } while (head);
900 }
901
902 /*
903 * Return failure for non-async IO requests. Async IO requests
904 * are not allowed to fail, so we have to wait until buffer heads
905 * become available. But we don't want tasks sleeping with
906 * partially complete buffers, so all were released above.
907 */
908 if (!retry)
909 return NULL;
910
911 /* We're _really_ low on memory. Now we just
912 * wait for old buffer heads to become free due to
913 * finishing IO. Since this is an async request and
914 * the reserve list is empty, we're sure there are
915 * async buffer heads in use.
916 */
917 free_more_memory();
918 goto try_again;
919}
920EXPORT_SYMBOL_GPL(alloc_page_buffers);
921
922static inline void
923link_dev_buffers(struct page *page, struct buffer_head *head)
924{
925 struct buffer_head *bh, *tail;
926
927 bh = head;
928 do {
929 tail = bh;
930 bh = bh->b_this_page;
931 } while (bh);
932 tail->b_this_page = head;
933 attach_page_buffers(page, head);
934}
935
936static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
937{
938 sector_t retval = ~((sector_t)0);
939 loff_t sz = i_size_read(bdev->bd_inode);
940
941 if (sz) {
942 unsigned int sizebits = blksize_bits(size);
943 retval = (sz >> sizebits);
944 }
945 return retval;
946}
947
948/*
949 * Initialise the state of a blockdev page's buffers.
950 */
951static sector_t
952init_page_buffers(struct page *page, struct block_device *bdev,
953 sector_t block, int size)
954{
955 struct buffer_head *head = page_buffers(page);
956 struct buffer_head *bh = head;
957 int uptodate = PageUptodate(page);
958 sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
959
960 do {
961 if (!buffer_mapped(bh)) {
962 init_buffer(bh, NULL, NULL);
963 bh->b_bdev = bdev;
964 bh->b_blocknr = block;
965 if (uptodate)
966 set_buffer_uptodate(bh);
967 if (block < end_block)
968 set_buffer_mapped(bh);
969 }
970 block++;
971 bh = bh->b_this_page;
972 } while (bh != head);
973
974 /*
975 * Caller needs to validate requested block against end of device.
976 */
977 return end_block;
978}
979
980/*
981 * Create the page-cache page that contains the requested block.
982 *
983 * This is used purely for blockdev mappings.
984 */
985static int
986grow_dev_page(struct block_device *bdev, sector_t block,
987 pgoff_t index, int size, int sizebits, gfp_t gfp)
988{
989 struct inode *inode = bdev->bd_inode;
990 struct page *page;
991 struct buffer_head *bh;
992 sector_t end_block;
993 int ret = 0; /* Will call free_more_memory() */
994 gfp_t gfp_mask;
995
996 gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
997
998 /*
999 * XXX: __getblk_slow() can not really deal with failure and
1000 * will endlessly loop on improvised global reclaim. Prefer
1001 * looping in the allocator rather than here, at least that
1002 * code knows what it's doing.
1003 */
1004 gfp_mask |= __GFP_NOFAIL;
1005
1006 page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1007 if (!page)
1008 return ret;
1009
1010 BUG_ON(!PageLocked(page));
1011
1012 if (page_has_buffers(page)) {
1013 bh = page_buffers(page);
1014 if (bh->b_size == size) {
1015 end_block = init_page_buffers(page, bdev,
1016 (sector_t)index << sizebits,
1017 size);
1018 goto done;
1019 }
1020 if (!try_to_free_buffers(page))
1021 goto failed;
1022 }
1023
1024 /*
1025 * Allocate some buffers for this page
1026 */
1027 bh = alloc_page_buffers(page, size, 0);
1028 if (!bh)
1029 goto failed;
1030
1031 /*
1032 * Link the page to the buffers and initialise them. Take the
1033 * lock to be atomic wrt __find_get_block(), which does not
1034 * run under the page lock.
1035 */
1036 spin_lock(&inode->i_mapping->private_lock);
1037 link_dev_buffers(page, bh);
1038 end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1039 size);
1040 spin_unlock(&inode->i_mapping->private_lock);
1041done:
1042 ret = (block < end_block) ? 1 : -ENXIO;
1043failed:
1044 unlock_page(page);
1045 put_page(page);
1046 return ret;
1047}
1048
1049/*
1050 * Create buffers for the specified block device block's page. If
1051 * that page was dirty, the buffers are set dirty also.
1052 */
1053static int
1054grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1055{
1056 pgoff_t index;
1057 int sizebits;
1058
1059 sizebits = -1;
1060 do {
1061 sizebits++;
1062 } while ((size << sizebits) < PAGE_SIZE);
1063
1064 index = block >> sizebits;
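	/*
	 * Worked example (illustrative): with 1k blocks on 4k pages,
	 * sizebits == 2, so block 35 lands on page index 35 >> 2 == 8 as the
	 * (35 & 3) == 3rd buffer (0-based) of that page, and
	 * init_page_buffers() numbers that page's buffers starting at
	 * 8 << 2 == 32.
	 */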
1065
1066 /*
1067 * Check for a block which wants to lie outside our maximum possible
1068 * pagecache index. (this comparison is done using sector_t types).
1069 */
1070 if (unlikely(index != block >> sizebits)) {
1071 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1072 "device %pg\n",
1073 __func__, (unsigned long long)block,
1074 bdev);
1075 return -EIO;
1076 }
1077
1078 /* Create a page with the proper size buffers. */
1079 return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1080}
1081
1082static struct buffer_head *
1083__getblk_slow(struct block_device *bdev, sector_t block,
1084 unsigned size, gfp_t gfp)
1085{
1086 /* Size must be multiple of hard sectorsize */
1087 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1088 (size < 512 || size > PAGE_SIZE))) {
1089 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1090 size);
1091 printk(KERN_ERR "logical block size: %d\n",
1092 bdev_logical_block_size(bdev));
1093
1094 dump_stack();
1095 return NULL;
1096 }
1097
1098 for (;;) {
1099 struct buffer_head *bh;
1100 int ret;
1101
1102 bh = __find_get_block(bdev, block, size);
1103 if (bh)
1104 return bh;
1105
1106 ret = grow_buffers(bdev, block, size, gfp);
1107 if (ret < 0)
1108 return NULL;
1109 if (ret == 0)
1110 free_more_memory();
1111 }
1112}
1113
1114/*
1115 * The relationship between dirty buffers and dirty pages:
1116 *
1117 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1118 * the page is tagged dirty in its radix tree.
1119 *
1120 * At all times, the dirtiness of the buffers represents the dirtiness of
1121 * subsections of the page. If the page has buffers, the page dirty bit is
1122 * merely a hint about the true dirty state.
1123 *
1124 * When a page is set dirty in its entirety, all its buffers are marked dirty
1125 * (if the page has buffers).
1126 *
1127 * When a buffer is marked dirty, its page is dirtied, but the page's other
1128 * buffers are not.
1129 *
1130 * Also, when blockdev buffers are explicitly read with bread(), they
1131 * individually become uptodate. But their backing page remains not
1132 * uptodate - even if all of its buffers are uptodate. A subsequent
1133 * block_read_full_page() against that page will discover all the uptodate
1134 * buffers, will set the page uptodate and will perform no I/O.
1135 */
1136
1137/**
1138 * mark_buffer_dirty - mark a buffer_head as needing writeout
1139 * @bh: the buffer_head to mark dirty
1140 *
1141 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1142 * backing page dirty, then tag the page as dirty in its address_space's radix
1143 * tree and then attach the address_space's inode to its superblock's dirty
1144 * inode list.
1145 *
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and mapping->host->i_lock.
1148 */
1149void mark_buffer_dirty(struct buffer_head *bh)
1150{
1151 WARN_ON_ONCE(!buffer_uptodate(bh));
1152
1153 trace_block_dirty_buffer(bh);
1154
1155 /*
1156 * Very *carefully* optimize the it-is-already-dirty case.
1157 *
1158 * Don't let the final "is it dirty" escape to before we
1159 * perhaps modified the buffer.
1160 */
1161 if (buffer_dirty(bh)) {
1162 smp_mb();
1163 if (buffer_dirty(bh))
1164 return;
1165 }
1166
1167 if (!test_set_buffer_dirty(bh)) {
1168 struct page *page = bh->b_page;
1169 struct address_space *mapping = NULL;
1170
1171 lock_page_memcg(page);
1172 if (!TestSetPageDirty(page)) {
1173 mapping = page_mapping(page);
1174 if (mapping)
1175 __set_page_dirty(page, mapping, 0);
1176 }
1177 unlock_page_memcg(page);
1178 if (mapping)
1179 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1180 }
1181}
1182EXPORT_SYMBOL(mark_buffer_dirty);
1183
1184/*
1185 * Decrement a buffer_head's reference count. If all buffers against a page
1186 * have zero reference count, are clean and unlocked, and if the page is clean
1187 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1188 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1189 * a page but it ends up not being freed, and buffers may later be reattached).
1190 */
1191void __brelse(struct buffer_head * buf)
1192{
1193 if (atomic_read(&buf->b_count)) {
1194 put_bh(buf);
1195 return;
1196 }
1197 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1198}
1199EXPORT_SYMBOL(__brelse);
1200
1201/*
1202 * bforget() is like brelse(), except it discards any
1203 * potentially dirty data.
1204 */
1205void __bforget(struct buffer_head *bh)
1206{
1207 clear_buffer_dirty(bh);
1208 if (bh->b_assoc_map) {
1209 struct address_space *buffer_mapping = bh->b_page->mapping;
1210
1211 spin_lock(&buffer_mapping->private_lock);
1212 list_del_init(&bh->b_assoc_buffers);
1213 bh->b_assoc_map = NULL;
1214 spin_unlock(&buffer_mapping->private_lock);
1215 }
1216 __brelse(bh);
1217}
1218EXPORT_SYMBOL(__bforget);
1219
1220static struct buffer_head *__bread_slow(struct buffer_head *bh)
1221{
1222 lock_buffer(bh);
1223 if (buffer_uptodate(bh)) {
1224 unlock_buffer(bh);
1225 return bh;
1226 } else {
1227 get_bh(bh);
1228 bh->b_end_io = end_buffer_read_sync;
1229 submit_bh(REQ_OP_READ, 0, bh);
1230 wait_on_buffer(bh);
1231 if (buffer_uptodate(bh))
1232 return bh;
1233 }
1234 brelse(bh);
1235 return NULL;
1236}
1237
1238/*
1239 * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1240 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1241 * refcount elevated by one when they're in an LRU. A buffer can only appear
1242 * once in a particular CPU's LRU. A single buffer can be present in multiple
1243 * CPU's LRUs at the same time.
1244 *
1245 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1246 * sb_find_get_block().
1247 *
1248 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1249 * a local interrupt disable for that.
1250 */
1251
1252#define BH_LRU_SIZE 16
1253
1254struct bh_lru {
1255 struct buffer_head *bhs[BH_LRU_SIZE];
1256};
1257
1258static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1259
1260#ifdef CONFIG_SMP
1261#define bh_lru_lock() local_irq_disable()
1262#define bh_lru_unlock() local_irq_enable()
1263#else
1264#define bh_lru_lock() preempt_disable()
1265#define bh_lru_unlock() preempt_enable()
1266#endif
1267
1268static inline void check_irqs_on(void)
1269{
1270#ifdef irqs_disabled
1271 BUG_ON(irqs_disabled());
1272#endif
1273}
1274
1275/*
1276 * The LRU management algorithm is dopey-but-simple. Sorry.
1277 */
1278static void bh_lru_install(struct buffer_head *bh)
1279{
1280 struct buffer_head *evictee = NULL;
1281
1282 check_irqs_on();
1283 bh_lru_lock();
1284 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1285 struct buffer_head *bhs[BH_LRU_SIZE];
1286 int in;
1287 int out = 0;
1288
1289 get_bh(bh);
1290 bhs[out++] = bh;
1291 for (in = 0; in < BH_LRU_SIZE; in++) {
1292 struct buffer_head *bh2 =
1293 __this_cpu_read(bh_lrus.bhs[in]);
1294
1295 if (bh2 == bh) {
1296 __brelse(bh2);
1297 } else {
1298 if (out >= BH_LRU_SIZE) {
1299 BUG_ON(evictee != NULL);
1300 evictee = bh2;
1301 } else {
1302 bhs[out++] = bh2;
1303 }
1304 }
1305 }
1306 while (out < BH_LRU_SIZE)
1307 bhs[out++] = NULL;
1308 memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1309 }
1310 bh_lru_unlock();
1311
1312 if (evictee)
1313 __brelse(evictee);
1314}
1315
1316/*
1317 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1318 */
1319static struct buffer_head *
1320lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1321{
1322 struct buffer_head *ret = NULL;
1323 unsigned int i;
1324
1325 check_irqs_on();
1326 bh_lru_lock();
1327 for (i = 0; i < BH_LRU_SIZE; i++) {
1328 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1329
1330 if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1331 bh->b_size == size) {
1332 if (i) {
1333 while (i) {
1334 __this_cpu_write(bh_lrus.bhs[i],
1335 __this_cpu_read(bh_lrus.bhs[i - 1]));
1336 i--;
1337 }
1338 __this_cpu_write(bh_lrus.bhs[0], bh);
1339 }
1340 get_bh(bh);
1341 ret = bh;
1342 break;
1343 }
1344 }
1345 bh_lru_unlock();
1346 return ret;
1347}
1348
1349/*
1350 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1351 * it in the LRU and mark it as accessed. If it is not present then return
1352 * NULL
1353 */
1354struct buffer_head *
1355__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1356{
1357 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1358
1359 if (bh == NULL) {
1360 /* __find_get_block_slow will mark the page accessed */
1361 bh = __find_get_block_slow(bdev, block);
1362 if (bh)
1363 bh_lru_install(bh);
1364 } else
1365 touch_buffer(bh);
1366
1367 return bh;
1368}
1369EXPORT_SYMBOL(__find_get_block);
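
/*
 * Illustrative sketch (not part of the original source): __find_get_block()
 * never does I/O, so it can be used to peek at whether a cached copy of a
 * block exists (and whether it is newer than disk):
 *
 *	struct buffer_head *bh = __find_get_block(bdev, block, size);
 *
 *	if (bh) {
 *		if (buffer_dirty(bh))
 *			...;		// cached copy not yet written back
 *		brelse(bh);		// drop the reference taken above
 *	}
 */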
1370
1371/*
1372 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1373 * which corresponds to the passed block_device, block and size. The
1374 * returned buffer has its reference count incremented.
1375 *
1376 * __getblk_gfp() will lock up the machine if grow_dev_page's
1377 * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1378 */
1379struct buffer_head *
1380__getblk_gfp(struct block_device *bdev, sector_t block,
1381 unsigned size, gfp_t gfp)
1382{
1383 struct buffer_head *bh = __find_get_block(bdev, block, size);
1384
1385 might_sleep();
1386 if (bh == NULL)
1387 bh = __getblk_slow(bdev, block, size, gfp);
1388 return bh;
1389}
1390EXPORT_SYMBOL(__getblk_gfp);
1391
1392/*
1393 * Do async read-ahead on a buffer.
1394 */
1395void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1396{
1397 struct buffer_head *bh = __getblk(bdev, block, size);
1398 if (likely(bh)) {
1399 ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1400 brelse(bh);
1401 }
1402}
1403EXPORT_SYMBOL(__breadahead);
1404
1405/**
1406 * __bread_gfp() - reads a specified block and returns the bh
1407 * @bdev: the block_device to read from
1408 * @block: number of block
1409 * @size: size (in bytes) to read
1410 * @gfp: page allocation flag
1411 *
1412 * Reads a specified block, and returns the buffer_head that contains it.
1413 * If @gfp is zero, the page-cache page is allocated from a non-movable
1414 * area so that it does not get in the way of page migration.
1415 * Returns NULL if the block was unreadable.
1416 */
1417struct buffer_head *
1418__bread_gfp(struct block_device *bdev, sector_t block,
1419 unsigned size, gfp_t gfp)
1420{
1421 struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1422
1423 if (likely(bh) && !buffer_uptodate(bh))
1424 bh = __bread_slow(bh);
1425 return bh;
1426}
1427EXPORT_SYMBOL(__bread_gfp);
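
/*
 * Illustrative sketch (not part of the original source): the common
 * read-a-block pattern built on __bread() (which wraps __bread_gfp() with
 * __GFP_MOVABLE); __breadahead() can prime the cache first. Names are
 * hypothetical.
 *
 *	static int example_read_block(struct block_device *bdev, sector_t block,
 *				      unsigned size)
 *	{
 *		struct buffer_head *bh;
 *
 *		__breadahead(bdev, block + 1, size);	// optional, asynchronous
 *		bh = __bread(bdev, block, size);
 *		if (!bh)
 *			return -EIO;			// block was unreadable
 *		// ... use bh->b_data ...
 *		brelse(bh);
 *		return 0;
 *	}
 */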
1428
1429/*
1430 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1431 * This doesn't race because it runs in each cpu either in irq
1432 * or with preempt disabled.
1433 */
1434static void invalidate_bh_lru(void *arg)
1435{
1436 struct bh_lru *b = &get_cpu_var(bh_lrus);
1437 int i;
1438
1439 for (i = 0; i < BH_LRU_SIZE; i++) {
1440 brelse(b->bhs[i]);
1441 b->bhs[i] = NULL;
1442 }
1443 put_cpu_var(bh_lrus);
1444}
1445
1446static bool has_bh_in_lru(int cpu, void *dummy)
1447{
1448 struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1449 int i;
1450
1451 for (i = 0; i < BH_LRU_SIZE; i++) {
1452 if (b->bhs[i])
1453 return 1;
1454 }
1455
1456 return 0;
1457}
1458
1459void invalidate_bh_lrus(void)
1460{
1461 on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1462}
1463EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1464
1465void set_bh_page(struct buffer_head *bh,
1466 struct page *page, unsigned long offset)
1467{
1468 bh->b_page = page;
1469 BUG_ON(offset >= PAGE_SIZE);
1470 if (PageHighMem(page))
1471 /*
1472 * This catches illegal uses and preserves the offset:
1473 */
1474 bh->b_data = (char *)(0 + offset);
1475 else
1476 bh->b_data = page_address(page) + offset;
1477}
1478EXPORT_SYMBOL(set_bh_page);
1479
1480/*
1481 * Called when truncating a buffer on a page completely.
1482 */
1483
1484/* Bits that are cleared during an invalidate */
1485#define BUFFER_FLAGS_DISCARD \
1486 (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1487 1 << BH_Delay | 1 << BH_Unwritten)
1488
1489static void discard_buffer(struct buffer_head * bh)
1490{
1491 unsigned long b_state, b_state_old;
1492
1493 lock_buffer(bh);
1494 clear_buffer_dirty(bh);
1495 bh->b_bdev = NULL;
1496 b_state = bh->b_state;
1497 for (;;) {
1498 b_state_old = cmpxchg(&bh->b_state, b_state,
1499 (b_state & ~BUFFER_FLAGS_DISCARD));
1500 if (b_state_old == b_state)
1501 break;
1502 b_state = b_state_old;
1503 }
1504 unlock_buffer(bh);
1505}
1506
1507/**
1508 * block_invalidatepage - invalidate part or all of a buffer-backed page
1509 *
1510 * @page: the page which is affected
1511 * @offset: start of the range to invalidate
1512 * @length: length of the range to invalidate
1513 *
1514 * block_invalidatepage() is called when all or part of the page has become
1515 * invalidated by a truncate operation.
1516 *
1517 * block_invalidatepage() does not have to release all buffers, but it must
1518 * ensure that no dirty buffer is left outside @offset and that no I/O
1519 * is underway against any of the blocks which are outside the truncation
1520 * point. Because the caller is about to free (and possibly reuse) those
1521 * blocks on-disk.
1522 */
1523void block_invalidatepage(struct page *page, unsigned int offset,
1524 unsigned int length)
1525{
1526 struct buffer_head *head, *bh, *next;
1527 unsigned int curr_off = 0;
1528 unsigned int stop = length + offset;
1529
1530 BUG_ON(!PageLocked(page));
1531 if (!page_has_buffers(page))
1532 goto out;
1533
1534 /*
1535 * Check for overflow
1536 */
1537 BUG_ON(stop > PAGE_SIZE || stop < length);
1538
1539 head = page_buffers(page);
1540 bh = head;
1541 do {
1542 unsigned int next_off = curr_off + bh->b_size;
1543 next = bh->b_this_page;
1544
1545 /*
1546 * Are we still fully in range ?
1547 */
1548 if (next_off > stop)
1549 goto out;
1550
1551 /*
1552 * is this block fully invalidated?
1553 */
1554 if (offset <= curr_off)
1555 discard_buffer(bh);
1556 curr_off = next_off;
1557 bh = next;
1558 } while (bh != head);
1559
1560 /*
1561 * We release buffers only if the entire page is being invalidated.
1562 * The get_block cached value has been unconditionally invalidated,
1563 * so real IO is not possible anymore.
1564 */
1565 if (offset == 0)
1566 try_to_release_page(page, 0);
1567out:
1568 return;
1569}
1570EXPORT_SYMBOL(block_invalidatepage);
1571
1572
1573/*
1574 * We attach and possibly dirty the buffers atomically wrt
1575 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1576 * is already excluded via the page lock.
1577 */
1578void create_empty_buffers(struct page *page,
1579 unsigned long blocksize, unsigned long b_state)
1580{
1581 struct buffer_head *bh, *head, *tail;
1582
1583 head = alloc_page_buffers(page, blocksize, 1);
1584 bh = head;
1585 do {
1586 bh->b_state |= b_state;
1587 tail = bh;
1588 bh = bh->b_this_page;
1589 } while (bh);
1590 tail->b_this_page = head;
1591
1592 spin_lock(&page->mapping->private_lock);
1593 if (PageUptodate(page) || PageDirty(page)) {
1594 bh = head;
1595 do {
1596 if (PageDirty(page))
1597 set_buffer_dirty(bh);
1598 if (PageUptodate(page))
1599 set_buffer_uptodate(bh);
1600 bh = bh->b_this_page;
1601 } while (bh != head);
1602 }
1603 attach_page_buffers(page, head);
1604 spin_unlock(&page->mapping->private_lock);
1605}
1606EXPORT_SYMBOL(create_empty_buffers);
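
/*
 * Illustrative sketch (not part of the original source): the usual way a
 * page-based read/write path attaches buffers before walking them; this is
 * essentially what create_page_buffers() further down in this file does.
 *
 *	if (!page_has_buffers(page))
 *		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 *	head = page_buffers(page);
 */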
1607
1608/**
1609 * clean_bdev_aliases: clean a range of buffers in block device
1610 * @bdev: Block device to clean buffers in
1611 * @block: Start of a range of blocks to clean
1612 * @len: Number of blocks to clean
1613 *
1614 * We are taking a range of blocks for data and we don't want writeback of any
1615 * buffer-cache aliases from the moment this function returns until the
1616 * moment when something explicitly marks the buffer dirty (hopefully that
1617 * will not happen until we free that block ;-) We don't even need to mark
1618 * it not-uptodate - nobody can expect anything from a newly allocated buffer
1619 * anyway. We used to use unmap_buffer() for such invalidation, but that was
1620 * wrong. We definitely don't want to mark the alias unmapped, for example - it
1621 * would confuse anyone who might pick it with bread() afterwards...
1622 *
1623 * Also, note that bforget() doesn't lock the buffer. So there can be
1624 * writeout I/O going on against recently-freed buffers. We don't wait on that
1625 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1626 * need to. That happens here.
1627 */
1628void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1629{
1630 struct inode *bd_inode = bdev->bd_inode;
1631 struct address_space *bd_mapping = bd_inode->i_mapping;
1632 struct pagevec pvec;
1633 pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1634 pgoff_t end;
1635 int i;
1636 struct buffer_head *bh;
1637 struct buffer_head *head;
1638
1639 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1640 pagevec_init(&pvec, 0);
1641 while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
1642 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
1643 for (i = 0; i < pagevec_count(&pvec); i++) {
1644 struct page *page = pvec.pages[i];
1645
1646 index = page->index;
1647 if (index > end)
1648 break;
1649 if (!page_has_buffers(page))
1650 continue;
1651 /*
1652 * We use page lock instead of bd_mapping->private_lock
1653 * to pin buffers here since we can afford to sleep and
1654 * it scales better than a global spinlock lock.
1655 */
1656 lock_page(page);
1657 /* Recheck when the page is locked which pins bhs */
1658 if (!page_has_buffers(page))
1659 goto unlock_page;
1660 head = page_buffers(page);
1661 bh = head;
1662 do {
1663 if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1664 goto next;
1665 if (bh->b_blocknr >= block + len)
1666 break;
1667 clear_buffer_dirty(bh);
1668 wait_on_buffer(bh);
1669 clear_buffer_req(bh);
1670next:
1671 bh = bh->b_this_page;
1672 } while (bh != head);
1673unlock_page:
1674 unlock_page(page);
1675 }
1676 pagevec_release(&pvec);
1677 cond_resched();
1678 index++;
1679 }
1680}
1681EXPORT_SYMBOL(clean_bdev_aliases);
1682
1683/*
1684 * Size is a power-of-two in the range 512..PAGE_SIZE,
1685 * and the case we care about most is PAGE_SIZE.
1686 *
1687 * So this *could* possibly be written with those
1688 * constraints in mind (relevant mostly if some
1689 * architecture has a slow bit-scan instruction)
1690 */
1691static inline int block_size_bits(unsigned int blocksize)
1692{
1693 return ilog2(blocksize);
1694}
1695
1696static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1697{
1698 BUG_ON(!PageLocked(page));
1699
1700 if (!page_has_buffers(page))
1701 create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1702 return page_buffers(page);
1703}
1704
1705/*
1706 * NOTE! All mapped/uptodate combinations are valid:
1707 *
1708 * Mapped Uptodate Meaning
1709 *
1710 * No No "unknown" - must do get_block()
1711 * No Yes "hole" - zero-filled
1712 * Yes No "allocated" - allocated on disk, not read in
1713 * Yes Yes "valid" - allocated and up-to-date in memory.
1714 *
1715 * "Dirty" is valid only with the last case (mapped+uptodate).
1716 */
1717
1718/*
1719 * While block_write_full_page is writing back the dirty buffers under
1720 * the page lock, whoever dirtied the buffers may decide to clean them
1721 * again at any time. We handle that by only looking at the buffer
1722 * state inside lock_buffer().
1723 *
1724 * If block_write_full_page() is called for regular writeback
1725 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1726 * locked buffer. This only can happen if someone has written the buffer
1727 * directly, with submit_bh(). At the address_space level PageWriteback
1728 * prevents this contention from occurring.
1729 *
1730 * If block_write_full_page() is called with wbc->sync_mode ==
1731 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1732 * causes the writes to be flagged as synchronous writes.
1733 */
1734int __block_write_full_page(struct inode *inode, struct page *page,
1735 get_block_t *get_block, struct writeback_control *wbc,
1736 bh_end_io_t *handler)
1737{
1738 int err;
1739 sector_t block;
1740 sector_t last_block;
1741 struct buffer_head *bh, *head;
1742 unsigned int blocksize, bbits;
1743 int nr_underway = 0;
1744 int write_flags = wbc_to_write_flags(wbc);
1745
1746 head = create_page_buffers(page, inode,
1747 (1 << BH_Dirty)|(1 << BH_Uptodate));
1748
1749 /*
1750 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1751 * here, and the (potentially unmapped) buffers may become dirty at
1752 * any time. If a buffer becomes dirty here after we've inspected it
1753 * then we just miss that fact, and the page stays dirty.
1754 *
1755 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1756 * handle that here by just cleaning them.
1757 */
1758
1759 bh = head;
1760 blocksize = bh->b_size;
1761 bbits = block_size_bits(blocksize);
1762
1763 block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1764 last_block = (i_size_read(inode) - 1) >> bbits;
1765
1766 /*
1767 * Get all the dirty buffers mapped to disk addresses and
1768 * handle any aliases from the underlying blockdev's mapping.
1769 */
1770 do {
1771 if (block > last_block) {
1772 /*
1773 * mapped buffers outside i_size will occur, because
1774 * this page can be outside i_size when there is a
1775 * truncate in progress.
1776 */
1777 /*
1778 * The buffer was zeroed by block_write_full_page()
1779 */
1780 clear_buffer_dirty(bh);
1781 set_buffer_uptodate(bh);
1782 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1783 buffer_dirty(bh)) {
1784 WARN_ON(bh->b_size != blocksize);
1785 err = get_block(inode, block, bh, 1);
1786 if (err)
1787 goto recover;
1788 clear_buffer_delay(bh);
1789 if (buffer_new(bh)) {
1790 /* blockdev mappings never come here */
1791 clear_buffer_new(bh);
1792 clean_bdev_bh_alias(bh);
1793 }
1794 }
1795 bh = bh->b_this_page;
1796 block++;
1797 } while (bh != head);
1798
1799 do {
1800 if (!buffer_mapped(bh))
1801 continue;
1802 /*
1803 * If it's a fully non-blocking write attempt and we cannot
1804 * lock the buffer then redirty the page. Note that this can
1805 * potentially cause a busy-wait loop from writeback threads
1806 * and kswapd activity, but those code paths have their own
1807 * higher-level throttling.
1808 */
1809 if (wbc->sync_mode != WB_SYNC_NONE) {
1810 lock_buffer(bh);
1811 } else if (!trylock_buffer(bh)) {
1812 redirty_page_for_writepage(wbc, page);
1813 continue;
1814 }
1815 if (test_clear_buffer_dirty(bh)) {
1816 mark_buffer_async_write_endio(bh, handler);
1817 } else {
1818 unlock_buffer(bh);
1819 }
1820 } while ((bh = bh->b_this_page) != head);
1821
1822 /*
1823 * The page and its buffers are protected by PageWriteback(), so we can
1824 * drop the bh refcounts early.
1825 */
1826 BUG_ON(PageWriteback(page));
1827 set_page_writeback(page);
1828
1829 do {
1830 struct buffer_head *next = bh->b_this_page;
1831 if (buffer_async_write(bh)) {
1832 submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
1833 nr_underway++;
1834 }
1835 bh = next;
1836 } while (bh != head);
1837 unlock_page(page);
1838
1839 err = 0;
1840done:
1841 if (nr_underway == 0) {
1842 /*
1843 * The page was marked dirty, but the buffers were
1844 * clean. Someone wrote them back by hand with
1845 * ll_rw_block/submit_bh. A rare case.
1846 */
1847 end_page_writeback(page);
1848
1849 /*
1850 * The page and buffer_heads can be released at any time from
1851 * here on.
1852 */
1853 }
1854 return err;
1855
1856recover:
1857 /*
1858 * ENOSPC, or some other error. We may already have added some
1859 * blocks to the file, so we need to write these out to avoid
1860 * exposing stale data.
1861 * The page is currently locked and not marked for writeback
1862 */
1863 bh = head;
1864 /* Recovery: lock and submit the mapped buffers */
1865 do {
1866 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1867 !buffer_delay(bh)) {
1868 lock_buffer(bh);
1869 mark_buffer_async_write_endio(bh, handler);
1870 } else {
1871 /*
1872 * The buffer may have been set dirty during
1873 * attachment to a dirty page.
1874 */
1875 clear_buffer_dirty(bh);
1876 }
1877 } while ((bh = bh->b_this_page) != head);
1878 SetPageError(page);
1879 BUG_ON(PageWriteback(page));
1880 mapping_set_error(page->mapping, err);
1881 set_page_writeback(page);
1882 do {
1883 struct buffer_head *next = bh->b_this_page;
1884 if (buffer_async_write(bh)) {
1885 clear_buffer_dirty(bh);
1886 submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
1887 nr_underway++;
1888 }
1889 bh = next;
1890 } while (bh != head);
1891 unlock_page(page);
1892 goto done;
1893}
1894EXPORT_SYMBOL(__block_write_full_page);
1895
1896/*
1897 * If a page has any new buffers, zero them out here, and mark them uptodate
1898 * and dirty so they'll be written out (in order to prevent uninitialised
1899 * block data from leaking). And clear the new bit.
1900 */
1901void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1902{
1903 unsigned int block_start, block_end;
1904 struct buffer_head *head, *bh;
1905
1906 BUG_ON(!PageLocked(page));
1907 if (!page_has_buffers(page))
1908 return;
1909
1910 bh = head = page_buffers(page);
1911 block_start = 0;
1912 do {
1913 block_end = block_start + bh->b_size;
1914
1915 if (buffer_new(bh)) {
1916 if (block_end > from && block_start < to) {
1917 if (!PageUptodate(page)) {
1918 unsigned start, size;
1919
1920 start = max(from, block_start);
1921 size = min(to, block_end) - start;
1922
1923 zero_user(page, start, size);
1924 set_buffer_uptodate(bh);
1925 }
1926
1927 clear_buffer_new(bh);
1928 mark_buffer_dirty(bh);
1929 }
1930 }
1931
1932 block_start = block_end;
1933 bh = bh->b_this_page;
1934 } while (bh != head);
1935}
1936EXPORT_SYMBOL(page_zero_new_buffers);
1937
1938static void
1939iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1940 struct iomap *iomap)
1941{
1942 loff_t offset = block << inode->i_blkbits;
1943
1944 bh->b_bdev = iomap->bdev;
1945
1946 /*
1947 * Block points to offset in file we need to map, iomap contains
1948 * the offset at which the map starts. If the map ends before the
1949 * current block, then do not map the buffer and let the caller
1950 * handle it.
1951 */
1952 BUG_ON(offset >= iomap->offset + iomap->length);
1953
1954 switch (iomap->type) {
1955 case IOMAP_HOLE:
1956 /*
1957 * If the buffer is not up to date or beyond the current EOF,
1958 * we need to mark it as new to ensure sub-block zeroing is
1959 * executed if necessary.
1960 */
1961 if (!buffer_uptodate(bh) ||
1962 (offset >= i_size_read(inode)))
1963 set_buffer_new(bh);
1964 break;
1965 case IOMAP_DELALLOC:
1966 if (!buffer_uptodate(bh) ||
1967 (offset >= i_size_read(inode)))
1968 set_buffer_new(bh);
1969 set_buffer_uptodate(bh);
1970 set_buffer_mapped(bh);
1971 set_buffer_delay(bh);
1972 break;
1973 case IOMAP_UNWRITTEN:
1974 /*
1975 * For unwritten regions, we always need to ensure that
1976 * sub-block writes cause the regions in the block we are not
1977 * writing to are zeroed. Set the buffer as new to ensure this.
1978 */
1979 set_buffer_new(bh);
1980 set_buffer_unwritten(bh);
1981 /* FALLTHRU */
1982 case IOMAP_MAPPED:
1983 if (offset >= i_size_read(inode))
1984 set_buffer_new(bh);
1985 bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
1986 ((offset - iomap->offset) >> inode->i_blkbits);
1987 set_buffer_mapped(bh);
1988 break;
1989 }
1990}
1991
1992int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
1993 get_block_t *get_block, struct iomap *iomap)
1994{
1995 unsigned from = pos & (PAGE_SIZE - 1);
1996 unsigned to = from + len;
1997 struct inode *inode = page->mapping->host;
1998 unsigned block_start, block_end;
1999 sector_t block;
2000 int err = 0;
2001 unsigned blocksize, bbits;
2002 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2003
2004 BUG_ON(!PageLocked(page));
2005 BUG_ON(from > PAGE_SIZE);
2006 BUG_ON(to > PAGE_SIZE);
2007 BUG_ON(from > to);
2008
2009 head = create_page_buffers(page, inode, 0);
2010 blocksize = head->b_size;
2011 bbits = block_size_bits(blocksize);
2012
2013 block = (sector_t)page->index << (PAGE_SHIFT - bbits);
2014
2015 for(bh = head, block_start = 0; bh != head || !block_start;
2016 block++, block_start=block_end, bh = bh->b_this_page) {
2017 block_end = block_start + blocksize;
2018 if (block_end <= from || block_start >= to) {
2019 if (PageUptodate(page)) {
2020 if (!buffer_uptodate(bh))
2021 set_buffer_uptodate(bh);
2022 }
2023 continue;
2024 }
2025 if (buffer_new(bh))
2026 clear_buffer_new(bh);
2027 if (!buffer_mapped(bh)) {
2028 WARN_ON(bh->b_size != blocksize);
2029 if (get_block) {
2030 err = get_block(inode, block, bh, 1);
2031 if (err)
2032 break;
2033 } else {
2034 iomap_to_bh(inode, block, bh, iomap);
2035 }
2036
2037 if (buffer_new(bh)) {
2038 clean_bdev_bh_alias(bh);
2039 if (PageUptodate(page)) {
2040 clear_buffer_new(bh);
2041 set_buffer_uptodate(bh);
2042 mark_buffer_dirty(bh);
2043 continue;
2044 }
2045 if (block_end > to || block_start < from)
2046 zero_user_segments(page,
2047 to, block_end,
2048 block_start, from);
2049 continue;
2050 }
2051 }
2052 if (PageUptodate(page)) {
2053 if (!buffer_uptodate(bh))
2054 set_buffer_uptodate(bh);
2055 continue;
2056 }
2057 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2058 !buffer_unwritten(bh) &&
2059 (block_start < from || block_end > to)) {
2060 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2061 *wait_bh++=bh;
2062 }
2063 }
2064 /*
2065 * If we issued read requests - let them complete.
2066 */
2067 while(wait_bh > wait) {
2068 wait_on_buffer(*--wait_bh);
2069 if (!buffer_uptodate(*wait_bh))
2070 err = -EIO;
2071 }
2072 if (unlikely(err))
2073 page_zero_new_buffers(page, from, to);
2074 return err;
2075}
2076
2077int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2078 get_block_t *get_block)
2079{
2080 return __block_write_begin_int(page, pos, len, get_block, NULL);
2081}
2082EXPORT_SYMBOL(__block_write_begin);
2083
2084static int __block_commit_write(struct inode *inode, struct page *page,
2085 unsigned from, unsigned to)
2086{
2087 unsigned block_start, block_end;
2088 int partial = 0;
2089 unsigned blocksize;
2090 struct buffer_head *bh, *head;
2091
2092 bh = head = page_buffers(page);
2093 blocksize = bh->b_size;
2094
2095 block_start = 0;
2096 do {
2097 block_end = block_start + blocksize;
2098 if (block_end <= from || block_start >= to) {
2099 if (!buffer_uptodate(bh))
2100 partial = 1;
2101 } else {
2102 set_buffer_uptodate(bh);
2103 mark_buffer_dirty(bh);
2104 }
2105 clear_buffer_new(bh);
2106
2107 block_start = block_end;
2108 bh = bh->b_this_page;
2109 } while (bh != head);
2110
2111 /*
2112 * If this is a partial write which happened to make all buffers
2113 * uptodate then we can optimize away a bogus readpage() for
2114 * the next read(). Here we 'discover' whether the page went
2115 * uptodate as a result of this (potentially partial) write.
2116 */
2117 if (!partial)
2118 SetPageUptodate(page);
2119 return 0;
2120}
2121
2122/*
2123 * block_write_begin takes care of the basic task of block allocation and
2124 * bringing partial write blocks uptodate first.
2125 *
2126 * The filesystem needs to handle block truncation upon failure.
2127 */
2128int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2129 unsigned flags, struct page **pagep, get_block_t *get_block)
2130{
2131 pgoff_t index = pos >> PAGE_SHIFT;
2132 struct page *page;
2133 int status;
2134
2135 page = grab_cache_page_write_begin(mapping, index, flags);
2136 if (!page)
2137 return -ENOMEM;
2138
2139 status = __block_write_begin(page, pos, len, get_block);
2140 if (unlikely(status)) {
2141 unlock_page(page);
2142 put_page(page);
2143 page = NULL;
2144 }
2145
2146 *pagep = page;
2147 return status;
2148}
2149EXPORT_SYMBOL(block_write_begin);
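
/*
 * Illustrative sketch (not part of this file, not compiled): a minimal
 * ->write_begin built on block_write_begin().  myfs_get_block() and
 * myfs_write_failed() are hypothetical filesystem callbacks; the latter
 * stands in for the block-truncation-on-failure handling noted above.
 */
#if 0	/* example only */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	if (unlikely(ret))
		myfs_write_failed(mapping, pos + len);	/* trim blocks we allocated */
	return ret;
}
#endif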
2150
2151int block_write_end(struct file *file, struct address_space *mapping,
2152 loff_t pos, unsigned len, unsigned copied,
2153 struct page *page, void *fsdata)
2154{
2155 struct inode *inode = mapping->host;
2156 unsigned start;
2157
2158 start = pos & (PAGE_SIZE - 1);
2159
2160 if (unlikely(copied < len)) {
2161 /*
2162 * The buffers that were written will now be uptodate, so we
2163 * don't have to worry about a readpage reading them and
2164 * overwriting a partial write. However if we have encountered
2165 * a short write and only partially written into a buffer, it
2166 * will not be marked uptodate, so a readpage might come in and
2167 * destroy our partial write.
2168 *
2169 * Do the simplest thing, and just treat any short write to a
2170 * non uptodate page as a zero-length write, and force the
2171 * caller to redo the whole thing.
2172 */
2173 if (!PageUptodate(page))
2174 copied = 0;
2175
2176 page_zero_new_buffers(page, start+copied, start+len);
2177 }
2178 flush_dcache_page(page);
2179
2180 /* This could be a short (even 0-length) commit */
2181 __block_commit_write(inode, page, start, start+copied);
2182
2183 return copied;
2184}
2185EXPORT_SYMBOL(block_write_end);
2186
2187int generic_write_end(struct file *file, struct address_space *mapping,
2188 loff_t pos, unsigned len, unsigned copied,
2189 struct page *page, void *fsdata)
2190{
2191 struct inode *inode = mapping->host;
2192 loff_t old_size = inode->i_size;
2193 int i_size_changed = 0;
2194
2195 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2196
2197 /*
2198 * No need to use i_size_read() here, the i_size
2199 * cannot change under us because we hold i_mutex.
2200 *
2201 * But it's important to update i_size while still holding page lock:
2202 * page writeout could otherwise come in and zero beyond i_size.
2203 */
2204 if (pos+copied > inode->i_size) {
2205 i_size_write(inode, pos+copied);
2206 i_size_changed = 1;
2207 }
2208
2209 unlock_page(page);
2210 put_page(page);
2211
2212 if (old_size < pos)
2213 pagecache_isize_extended(inode, old_size, pos);
2214 /*
2215 * Don't mark the inode dirty under page lock. First, it unnecessarily
2216 * makes the holding time of page lock longer. Second, it forces lock
2217 * ordering of page lock and transaction start for journaling
2218 * filesystems.
2219 */
2220 if (i_size_changed)
2221 mark_inode_dirty(inode);
2222
2223 return copied;
2224}
2225EXPORT_SYMBOL(generic_write_end);
2226
2227/*
2228 * block_is_partially_uptodate checks whether buffers within a page are
2229 * uptodate or not.
2230 *
2231 * Returns true if all buffers which correspond to a file portion
2232 * we want to read are uptodate.
2233 */
2234int block_is_partially_uptodate(struct page *page, unsigned long from,
2235 unsigned long count)
2236{
2237 unsigned block_start, block_end, blocksize;
2238 unsigned to;
2239 struct buffer_head *bh, *head;
2240 int ret = 1;
2241
2242 if (!page_has_buffers(page))
2243 return 0;
2244
2245 head = page_buffers(page);
2246 blocksize = head->b_size;
2247 to = min_t(unsigned, PAGE_SIZE - from, count);
2248 to = from + to;
2249 if (from < blocksize && to > PAGE_SIZE - blocksize)
2250 return 0;
2251
2252 bh = head;
2253 block_start = 0;
2254 do {
2255 block_end = block_start + blocksize;
2256 if (block_end > from && block_start < to) {
2257 if (!buffer_uptodate(bh)) {
2258 ret = 0;
2259 break;
2260 }
2261 if (block_end >= to)
2262 break;
2263 }
2264 block_start = block_end;
2265 bh = bh->b_this_page;
2266 } while (bh != head);
2267
2268 return ret;
2269}
2270EXPORT_SYMBOL(block_is_partially_uptodate);
2271
2272/*
2273 * Generic "read page" function for block devices that have the normal
2274 * get_block functionality. This is most of the block device filesystems.
2275 * Reads the page asynchronously --- the unlock_buffer() and
2276 * set/clear_buffer_uptodate() functions propagate buffer state into the
2277 * page struct once IO has completed.
2278 */
2279int block_read_full_page(struct page *page, get_block_t *get_block)
2280{
2281 struct inode *inode = page->mapping->host;
2282 sector_t iblock, lblock;
2283 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2284 unsigned int blocksize, bbits;
2285 int nr, i;
2286 int fully_mapped = 1;
2287
2288 head = create_page_buffers(page, inode, 0);
2289 blocksize = head->b_size;
2290 bbits = block_size_bits(blocksize);
2291
2292 iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2293 lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2294 bh = head;
2295 nr = 0;
2296 i = 0;
2297
2298 do {
2299 if (buffer_uptodate(bh))
2300 continue;
2301
2302 if (!buffer_mapped(bh)) {
2303 int err = 0;
2304
2305 fully_mapped = 0;
2306 if (iblock < lblock) {
2307 WARN_ON(bh->b_size != blocksize);
2308 err = get_block(inode, iblock, bh, 0);
2309 if (err)
2310 SetPageError(page);
2311 }
2312 if (!buffer_mapped(bh)) {
2313 zero_user(page, i * blocksize, blocksize);
2314 if (!err)
2315 set_buffer_uptodate(bh);
2316 continue;
2317 }
2318 /*
2319 * get_block() might have updated the buffer
2320 * synchronously
2321 */
2322 if (buffer_uptodate(bh))
2323 continue;
2324 }
2325 arr[nr++] = bh;
2326 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2327
2328 if (fully_mapped)
2329 SetPageMappedToDisk(page);
2330
2331 if (!nr) {
2332 /*
2333 * All buffers are uptodate - we can set the page uptodate
2334 * as well. But not if get_block() returned an error.
2335 */
2336 if (!PageError(page))
2337 SetPageUptodate(page);
2338 unlock_page(page);
2339 return 0;
2340 }
2341
2342 /* Stage two: lock the buffers */
2343 for (i = 0; i < nr; i++) {
2344 bh = arr[i];
2345 lock_buffer(bh);
2346 mark_buffer_async_read(bh);
2347 }
2348
2349 /*
2350 * Stage 3: start the IO. Check for uptodateness
2351 * inside the buffer lock in case another process reading
2352 * the underlying blockdev brought it uptodate (the sct fix).
2353 */
2354 for (i = 0; i < nr; i++) {
2355 bh = arr[i];
2356 if (buffer_uptodate(bh))
2357 end_buffer_async_read(bh, 1);
2358 else
2359 submit_bh(REQ_OP_READ, 0, bh);
2360 }
2361 return 0;
2362}
2363EXPORT_SYMBOL(block_read_full_page);
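
/*
 * Illustrative sketch (not compiled): the usual way a filesystem wires
 * block_read_full_page() into its address_space operations.  The
 * myfs_get_block() callback is a hypothetical get_block_t.
 */
#if 0	/* example only */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
#endif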
2364
2365/* utility function for filesystems that need to do work on expanding
2366 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2367 * deal with the hole.
2368 */
2369int generic_cont_expand_simple(struct inode *inode, loff_t size)
2370{
2371 struct address_space *mapping = inode->i_mapping;
2372 struct page *page;
2373 void *fsdata;
2374 int err;
2375
2376 err = inode_newsize_ok(inode, size);
2377 if (err)
2378 goto out;
2379
2380 err = pagecache_write_begin(NULL, mapping, size, 0,
2381 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2382 &page, &fsdata);
2383 if (err)
2384 goto out;
2385
2386 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2387 BUG_ON(err > 0);
2388
2389out:
2390 return err;
2391}
2392EXPORT_SYMBOL(generic_cont_expand_simple);
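
/*
 * Illustrative sketch (not compiled): an expanding truncate would typically
 * call generic_cont_expand_simple() from the filesystem's ->setattr path
 * before updating timestamps.  The myfs_* name is hypothetical.
 */
#if 0	/* example only */
static int myfs_cont_expand(struct inode *inode, loff_t size)
{
	int err = generic_cont_expand_simple(inode, size);

	if (!err) {
		inode->i_mtime = inode->i_ctime = current_time(inode);
		mark_inode_dirty(inode);
	}
	return err;
}
#endif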
2393
2394static int cont_expand_zero(struct file *file, struct address_space *mapping,
2395 loff_t pos, loff_t *bytes)
2396{
2397 struct inode *inode = mapping->host;
2398 unsigned blocksize = 1 << inode->i_blkbits;
2399 struct page *page;
2400 void *fsdata;
2401 pgoff_t index, curidx;
2402 loff_t curpos;
2403 unsigned zerofrom, offset, len;
2404 int err = 0;
2405
2406 index = pos >> PAGE_SHIFT;
2407 offset = pos & ~PAGE_MASK;
2408
2409 while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2410 zerofrom = curpos & ~PAGE_MASK;
2411 if (zerofrom & (blocksize-1)) {
2412 *bytes |= (blocksize-1);
2413 (*bytes)++;
2414 }
2415 len = PAGE_SIZE - zerofrom;
2416
2417 err = pagecache_write_begin(file, mapping, curpos, len,
2418 AOP_FLAG_UNINTERRUPTIBLE,
2419 &page, &fsdata);
2420 if (err)
2421 goto out;
2422 zero_user(page, zerofrom, len);
2423 err = pagecache_write_end(file, mapping, curpos, len, len,
2424 page, fsdata);
2425 if (err < 0)
2426 goto out;
2427 BUG_ON(err != len);
2428 err = 0;
2429
2430 balance_dirty_pages_ratelimited(mapping);
2431
2432 if (unlikely(fatal_signal_pending(current))) {
2433 err = -EINTR;
2434 goto out;
2435 }
2436 }
2437
2438 /* page covers the boundary, find the boundary offset */
2439 if (index == curidx) {
2440 zerofrom = curpos & ~PAGE_MASK;
2441		/* if we are expanding the file, the last block will be filled */
2442 if (offset <= zerofrom) {
2443 goto out;
2444 }
2445 if (zerofrom & (blocksize-1)) {
2446 *bytes |= (blocksize-1);
2447 (*bytes)++;
2448 }
2449 len = offset - zerofrom;
2450
2451 err = pagecache_write_begin(file, mapping, curpos, len,
2452 AOP_FLAG_UNINTERRUPTIBLE,
2453 &page, &fsdata);
2454 if (err)
2455 goto out;
2456 zero_user(page, zerofrom, len);
2457 err = pagecache_write_end(file, mapping, curpos, len, len,
2458 page, fsdata);
2459 if (err < 0)
2460 goto out;
2461 BUG_ON(err != len);
2462 err = 0;
2463 }
2464out:
2465 return err;
2466}
2467
2468/*
2469 * For moronic filesystems that do not allow holes in files.
2470 * We may have to extend the file.
2471 */
2472int cont_write_begin(struct file *file, struct address_space *mapping,
2473 loff_t pos, unsigned len, unsigned flags,
2474 struct page **pagep, void **fsdata,
2475 get_block_t *get_block, loff_t *bytes)
2476{
2477 struct inode *inode = mapping->host;
2478 unsigned blocksize = 1 << inode->i_blkbits;
2479 unsigned zerofrom;
2480 int err;
2481
2482 err = cont_expand_zero(file, mapping, pos, bytes);
2483 if (err)
2484 return err;
2485
2486 zerofrom = *bytes & ~PAGE_MASK;
2487 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2488 *bytes |= (blocksize-1);
2489 (*bytes)++;
2490 }
2491
2492 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2493}
2494EXPORT_SYMBOL(cont_write_begin);
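
/*
 * Illustrative sketch (not compiled): a ->write_begin for a filesystem that
 * cannot represent holes, built on cont_write_begin().  MYFS_I() and its
 * i_allocated field (the "allocated so far" watermark passed as @bytes,
 * cf. FAT's mmu_private) as well as myfs_get_block() are hypothetical.
 */
#if 0	/* example only */
static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;

	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block, &MYFS_I(inode)->i_allocated);
}
#endif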
2495
2496int block_commit_write(struct page *page, unsigned from, unsigned to)
2497{
2498 struct inode *inode = page->mapping->host;
2499 __block_commit_write(inode,page,from,to);
2500 return 0;
2501}
2502EXPORT_SYMBOL(block_commit_write);
2503
2504/*
2505 * block_page_mkwrite() is not allowed to change the file size as it gets
2506 * called from a page fault handler when a page is first dirtied. Hence we must
2507 * be careful to check for EOF conditions here. We set the page up correctly
2508 * for a written page which means we get ENOSPC checking when writing into
2509 * holes and correct delalloc and unwritten extent mapping on filesystems that
2510 * support these features.
2511 *
2512 * We are not allowed to take the i_mutex here so we have to play games to
2513 * protect against truncate races as the page could now be beyond EOF. Because
2514 * truncate writes the inode size before removing pages, once we have the
2515 * page lock we can determine safely if the page is beyond EOF. If it is not
2516 * beyond EOF, then the page is guaranteed safe against truncation until we
2517 * unlock the page.
2518 *
2519 * Direct callers of this function should protect against filesystem freezing
2520 * using sb_start_pagefault() - sb_end_pagefault() functions.
2521 */
2522int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2523 get_block_t get_block)
2524{
2525 struct page *page = vmf->page;
2526 struct inode *inode = file_inode(vma->vm_file);
2527 unsigned long end;
2528 loff_t size;
2529 int ret;
2530
2531 lock_page(page);
2532 size = i_size_read(inode);
2533 if ((page->mapping != inode->i_mapping) ||
2534 (page_offset(page) > size)) {
2535 /* We overload EFAULT to mean page got truncated */
2536 ret = -EFAULT;
2537 goto out_unlock;
2538 }
2539
2540 /* page is wholly or partially inside EOF */
2541 if (((page->index + 1) << PAGE_SHIFT) > size)
2542 end = size & ~PAGE_MASK;
2543 else
2544 end = PAGE_SIZE;
2545
2546 ret = __block_write_begin(page, 0, end, get_block);
2547 if (!ret)
2548 ret = block_commit_write(page, 0, end);
2549
2550 if (unlikely(ret < 0))
2551 goto out_unlock;
2552 set_page_dirty(page);
2553 wait_for_stable_page(page);
2554 return 0;
2555out_unlock:
2556 unlock_page(page);
2557 return ret;
2558}
2559EXPORT_SYMBOL(block_page_mkwrite);
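
/*
 * Illustrative sketch (not compiled): a ->page_mkwrite handler wrapping
 * block_page_mkwrite() with the freeze protection described above.  The
 * exact ->page_mkwrite prototype varies between kernel versions;
 * myfs_get_block() is a hypothetical get_block_t.
 */
#if 0	/* example only */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
	int ret;

	sb_start_pagefault(sb);
	ret = block_page_mkwrite(vma, vmf, myfs_get_block);
	sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}
#endif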
2560
2561/*
2562 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2563 * immediately, while under the page lock. So it needs a special end_io
2564 * handler which does not touch the bh after unlocking it.
2565 */
2566static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2567{
2568 __end_buffer_read_notouch(bh, uptodate);
2569}
2570
2571/*
2572 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2573 * the page (converting it to a circular linked list and taking care of page
2574 * dirty races).
2575 */
2576static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2577{
2578 struct buffer_head *bh;
2579
2580 BUG_ON(!PageLocked(page));
2581
2582 spin_lock(&page->mapping->private_lock);
2583 bh = head;
2584 do {
2585 if (PageDirty(page))
2586 set_buffer_dirty(bh);
2587 if (!bh->b_this_page)
2588 bh->b_this_page = head;
2589 bh = bh->b_this_page;
2590 } while (bh != head);
2591 attach_page_buffers(page, head);
2592 spin_unlock(&page->mapping->private_lock);
2593}
2594
2595/*
2596 * On entry, no part of the page is uptodate.
2597 * On exit, the page is fully uptodate in the areas outside (from, to).
2598 * The filesystem needs to handle block truncation upon failure.
2599 */
2600int nobh_write_begin(struct address_space *mapping,
2601 loff_t pos, unsigned len, unsigned flags,
2602 struct page **pagep, void **fsdata,
2603 get_block_t *get_block)
2604{
2605 struct inode *inode = mapping->host;
2606 const unsigned blkbits = inode->i_blkbits;
2607 const unsigned blocksize = 1 << blkbits;
2608 struct buffer_head *head, *bh;
2609 struct page *page;
2610 pgoff_t index;
2611 unsigned from, to;
2612 unsigned block_in_page;
2613 unsigned block_start, block_end;
2614 sector_t block_in_file;
2615 int nr_reads = 0;
2616 int ret = 0;
2617 int is_mapped_to_disk = 1;
2618
2619 index = pos >> PAGE_SHIFT;
2620 from = pos & (PAGE_SIZE - 1);
2621 to = from + len;
2622
2623 page = grab_cache_page_write_begin(mapping, index, flags);
2624 if (!page)
2625 return -ENOMEM;
2626 *pagep = page;
2627 *fsdata = NULL;
2628
2629 if (page_has_buffers(page)) {
2630 ret = __block_write_begin(page, pos, len, get_block);
2631 if (unlikely(ret))
2632 goto out_release;
2633 return ret;
2634 }
2635
2636 if (PageMappedToDisk(page))
2637 return 0;
2638
2639 /*
2640 * Allocate buffers so that we can keep track of state, and potentially
2641 * attach them to the page if an error occurs. In the common case of
2642 * no error, they will just be freed again without ever being attached
2643 * to the page (which is all OK, because we're under the page lock).
2644 *
2645 * Be careful: the buffer linked list is a NULL terminated one, rather
2646 * than the circular one we're used to.
2647 */
2648 head = alloc_page_buffers(page, blocksize, 0);
2649 if (!head) {
2650 ret = -ENOMEM;
2651 goto out_release;
2652 }
2653
2654 block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2655
2656 /*
2657 * We loop across all blocks in the page, whether or not they are
2658 * part of the affected region. This is so we can discover if the
2659 * page is fully mapped-to-disk.
2660 */
2661 for (block_start = 0, block_in_page = 0, bh = head;
2662 block_start < PAGE_SIZE;
2663 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2664 int create;
2665
2666 block_end = block_start + blocksize;
2667 bh->b_state = 0;
2668 create = 1;
2669 if (block_start >= to)
2670 create = 0;
2671 ret = get_block(inode, block_in_file + block_in_page,
2672 bh, create);
2673 if (ret)
2674 goto failed;
2675 if (!buffer_mapped(bh))
2676 is_mapped_to_disk = 0;
2677 if (buffer_new(bh))
2678 clean_bdev_bh_alias(bh);
2679 if (PageUptodate(page)) {
2680 set_buffer_uptodate(bh);
2681 continue;
2682 }
2683 if (buffer_new(bh) || !buffer_mapped(bh)) {
2684 zero_user_segments(page, block_start, from,
2685 to, block_end);
2686 continue;
2687 }
2688 if (buffer_uptodate(bh))
2689 continue; /* reiserfs does this */
2690 if (block_start < from || block_end > to) {
2691 lock_buffer(bh);
2692 bh->b_end_io = end_buffer_read_nobh;
2693 submit_bh(REQ_OP_READ, 0, bh);
2694 nr_reads++;
2695 }
2696 }
2697
2698 if (nr_reads) {
2699 /*
2700 * The page is locked, so these buffers are protected from
2701 * any VM or truncate activity. Hence we don't need to care
2702 * for the buffer_head refcounts.
2703 */
2704 for (bh = head; bh; bh = bh->b_this_page) {
2705 wait_on_buffer(bh);
2706 if (!buffer_uptodate(bh))
2707 ret = -EIO;
2708 }
2709 if (ret)
2710 goto failed;
2711 }
2712
2713 if (is_mapped_to_disk)
2714 SetPageMappedToDisk(page);
2715
2716 *fsdata = head; /* to be released by nobh_write_end */
2717
2718 return 0;
2719
2720failed:
2721 BUG_ON(!ret);
2722 /*
2723 * Error recovery is a bit difficult. We need to zero out blocks that
2724 * were newly allocated, and dirty them to ensure they get written out.
2725 * Buffers need to be attached to the page at this point, otherwise
2726 * the handling of potential IO errors during writeout would be hard
2727 * (could try doing synchronous writeout, but what if that fails too?)
2728 */
2729 attach_nobh_buffers(page, head);
2730 page_zero_new_buffers(page, from, to);
2731
2732out_release:
2733 unlock_page(page);
2734 put_page(page);
2735 *pagep = NULL;
2736
2737 return ret;
2738}
2739EXPORT_SYMBOL(nobh_write_begin);
2740
2741int nobh_write_end(struct file *file, struct address_space *mapping,
2742 loff_t pos, unsigned len, unsigned copied,
2743 struct page *page, void *fsdata)
2744{
2745 struct inode *inode = page->mapping->host;
2746 struct buffer_head *head = fsdata;
2747 struct buffer_head *bh;
2748 BUG_ON(fsdata != NULL && page_has_buffers(page));
2749
2750 if (unlikely(copied < len) && head)
2751 attach_nobh_buffers(page, head);
2752 if (page_has_buffers(page))
2753 return generic_write_end(file, mapping, pos, len,
2754 copied, page, fsdata);
2755
2756 SetPageUptodate(page);
2757 set_page_dirty(page);
2758 if (pos+copied > inode->i_size) {
2759 i_size_write(inode, pos+copied);
2760 mark_inode_dirty(inode);
2761 }
2762
2763 unlock_page(page);
2764 put_page(page);
2765
2766 while (head) {
2767 bh = head;
2768 head = head->b_this_page;
2769 free_buffer_head(bh);
2770 }
2771
2772 return copied;
2773}
2774EXPORT_SYMBOL(nobh_write_end);
2775
2776/*
2777 * nobh_writepage() - based on block_write_full_page() except
2778 * that it tries to operate without attaching bufferheads to
2779 * the page.
2780 */
2781int nobh_writepage(struct page *page, get_block_t *get_block,
2782 struct writeback_control *wbc)
2783{
2784 struct inode * const inode = page->mapping->host;
2785 loff_t i_size = i_size_read(inode);
2786 const pgoff_t end_index = i_size >> PAGE_SHIFT;
2787 unsigned offset;
2788 int ret;
2789
2790 /* Is the page fully inside i_size? */
2791 if (page->index < end_index)
2792 goto out;
2793
2794 /* Is the page fully outside i_size? (truncate in progress) */
2795 offset = i_size & (PAGE_SIZE-1);
2796 if (page->index >= end_index+1 || !offset) {
2797 /*
2798 * The page may have dirty, unmapped buffers. For example,
2799 * they may have been added in ext3_writepage(). Make them
2800 * freeable here, so the page does not leak.
2801 */
2802#if 0
2803 /* Not really sure about this - do we need this ? */
2804 if (page->mapping->a_ops->invalidatepage)
2805 page->mapping->a_ops->invalidatepage(page, offset);
2806#endif
2807 unlock_page(page);
2808 return 0; /* don't care */
2809 }
2810
2811 /*
2812 * The page straddles i_size. It must be zeroed out on each and every
2813 * writepage invocation because it may be mmapped. "A file is mapped
2814 * in multiples of the page size. For a file that is not a multiple of
2815 * the page size, the remaining memory is zeroed when mapped, and
2816 * writes to that region are not written out to the file."
2817 */
2818 zero_user_segment(page, offset, PAGE_SIZE);
2819out:
2820 ret = mpage_writepage(page, get_block, wbc);
2821 if (ret == -EAGAIN)
2822 ret = __block_write_full_page(inode, page, get_block, wbc,
2823 end_buffer_async_write);
2824 return ret;
2825}
2826EXPORT_SYMBOL(nobh_writepage);
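
/*
 * Illustrative sketch (not compiled): wiring the nobh helpers into a set of
 * address_space operations.  All myfs_* names are hypothetical;
 * nobh_write_end() can be used as ->write_end directly because its
 * signature already matches.
 */
#if 0	/* example only */
static int myfs_nobh_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block);
}

static int myfs_nobh_writepage(struct page *page,
			       struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_nobh_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_nobh_writepage,
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};
#endif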
2827
2828int nobh_truncate_page(struct address_space *mapping,
2829 loff_t from, get_block_t *get_block)
2830{
2831 pgoff_t index = from >> PAGE_SHIFT;
2832 unsigned offset = from & (PAGE_SIZE-1);
2833 unsigned blocksize;
2834 sector_t iblock;
2835 unsigned length, pos;
2836 struct inode *inode = mapping->host;
2837 struct page *page;
2838 struct buffer_head map_bh;
2839 int err;
2840
2841 blocksize = 1 << inode->i_blkbits;
2842 length = offset & (blocksize - 1);
2843
2844 /* Block boundary? Nothing to do */
2845 if (!length)
2846 return 0;
2847
2848 length = blocksize - length;
2849 iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2850
2851 page = grab_cache_page(mapping, index);
2852 err = -ENOMEM;
2853 if (!page)
2854 goto out;
2855
2856 if (page_has_buffers(page)) {
2857has_buffers:
2858 unlock_page(page);
2859 put_page(page);
2860 return block_truncate_page(mapping, from, get_block);
2861 }
2862
2863 /* Find the buffer that contains "offset" */
2864 pos = blocksize;
2865 while (offset >= pos) {
2866 iblock++;
2867 pos += blocksize;
2868 }
2869
2870 map_bh.b_size = blocksize;
2871 map_bh.b_state = 0;
2872 err = get_block(inode, iblock, &map_bh, 0);
2873 if (err)
2874 goto unlock;
2875 /* unmapped? It's a hole - nothing to do */
2876 if (!buffer_mapped(&map_bh))
2877 goto unlock;
2878
2879 /* Ok, it's mapped. Make sure it's up-to-date */
2880 if (!PageUptodate(page)) {
2881 err = mapping->a_ops->readpage(NULL, page);
2882 if (err) {
2883 put_page(page);
2884 goto out;
2885 }
2886 lock_page(page);
2887 if (!PageUptodate(page)) {
2888 err = -EIO;
2889 goto unlock;
2890 }
2891 if (page_has_buffers(page))
2892 goto has_buffers;
2893 }
2894 zero_user(page, offset, length);
2895 set_page_dirty(page);
2896 err = 0;
2897
2898unlock:
2899 unlock_page(page);
2900 put_page(page);
2901out:
2902 return err;
2903}
2904EXPORT_SYMBOL(nobh_truncate_page);
2905
2906int block_truncate_page(struct address_space *mapping,
2907 loff_t from, get_block_t *get_block)
2908{
2909 pgoff_t index = from >> PAGE_SHIFT;
2910 unsigned offset = from & (PAGE_SIZE-1);
2911 unsigned blocksize;
2912 sector_t iblock;
2913 unsigned length, pos;
2914 struct inode *inode = mapping->host;
2915 struct page *page;
2916 struct buffer_head *bh;
2917 int err;
2918
2919 blocksize = 1 << inode->i_blkbits;
2920 length = offset & (blocksize - 1);
2921
2922 /* Block boundary? Nothing to do */
2923 if (!length)
2924 return 0;
2925
2926 length = blocksize - length;
2927 iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2928
2929 page = grab_cache_page(mapping, index);
2930 err = -ENOMEM;
2931 if (!page)
2932 goto out;
2933
2934 if (!page_has_buffers(page))
2935 create_empty_buffers(page, blocksize, 0);
2936
2937 /* Find the buffer that contains "offset" */
2938 bh = page_buffers(page);
2939 pos = blocksize;
2940 while (offset >= pos) {
2941 bh = bh->b_this_page;
2942 iblock++;
2943 pos += blocksize;
2944 }
2945
2946 err = 0;
2947 if (!buffer_mapped(bh)) {
2948 WARN_ON(bh->b_size != blocksize);
2949 err = get_block(inode, iblock, bh, 0);
2950 if (err)
2951 goto unlock;
2952 /* unmapped? It's a hole - nothing to do */
2953 if (!buffer_mapped(bh))
2954 goto unlock;
2955 }
2956
2957 /* Ok, it's mapped. Make sure it's up-to-date */
2958 if (PageUptodate(page))
2959 set_buffer_uptodate(bh);
2960
2961 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2962 err = -EIO;
2963 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2964 wait_on_buffer(bh);
2965 /* Uhhuh. Read error. Complain and punt. */
2966 if (!buffer_uptodate(bh))
2967 goto unlock;
2968 }
2969
2970 zero_user(page, offset, length);
2971 mark_buffer_dirty(bh);
2972 err = 0;
2973
2974unlock:
2975 unlock_page(page);
2976 put_page(page);
2977out:
2978 return err;
2979}
2980EXPORT_SYMBOL(block_truncate_page);
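
/*
 * Illustrative sketch (not compiled): a filesystem's truncate path would
 * typically zero the tail of the last partial block like this before
 * freeing blocks, so stale data is not exposed by a later extension.
 * myfs_get_block() is a hypothetical get_block_t.
 */
#if 0	/* example only */
static int myfs_truncate_tail(struct inode *inode, loff_t newsize)
{
	return block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
}
#endif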
2981
2982/*
2983 * The generic ->writepage function for buffer-backed address_spaces
2984 */
2985int block_write_full_page(struct page *page, get_block_t *get_block,
2986 struct writeback_control *wbc)
2987{
2988 struct inode * const inode = page->mapping->host;
2989 loff_t i_size = i_size_read(inode);
2990 const pgoff_t end_index = i_size >> PAGE_SHIFT;
2991 unsigned offset;
2992
2993 /* Is the page fully inside i_size? */
2994 if (page->index < end_index)
2995 return __block_write_full_page(inode, page, get_block, wbc,
2996 end_buffer_async_write);
2997
2998 /* Is the page fully outside i_size? (truncate in progress) */
2999 offset = i_size & (PAGE_SIZE-1);
3000 if (page->index >= end_index+1 || !offset) {
3001 /*
3002 * The page may have dirty, unmapped buffers. For example,
3003 * they may have been added in ext3_writepage(). Make them
3004 * freeable here, so the page does not leak.
3005 */
3006 do_invalidatepage(page, 0, PAGE_SIZE);
3007 unlock_page(page);
3008 return 0; /* don't care */
3009 }
3010
3011 /*
3012 * The page straddles i_size. It must be zeroed out on each and every
3013 * writepage invocation because it may be mmapped. "A file is mapped
3014 * in multiples of the page size. For a file that is not a multiple of
3015 * the page size, the remaining memory is zeroed when mapped, and
3016 * writes to that region are not written out to the file."
3017 */
3018 zero_user_segment(page, offset, PAGE_SIZE);
3019 return __block_write_full_page(inode, page, get_block, wbc,
3020 end_buffer_async_write);
3021}
3022EXPORT_SYMBOL(block_write_full_page);
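
/*
 * Illustrative sketch (not compiled): the usual ->writepage wrapper around
 * block_write_full_page().  myfs_get_block() is a hypothetical get_block_t.
 */
#if 0	/* example only */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
#endif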
3023
3024sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
3025 get_block_t *get_block)
3026{
3027 struct buffer_head tmp;
3028 struct inode *inode = mapping->host;
3029 tmp.b_state = 0;
3030 tmp.b_blocknr = 0;
3031 tmp.b_size = 1 << inode->i_blkbits;
3032 get_block(inode, block, &tmp, 0);
3033 return tmp.b_blocknr;
3034}
3035EXPORT_SYMBOL(generic_block_bmap);
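
/*
 * Illustrative sketch (not compiled): ->bmap via generic_block_bmap(), and
 * how the buffer-head based helpers above typically come together in one
 * address_space_operations table.  All myfs_* handlers refer to the
 * hypothetical sketches shown earlier.
 */
#if 0	/* example only */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,
	.writepage		= myfs_writepage,
	.write_begin		= myfs_write_begin,
	.write_end		= generic_write_end,
	.bmap			= myfs_bmap,
	.is_partially_uptodate	= block_is_partially_uptodate,
};
#endif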
3036
3037static void end_bio_bh_io_sync(struct bio *bio)
3038{
3039 struct buffer_head *bh = bio->bi_private;
3040
3041 if (unlikely(bio_flagged(bio, BIO_QUIET)))
3042 set_bit(BH_Quiet, &bh->b_state);
3043
3044 bh->b_end_io(bh, !bio->bi_error);
3045 bio_put(bio);
3046}
3047
3048/*
3049 * This allows us to do IO even on the odd last sectors
3050 * of a device, even if the block size is some multiple
3051 * of the physical sector size.
3052 *
3053 * We'll just truncate the bio to the size of the device,
3054 * and clear the end of the buffer head manually.
3055 *
3056 * Truly out-of-range accesses will turn into actual IO
3057 * errors; this only handles the "we need to be able to
3058 * do IO at the final sector" case.
3059 */
3060void guard_bio_eod(int op, struct bio *bio)
3061{
3062 sector_t maxsector;
3063 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
3064 unsigned truncated_bytes;
3065
3066 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
3067 if (!maxsector)
3068 return;
3069
3070 /*
3071 * If the *whole* IO is past the end of the device,
3072 * let it through, and the IO layer will turn it into
3073 * an EIO.
3074 */
3075 if (unlikely(bio->bi_iter.bi_sector >= maxsector))
3076 return;
3077
3078 maxsector -= bio->bi_iter.bi_sector;
3079 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
3080 return;
3081
3082 /* Uhhuh. We've got a bio that straddles the device size! */
3083 truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
3084
3085 /* Truncate the bio.. */
3086 bio->bi_iter.bi_size -= truncated_bytes;
3087 bvec->bv_len -= truncated_bytes;
3088
3089 /* ..and clear the end of the buffer for reads */
3090 if (op == REQ_OP_READ) {
3091 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
3092 truncated_bytes);
3093 }
3094}
3095
3096static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3097 unsigned long bio_flags, struct writeback_control *wbc)
3098{
3099 struct bio *bio;
3100
3101 BUG_ON(!buffer_locked(bh));
3102 BUG_ON(!buffer_mapped(bh));
3103 BUG_ON(!bh->b_end_io);
3104 BUG_ON(buffer_delay(bh));
3105 BUG_ON(buffer_unwritten(bh));
3106
3107 /*
3108 * Only clear out a write error when rewriting
3109 */
3110 if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
3111 clear_buffer_write_io_error(bh);
3112
3113 /*
3114 * from here on down, it's all bio -- do the initial mapping,
3115 * submit_bio -> generic_make_request may further map this bio around
3116 */
3117 bio = bio_alloc(GFP_NOIO, 1);
3118
3119 if (wbc) {
3120 wbc_init_bio(wbc, bio);
3121 wbc_account_io(wbc, bh->b_page, bh->b_size);
3122 }
3123
3124 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3125 bio->bi_bdev = bh->b_bdev;
3126
3127 bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3128 BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3129
3130 bio->bi_end_io = end_bio_bh_io_sync;
3131 bio->bi_private = bh;
3132 bio->bi_flags |= bio_flags;
3133
3134 /* Take care of bh's that straddle the end of the device */
3135 guard_bio_eod(op, bio);
3136
3137 if (buffer_meta(bh))
3138 op_flags |= REQ_META;
3139 if (buffer_prio(bh))
3140 op_flags |= REQ_PRIO;
3141 bio_set_op_attrs(bio, op, op_flags);
3142
3143 submit_bio(bio);
3144 return 0;
3145}
3146
3147int _submit_bh(int op, int op_flags, struct buffer_head *bh,
3148 unsigned long bio_flags)
3149{
3150 return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
3151}
3152EXPORT_SYMBOL_GPL(_submit_bh);
3153
3154int submit_bh(int op, int op_flags, struct buffer_head *bh)
3155{
3156 return submit_bh_wbc(op, op_flags, bh, 0, NULL);
3157}
3158EXPORT_SYMBOL(submit_bh);
3159
3160/**
3161 * ll_rw_block: low-level access to block devices (DEPRECATED)
3162 * @op: whether to %READ or %WRITE
3163 * @op_flags: req_flag_bits
3164 * @nr: number of &struct buffer_heads in the array
3165 * @bhs: array of pointers to &struct buffer_head
3166 *
3167 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3168 * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3169 * @op_flags contains flags modifying the detailed I/O behavior, most notably
3170 * %REQ_RAHEAD.
3171 *
3172 * This function drops any buffer that it cannot get a lock on (with the
3173 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3174 * request, and any buffer that appears to be up-to-date when doing a read
3175 * request. Further it marks as clean buffers that are processed for
3176 * writing (the buffer cache won't assume that they are actually clean
3177 * until the buffer gets unlocked).
3178 *
3179 * ll_rw_block sets b_end_io to simple completion handler that marks
3180 * ll_rw_block sets b_end_io to a simple completion handler that marks
3181 * any waiters.
3182 *
3183 * All of the buffers must be for the same device, and their size must be a
3184 * multiple of the current approved block size for the device.
3185 */
3186void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
3187{
3188 int i;
3189
3190 for (i = 0; i < nr; i++) {
3191 struct buffer_head *bh = bhs[i];
3192
3193 if (!trylock_buffer(bh))
3194 continue;
3195 if (op == WRITE) {
3196 if (test_clear_buffer_dirty(bh)) {
3197 bh->b_end_io = end_buffer_write_sync;
3198 get_bh(bh);
3199 submit_bh(op, op_flags, bh);
3200 continue;
3201 }
3202 } else {
3203 if (!buffer_uptodate(bh)) {
3204 bh->b_end_io = end_buffer_read_sync;
3205 get_bh(bh);
3206 submit_bh(op, op_flags, bh);
3207 continue;
3208 }
3209 }
3210 unlock_buffer(bh);
3211 }
3212}
3213EXPORT_SYMBOL(ll_rw_block);
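
/*
 * Illustrative sketch (not compiled): batching reads with ll_rw_block().
 * ll_rw_block() only starts the I/O (and silently skips buffers it cannot
 * lock), so callers must wait and re-check buffer_uptodate() themselves.
 * The myfs_* name is hypothetical.
 */
#if 0	/* example only */
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(REQ_OP_READ, 0, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}
#endif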
3214
3215void write_dirty_buffer(struct buffer_head *bh, int op_flags)
3216{
3217 lock_buffer(bh);
3218 if (!test_clear_buffer_dirty(bh)) {
3219 unlock_buffer(bh);
3220 return;
3221 }
3222 bh->b_end_io = end_buffer_write_sync;
3223 get_bh(bh);
3224 submit_bh(REQ_OP_WRITE, op_flags, bh);
3225}
3226EXPORT_SYMBOL(write_dirty_buffer);
3227
3228/*
3229 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3230 * and then start new I/O and then wait upon it. The caller must have a ref on
3231 * the buffer_head.
3232 */
3233int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
3234{
3235 int ret = 0;
3236
3237 WARN_ON(atomic_read(&bh->b_count) < 1);
3238 lock_buffer(bh);
3239 if (test_clear_buffer_dirty(bh)) {
3240 get_bh(bh);
3241 bh->b_end_io = end_buffer_write_sync;
3242 ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
3243 wait_on_buffer(bh);
3244 if (!ret && !buffer_uptodate(bh))
3245 ret = -EIO;
3246 } else {
3247 unlock_buffer(bh);
3248 }
3249 return ret;
3250}
3251EXPORT_SYMBOL(__sync_dirty_buffer);
3252
3253int sync_dirty_buffer(struct buffer_head *bh)
3254{
3255 return __sync_dirty_buffer(bh, REQ_SYNC);
3256}
3257EXPORT_SYMBOL(sync_dirty_buffer);
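
/*
 * Illustrative sketch (not compiled): synchronously committing a modified
 * metadata block, e.g. an on-disk superblock held in a buffer_head the
 * caller already has a reference on.  The myfs_* name is hypothetical.
 */
#if 0	/* example only */
static int myfs_commit_super(struct buffer_head *sb_bh)
{
	mark_buffer_dirty(sb_bh);
	return sync_dirty_buffer(sb_bh);	/* waits; -EIO on write error */
}
#endif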
3258
3259/*
3260 * try_to_free_buffers() checks if all the buffers on this particular page
3261 * are unused, and releases them if so.
3262 *
3263 * Exclusion against try_to_free_buffers may be obtained by either
3264 * locking the page or by holding its mapping's private_lock.
3265 *
3266 * If the page is dirty but all the buffers are clean then we need to
3267 * be sure to mark the page clean as well. This is because the page
3268 * may be against a block device, and a later reattachment of buffers
3269 * to a dirty page will set *all* buffers dirty. Which would corrupt
3270 * filesystem data on the same device.
3271 *
3272 * The same applies to regular filesystem pages: if all the buffers are
3273 * clean then we set the page clean and proceed. To do that, we require
3274 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3275 * private_lock.
3276 *
3277 * try_to_free_buffers() is non-blocking.
3278 */
3279static inline int buffer_busy(struct buffer_head *bh)
3280{
3281 return atomic_read(&bh->b_count) |
3282 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3283}
3284
3285static int
3286drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3287{
3288 struct buffer_head *head = page_buffers(page);
3289 struct buffer_head *bh;
3290
3291 bh = head;
3292 do {
3293 if (buffer_write_io_error(bh) && page->mapping)
3294 mapping_set_error(page->mapping, -EIO);
3295 if (buffer_busy(bh))
3296 goto failed;
3297 bh = bh->b_this_page;
3298 } while (bh != head);
3299
3300 do {
3301 struct buffer_head *next = bh->b_this_page;
3302
3303 if (bh->b_assoc_map)
3304 __remove_assoc_queue(bh);
3305 bh = next;
3306 } while (bh != head);
3307 *buffers_to_free = head;
3308 __clear_page_buffers(page);
3309 return 1;
3310failed:
3311 return 0;
3312}
3313
3314int try_to_free_buffers(struct page *page)
3315{
3316 struct address_space * const mapping = page->mapping;
3317 struct buffer_head *buffers_to_free = NULL;
3318 int ret = 0;
3319
3320 BUG_ON(!PageLocked(page));
3321 if (PageWriteback(page))
3322 return 0;
3323
3324 if (mapping == NULL) { /* can this still happen? */
3325 ret = drop_buffers(page, &buffers_to_free);
3326 goto out;
3327 }
3328
3329 spin_lock(&mapping->private_lock);
3330 ret = drop_buffers(page, &buffers_to_free);
3331
3332 /*
3333 * If the filesystem writes its buffers by hand (eg ext3)
3334 * then we can have clean buffers against a dirty page. We
3335 * clean the page here; otherwise the VM will never notice
3336 * that the filesystem did any IO at all.
3337 *
3338 * Also, during truncate, discard_buffer will have marked all
3339 * the page's buffers clean. We discover that here and clean
3340 * the page also.
3341 *
3342 * private_lock must be held over this entire operation in order
3343 * to synchronise against __set_page_dirty_buffers and prevent the
3344 * dirty bit from being lost.
3345 */
3346 if (ret)
3347 cancel_dirty_page(page);
3348 spin_unlock(&mapping->private_lock);
3349out:
3350 if (buffers_to_free) {
3351 struct buffer_head *bh = buffers_to_free;
3352
3353 do {
3354 struct buffer_head *next = bh->b_this_page;
3355 free_buffer_head(bh);
3356 bh = next;
3357 } while (bh != buffers_to_free);
3358 }
3359 return ret;
3360}
3361EXPORT_SYMBOL(try_to_free_buffers);
3362
3363/*
3364 * There are no bdflush tunables left. But distributions are
3365 * still running obsolete flush daemons, so we terminate them here.
3366 *
3367 * Use of bdflush() is deprecated and will be removed in a future kernel.
3368 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3369 */
3370SYSCALL_DEFINE2(bdflush, int, func, long, data)
3371{
3372 static int msg_count;
3373
3374 if (!capable(CAP_SYS_ADMIN))
3375 return -EPERM;
3376
3377 if (msg_count < 5) {
3378 msg_count++;
3379 printk(KERN_INFO
3380 "warning: process `%s' used the obsolete bdflush"
3381 " system call\n", current->comm);
3382 printk(KERN_INFO "Fix your initscripts?\n");
3383 }
3384
3385 if (func == 1)
3386 do_exit(0);
3387 return 0;
3388}
3389
3390/*
3391 * Buffer-head allocation
3392 */
3393static struct kmem_cache *bh_cachep __read_mostly;
3394
3395/*
3396 * Once the number of bh's in the machine exceeds this level, we start
3397 * stripping them in writeback.
3398 */
3399static unsigned long max_buffer_heads;
3400
3401int buffer_heads_over_limit;
3402
3403struct bh_accounting {
3404 int nr; /* Number of live bh's */
3405 int ratelimit; /* Limit cacheline bouncing */
3406};
3407
3408static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3409
3410static void recalc_bh_state(void)
3411{
3412 int i;
3413 int tot = 0;
3414
3415 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3416 return;
3417 __this_cpu_write(bh_accounting.ratelimit, 0);
3418 for_each_online_cpu(i)
3419 tot += per_cpu(bh_accounting, i).nr;
3420 buffer_heads_over_limit = (tot > max_buffer_heads);
3421}
3422
3423struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3424{
3425 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3426 if (ret) {
3427 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3428 preempt_disable();
3429 __this_cpu_inc(bh_accounting.nr);
3430 recalc_bh_state();
3431 preempt_enable();
3432 }
3433 return ret;
3434}
3435EXPORT_SYMBOL(alloc_buffer_head);
3436
3437void free_buffer_head(struct buffer_head *bh)
3438{
3439 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3440 kmem_cache_free(bh_cachep, bh);
3441 preempt_disable();
3442 __this_cpu_dec(bh_accounting.nr);
3443 recalc_bh_state();
3444 preempt_enable();
3445}
3446EXPORT_SYMBOL(free_buffer_head);
3447
3448static int buffer_exit_cpu_dead(unsigned int cpu)
3449{
3450 int i;
3451 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3452
3453 for (i = 0; i < BH_LRU_SIZE; i++) {
3454 brelse(b->bhs[i]);
3455 b->bhs[i] = NULL;
3456 }
3457 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3458 per_cpu(bh_accounting, cpu).nr = 0;
3459 return 0;
3460}
3461
3462/**
3463 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3464 * @bh: struct buffer_head
3465 *
3466 * Return true if the buffer is up-to-date and false,
3467 * with the buffer locked, if not.
3468 */
3469int bh_uptodate_or_lock(struct buffer_head *bh)
3470{
3471 if (!buffer_uptodate(bh)) {
3472 lock_buffer(bh);
3473 if (!buffer_uptodate(bh))
3474 return 0;
3475 unlock_buffer(bh);
3476 }
3477 return 1;
3478}
3479EXPORT_SYMBOL(bh_uptodate_or_lock);
3480
3481/**
3482 * bh_submit_read - Submit a locked buffer for reading
3483 * @bh: struct buffer_head
3484 *
3485 * Returns zero on success and -EIO on error.
3486 */
3487int bh_submit_read(struct buffer_head *bh)
3488{
3489 BUG_ON(!buffer_locked(bh));
3490
3491 if (buffer_uptodate(bh)) {
3492 unlock_buffer(bh);
3493 return 0;
3494 }
3495
3496 get_bh(bh);
3497 bh->b_end_io = end_buffer_read_sync;
3498 submit_bh(REQ_OP_READ, 0, bh);
3499 wait_on_buffer(bh);
3500 if (buffer_uptodate(bh))
3501 return 0;
3502 return -EIO;
3503}
3504EXPORT_SYMBOL(bh_submit_read);
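
/*
 * Illustrative sketch (not compiled): the usual pairing of
 * bh_uptodate_or_lock() and bh_submit_read() to read a buffer only when
 * needed.  The myfs_* name is hypothetical.
 */
#if 0	/* example only */
static int myfs_read_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;	/* already uptodate, buffer left unlocked */

	/* Buffer is now locked and not uptodate: submit the read and wait. */
	return bh_submit_read(bh);
}
#endif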
3505
3506void __init buffer_init(void)
3507{
3508 unsigned long nrpages;
3509 int ret;
3510
3511 bh_cachep = kmem_cache_create("buffer_head",
3512 sizeof(struct buffer_head), 0,
3513 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3514 SLAB_MEM_SPREAD),
3515 NULL);
3516
3517 /*
3518 * Limit the bh occupancy to 10% of ZONE_NORMAL
3519 */
3520 nrpages = (nr_free_buffer_pages() * 10) / 100;
3521 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3522 ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3523 NULL, buffer_exit_cpu_dead);
3524 WARN_ON(ret < 0);
3525}
1/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
27#include <linux/capability.h>
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/module.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
37#include <linux/task_io_accounting_ops.h>
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
43#include <linux/bit_spinlock.h>
44#include <linux/cleancache.h>
45
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47
48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49
50inline void
51init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52{
53 bh->b_end_io = handler;
54 bh->b_private = private;
55}
56EXPORT_SYMBOL(init_buffer);
57
58static int sleep_on_buffer(void *word)
59{
60 io_schedule();
61 return 0;
62}
63
64void __lock_buffer(struct buffer_head *bh)
65{
66 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
67 TASK_UNINTERRUPTIBLE);
68}
69EXPORT_SYMBOL(__lock_buffer);
70
71void unlock_buffer(struct buffer_head *bh)
72{
73 clear_bit_unlock(BH_Lock, &bh->b_state);
74 smp_mb__after_clear_bit();
75 wake_up_bit(&bh->b_state, BH_Lock);
76}
77EXPORT_SYMBOL(unlock_buffer);
78
79/*
80 * Block until a buffer comes unlocked. This doesn't stop it
81 * from becoming locked again - you have to lock it yourself
82 * if you want to preserve its state.
83 */
84void __wait_on_buffer(struct buffer_head * bh)
85{
86 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
87}
88EXPORT_SYMBOL(__wait_on_buffer);
89
90static void
91__clear_page_buffers(struct page *page)
92{
93 ClearPagePrivate(page);
94 set_page_private(page, 0);
95 page_cache_release(page);
96}
97
98
99static int quiet_error(struct buffer_head *bh)
100{
101 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
102 return 0;
103 return 1;
104}
105
106
107static void buffer_io_error(struct buffer_head *bh)
108{
109 char b[BDEVNAME_SIZE];
110 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
111 bdevname(bh->b_bdev, b),
112 (unsigned long long)bh->b_blocknr);
113}
114
115/*
116 * End-of-IO handler helper function which does not touch the bh after
117 * unlocking it.
118 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
119 * a race there is benign: unlock_buffer() only use the bh's address for
120 * hashing after unlocking the buffer, so it doesn't actually touch the bh
121 * itself.
122 */
123static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
124{
125 if (uptodate) {
126 set_buffer_uptodate(bh);
127 } else {
128 /* This happens, due to failed READA attempts. */
129 clear_buffer_uptodate(bh);
130 }
131 unlock_buffer(bh);
132}
133
134/*
135 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
136 * unlock the buffer. This is what ll_rw_block uses too.
137 */
138void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
139{
140 __end_buffer_read_notouch(bh, uptodate);
141 put_bh(bh);
142}
143EXPORT_SYMBOL(end_buffer_read_sync);
144
145void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
146{
147 char b[BDEVNAME_SIZE];
148
149 if (uptodate) {
150 set_buffer_uptodate(bh);
151 } else {
152 if (!quiet_error(bh)) {
153 buffer_io_error(bh);
154 printk(KERN_WARNING "lost page write due to "
155 "I/O error on %s\n",
156 bdevname(bh->b_bdev, b));
157 }
158 set_buffer_write_io_error(bh);
159 clear_buffer_uptodate(bh);
160 }
161 unlock_buffer(bh);
162 put_bh(bh);
163}
164EXPORT_SYMBOL(end_buffer_write_sync);
165
166/*
167 * Various filesystems appear to want __find_get_block to be non-blocking.
168 * But it's the page lock which protects the buffers. To get around this,
169 * we get exclusion from try_to_free_buffers with the blockdev mapping's
170 * private_lock.
171 *
172 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
173 * may be quite high. This code could TryLock the page, and if that
174 * succeeds, there is no need to take private_lock. (But if
175 * private_lock is contended then so is mapping->tree_lock).
176 */
177static struct buffer_head *
178__find_get_block_slow(struct block_device *bdev, sector_t block)
179{
180 struct inode *bd_inode = bdev->bd_inode;
181 struct address_space *bd_mapping = bd_inode->i_mapping;
182 struct buffer_head *ret = NULL;
183 pgoff_t index;
184 struct buffer_head *bh;
185 struct buffer_head *head;
186 struct page *page;
187 int all_mapped = 1;
188
189 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
190 page = find_get_page(bd_mapping, index);
191 if (!page)
192 goto out;
193
194 spin_lock(&bd_mapping->private_lock);
195 if (!page_has_buffers(page))
196 goto out_unlock;
197 head = page_buffers(page);
198 bh = head;
199 do {
200 if (!buffer_mapped(bh))
201 all_mapped = 0;
202 else if (bh->b_blocknr == block) {
203 ret = bh;
204 get_bh(bh);
205 goto out_unlock;
206 }
207 bh = bh->b_this_page;
208 } while (bh != head);
209
210 /* we might be here because some of the buffers on this page are
211 * not mapped. This is due to various races between
212 * file io on the block device and getblk. It gets dealt with
213 * elsewhere, don't buffer_error if we had some unmapped buffers
214 */
215 if (all_mapped) {
216 printk("__find_get_block_slow() failed. "
217 "block=%llu, b_blocknr=%llu\n",
218 (unsigned long long)block,
219 (unsigned long long)bh->b_blocknr);
220 printk("b_state=0x%08lx, b_size=%zu\n",
221 bh->b_state, bh->b_size);
222 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
223 }
224out_unlock:
225 spin_unlock(&bd_mapping->private_lock);
226 page_cache_release(page);
227out:
228 return ret;
229}
230
231/* If invalidate_buffers() will trash dirty buffers, it means some kind
232 of fs corruption is going on. Trashing dirty data always imply losing
233 information that was supposed to be just stored on the physical layer
234 by the user.
235
236 Thus invalidate_buffers in general usage is not allwowed to trash
237 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
238 be preserved. These buffers are simply skipped.
239
240 We also skip buffers which are still in use. For example this can
241 happen if a userspace program is reading the block device.
242
243 NOTE: In the case where the user removed a removable-media-disk even if
244 there's still dirty data not synced on disk (due a bug in the device driver
245 or due an error of the user), by not destroying the dirty buffers we could
246 generate corruption also on the next media inserted, thus a parameter is
247 necessary to handle this case in the most safe way possible (trying
248 to not corrupt also the new disk inserted with the data belonging to
249 the old now corrupted disk). Also for the ramdisk the natural thing
250 to do in order to release the ramdisk memory is to destroy dirty buffers.
251
252 These are two special cases. Normal usage imply the device driver
253 to issue a sync on the device (without waiting I/O completion) and
254 then an invalidate_buffers call that doesn't trash dirty buffers.
255
256 For handling cache coherency with the blkdev pagecache the 'update' case
257 is been introduced. It is needed to re-read from disk any pinned
258 buffer. NOTE: re-reading from disk is destructive so we can do it only
259 when we assume nobody is changing the buffercache under our I/O and when
260 we think the disk contains more recent information than the buffercache.
261 The update == 1 pass marks the buffers we need to update, the update == 2
262 pass does the actual I/O. */
263void invalidate_bdev(struct block_device *bdev)
264{
265 struct address_space *mapping = bdev->bd_inode->i_mapping;
266
267 if (mapping->nrpages == 0)
268 return;
269
270 invalidate_bh_lrus();
271 lru_add_drain_all(); /* make sure all lru add caches are flushed */
272 invalidate_mapping_pages(mapping, 0, -1);
273 /* 99% of the time, we don't need to flush the cleancache on the bdev.
274 * But, for the strange corners, lets be cautious
275 */
276 cleancache_flush_inode(mapping);
277}
278EXPORT_SYMBOL(invalidate_bdev);
279
280/*
281 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
282 */
283static void free_more_memory(void)
284{
285 struct zone *zone;
286 int nid;
287
288 wakeup_flusher_threads(1024);
289 yield();
290
291 for_each_online_node(nid) {
292 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
293 gfp_zone(GFP_NOFS), NULL,
294 &zone);
295 if (zone)
296 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
297 GFP_NOFS, NULL);
298 }
299}
300
301/*
302 * I/O completion handler for block_read_full_page() - pages
303 * which come unlocked at the end of I/O.
304 */
305static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
306{
307 unsigned long flags;
308 struct buffer_head *first;
309 struct buffer_head *tmp;
310 struct page *page;
311 int page_uptodate = 1;
312
313 BUG_ON(!buffer_async_read(bh));
314
315 page = bh->b_page;
316 if (uptodate) {
317 set_buffer_uptodate(bh);
318 } else {
319 clear_buffer_uptodate(bh);
320 if (!quiet_error(bh))
321 buffer_io_error(bh);
322 SetPageError(page);
323 }
324
325 /*
326 * Be _very_ careful from here on. Bad things can happen if
327 * two buffer heads end IO at almost the same time and both
328 * decide that the page is now completely done.
329 */
330 first = page_buffers(page);
331 local_irq_save(flags);
332 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
333 clear_buffer_async_read(bh);
334 unlock_buffer(bh);
335 tmp = bh;
336 do {
337 if (!buffer_uptodate(tmp))
338 page_uptodate = 0;
339 if (buffer_async_read(tmp)) {
340 BUG_ON(!buffer_locked(tmp));
341 goto still_busy;
342 }
343 tmp = tmp->b_this_page;
344 } while (tmp != bh);
345 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
346 local_irq_restore(flags);
347
348 /*
349 * If none of the buffers had errors and they are all
350 * uptodate then we can set the page uptodate.
351 */
352 if (page_uptodate && !PageError(page))
353 SetPageUptodate(page);
354 unlock_page(page);
355 return;
356
357still_busy:
358 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
359 local_irq_restore(flags);
360 return;
361}
362
363/*
364 * Completion handler for block_write_full_page() - pages which are unlocked
365 * during I/O, and which have PageWriteback cleared upon I/O completion.
366 */
367void end_buffer_async_write(struct buffer_head *bh, int uptodate)
368{
369 char b[BDEVNAME_SIZE];
370 unsigned long flags;
371 struct buffer_head *first;
372 struct buffer_head *tmp;
373 struct page *page;
374
375 BUG_ON(!buffer_async_write(bh));
376
377 page = bh->b_page;
378 if (uptodate) {
379 set_buffer_uptodate(bh);
380 } else {
381 if (!quiet_error(bh)) {
382 buffer_io_error(bh);
383 printk(KERN_WARNING "lost page write due to "
384 "I/O error on %s\n",
385 bdevname(bh->b_bdev, b));
386 }
387 set_bit(AS_EIO, &page->mapping->flags);
388 set_buffer_write_io_error(bh);
389 clear_buffer_uptodate(bh);
390 SetPageError(page);
391 }
392
393 first = page_buffers(page);
394 local_irq_save(flags);
395 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
396
397 clear_buffer_async_write(bh);
398 unlock_buffer(bh);
399 tmp = bh->b_this_page;
400 while (tmp != bh) {
401 if (buffer_async_write(tmp)) {
402 BUG_ON(!buffer_locked(tmp));
403 goto still_busy;
404 }
405 tmp = tmp->b_this_page;
406 }
407 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
408 local_irq_restore(flags);
409 end_page_writeback(page);
410 return;
411
412still_busy:
413 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
414 local_irq_restore(flags);
415 return;
416}
417EXPORT_SYMBOL(end_buffer_async_write);
418
419/*
420 * If a page's buffers are under async read-in (end_buffer_async_read
421 * completion) then there is a possibility that another thread of
422 * control could lock one of the buffers after it has completed
423 * but while some of the other buffers have not completed. This
424 * locked buffer would confuse end_buffer_async_read() into not unlocking
425 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
426 * that this buffer is not under async I/O.
427 *
428 * The page comes unlocked when it has no locked buffer_async buffers
429 * left.
430 *
431 * PageLocked prevents anyone from starting new async I/O reads against
432 * any of the buffers.
433 *
434 * PageWriteback is used to prevent simultaneous writeout of the same
435 * page.
436 *
437 * PageLocked prevents anyone from starting writeback of a page which is
438 * under read I/O (PageWriteback is only ever set against a locked page).
439 */
440static void mark_buffer_async_read(struct buffer_head *bh)
441{
442 bh->b_end_io = end_buffer_async_read;
443 set_buffer_async_read(bh);
444}
445
446static void mark_buffer_async_write_endio(struct buffer_head *bh,
447 bh_end_io_t *handler)
448{
449 bh->b_end_io = handler;
450 set_buffer_async_write(bh);
451}
452
453void mark_buffer_async_write(struct buffer_head *bh)
454{
455 mark_buffer_async_write_endio(bh, end_buffer_async_write);
456}
457EXPORT_SYMBOL(mark_buffer_async_write);
458
459
460/*
461 * fs/buffer.c contains helper functions for buffer-backed address space's
462 * fsync functions. A common requirement for buffer-based filesystems is
463 * that certain data from the backing blockdev needs to be written out for
464 * a successful fsync(). For example, ext2 indirect blocks need to be
465 * written back and waited upon before fsync() returns.
466 *
467 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
468 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
469 * management of a list of dependent buffers at ->i_mapping->private_list.
470 *
471 * Locking is a little subtle: try_to_free_buffers() will remove buffers
472 * from their controlling inode's queue when they are being freed. But
473 * try_to_free_buffers() will be operating against the *blockdev* mapping
474 * at the time, not against the S_ISREG file which depends on those buffers.
475 * So the locking for private_list is via the private_lock in the address_space
476 * which backs the buffers. Which is different from the address_space
477 * against which the buffers are listed. So for a particular address_space,
478 * mapping->private_lock does *not* protect mapping->private_list! In fact,
479 * mapping->private_list will always be protected by the backing blockdev's
480 * ->private_lock.
481 *
482 * Which introduces a requirement: all buffers on an address_space's
483 * ->private_list must be from the same address_space: the blockdev's.
484 *
485 * address_spaces which do not place buffers at ->private_list via these
486 * utility functions are free to use private_lock and private_list for
487 * whatever they want. The only requirement is that list_empty(private_list)
488 * be true at clear_inode() time.
489 *
490 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
491 * filesystems should do that. invalidate_inode_buffers() should just go
492 * BUG_ON(!list_empty).
493 *
494 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
495 * take an address_space, not an inode. And it should be called
496 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
497 * queued up.
498 *
499 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
500 * list if it is already on a list. Because if the buffer is on a list,
501 * it *must* already be on the right one. If not, the filesystem is being
502 * silly. This will save a ton of locking. But first we have to ensure
503 * that buffers are taken *off* the old inode's list when they are freed
504 * (presumably in truncate). That requires careful auditing of all
505 * filesystems (do it inside bforget()). It could also be done by bringing
506 * b_inode back.
507 */
508
509/*
510 * The buffer's backing address_space's private_lock must be held
511 */
512static void __remove_assoc_queue(struct buffer_head *bh)
513{
514 list_del_init(&bh->b_assoc_buffers);
515 WARN_ON(!bh->b_assoc_map);
516 if (buffer_write_io_error(bh))
517 set_bit(AS_EIO, &bh->b_assoc_map->flags);
518 bh->b_assoc_map = NULL;
519}
520
521int inode_has_buffers(struct inode *inode)
522{
523 return !list_empty(&inode->i_data.private_list);
524}
525
526/*
527 * osync is designed to support O_SYNC io. It waits synchronously for
528 * all already-submitted IO to complete, but does not queue any new
529 * writes to the disk.
530 *
531 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
532 * you dirty the buffers, and then use osync_inode_buffers to wait for
533 * completion. Any other dirty buffers which are not yet queued for
534 * write will not be flushed to disk by the osync.
535 */
536static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
537{
538 struct buffer_head *bh;
539 struct list_head *p;
540 int err = 0;
541
542 spin_lock(lock);
543repeat:
544 list_for_each_prev(p, list) {
545 bh = BH_ENTRY(p);
546 if (buffer_locked(bh)) {
547 get_bh(bh);
548 spin_unlock(lock);
549 wait_on_buffer(bh);
550 if (!buffer_uptodate(bh))
551 err = -EIO;
552 brelse(bh);
553 spin_lock(lock);
554 goto repeat;
555 }
556 }
557 spin_unlock(lock);
558 return err;
559}
560
561static void do_thaw_one(struct super_block *sb, void *unused)
562{
563 char b[BDEVNAME_SIZE];
564 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
565 printk(KERN_WARNING "Emergency Thaw on %s\n",
566 bdevname(sb->s_bdev, b));
567}
568
569static void do_thaw_all(struct work_struct *work)
570{
571 iterate_supers(do_thaw_one, NULL);
572 kfree(work);
573 printk(KERN_WARNING "Emergency Thaw complete\n");
574}
575
576/**
577 * emergency_thaw_all -- forcibly thaw every frozen filesystem
578 *
579 * Used for emergency unfreeze of all filesystems via SysRq
580 */
581void emergency_thaw_all(void)
582{
583 struct work_struct *work;
584
585 work = kmalloc(sizeof(*work), GFP_ATOMIC);
586 if (work) {
587 INIT_WORK(work, do_thaw_all);
588 schedule_work(work);
589 }
590}
591
592/**
593 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
594 * @mapping: the mapping which wants those buffers written
595 *
596 * Starts I/O against the buffers at mapping->private_list, and waits upon
597 * that I/O.
598 *
599 * Basically, this is a convenience function for fsync().
600 * @mapping is a file or directory which needs those buffers to be written for
601 * a successful fsync().
602 */
603int sync_mapping_buffers(struct address_space *mapping)
604{
605 struct address_space *buffer_mapping = mapping->assoc_mapping;
606
607 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
608 return 0;
609
610 return fsync_buffers_list(&buffer_mapping->private_lock,
611 &mapping->private_list);
612}
613EXPORT_SYMBOL(sync_mapping_buffers);
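
/*
 * Illustrative sketch (hypothetical filesystem, not part of this file):
 * a simple buffer-backed ->fsync typically writes the file data first
 * and then pushes out the associated metadata buffers:
 *
 *	int myfs_fsync(struct file *file, loff_t start, loff_t end,
 *		       int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err, ret;
 *
 *		err = filemap_write_and_wait_range(mapping, start, end);
 *		ret = sync_mapping_buffers(mapping);
 *		return err ? err : ret;
 *	}
 *
 * generic_file_fsync() provides roughly this pattern (plus writing the
 * inode itself) for filesystems that need nothing special.
 */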
614
615/*
616 * Called when we've recently written block `bblock', and it is known that
617 * `bblock' was for a buffer_boundary() buffer. This means that the block at
618 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
619 * dirty, schedule it for IO. So that indirects merge nicely with their data.
620 */
621void write_boundary_block(struct block_device *bdev,
622 sector_t bblock, unsigned blocksize)
623{
624 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
625 if (bh) {
626 if (buffer_dirty(bh))
627 ll_rw_block(WRITE, 1, &bh);
628 put_bh(bh);
629 }
630}
631
632void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
633{
634 struct address_space *mapping = inode->i_mapping;
635 struct address_space *buffer_mapping = bh->b_page->mapping;
636
637 mark_buffer_dirty(bh);
638 if (!mapping->assoc_mapping) {
639 mapping->assoc_mapping = buffer_mapping;
640 } else {
641 BUG_ON(mapping->assoc_mapping != buffer_mapping);
642 }
643 if (!bh->b_assoc_map) {
644 spin_lock(&buffer_mapping->private_lock);
645 list_move_tail(&bh->b_assoc_buffers,
646 &mapping->private_list);
647 bh->b_assoc_map = mapping;
648 spin_unlock(&buffer_mapping->private_lock);
649 }
650}
651EXPORT_SYMBOL(mark_buffer_dirty_inode);
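
/*
 * Illustrative sketch (hypothetical filesystem): after modifying an
 * indirect or similar metadata block on behalf of a regular file, a
 * filesystem would typically do
 *
 *	struct buffer_head *bh = sb_bread(inode->i_sb, blocknr);
 *
 *	if (bh) {
 *		... update the block pointers in bh->b_data ...
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *	}
 *
 * so that a later sync_mapping_buffers(inode->i_mapping) - e.g. from
 * fsync() - finds the buffer on the inode's private_list and writes it
 * out before returning.
 */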
652
653/*
654 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
655 * dirty.
656 *
657 * If warn is true, then emit a warning if the page is not uptodate and has
658 * not been truncated.
659 */
660static void __set_page_dirty(struct page *page,
661 struct address_space *mapping, int warn)
662{
663 spin_lock_irq(&mapping->tree_lock);
664 if (page->mapping) { /* Race with truncate? */
665 WARN_ON_ONCE(warn && !PageUptodate(page));
666 account_page_dirtied(page, mapping);
667 radix_tree_tag_set(&mapping->page_tree,
668 page_index(page), PAGECACHE_TAG_DIRTY);
669 }
670 spin_unlock_irq(&mapping->tree_lock);
671 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
672}
673
674/*
675 * Add a page to the dirty page list.
676 *
677 * It is a sad fact of life that this function is called from several places
678 * deeply under spinlocking. It may not sleep.
679 *
680 * If the page has buffers, the uptodate buffers are set dirty, to preserve
681 * dirty-state coherency between the page and the buffers. If the page does
682 * not have buffers then when they are later attached they will all be set
683 * dirty.
684 *
685 * The buffers are dirtied before the page is dirtied. There's a small race
686 * window in which a writepage caller may see the page cleanness but not the
687 * buffer dirtiness. That's fine. If this code were to set the page dirty
688 * before the buffers, a concurrent writepage caller could clear the page dirty
689 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
690 * page on the dirty page list.
691 *
692 * We use private_lock to lock against try_to_free_buffers while using the
693 * page's buffer list. Also use this to protect against clean buffers being
694 * added to the page after it was set dirty.
695 *
696 * FIXME: may need to call ->reservepage here as well. That's rather up to the
697 * address_space though.
698 */
699int __set_page_dirty_buffers(struct page *page)
700{
701 int newly_dirty;
702 struct address_space *mapping = page_mapping(page);
703
704 if (unlikely(!mapping))
705 return !TestSetPageDirty(page);
706
707 spin_lock(&mapping->private_lock);
708 if (page_has_buffers(page)) {
709 struct buffer_head *head = page_buffers(page);
710 struct buffer_head *bh = head;
711
712 do {
713 set_buffer_dirty(bh);
714 bh = bh->b_this_page;
715 } while (bh != head);
716 }
717 newly_dirty = !TestSetPageDirty(page);
718 spin_unlock(&mapping->private_lock);
719
720 if (newly_dirty)
721 __set_page_dirty(page, mapping, 1);
722 return newly_dirty;
723}
724EXPORT_SYMBOL(__set_page_dirty_buffers);
725
726/*
727 * Write out and wait upon a list of buffers.
728 *
729 * We have conflicting pressures: we want to make sure that all
730 * initially dirty buffers get waited on, but that any subsequently
731 * dirtied buffers don't. After all, we don't want fsync to last
732 * forever if somebody is actively writing to the file.
733 *
734 * Do this in two main stages: first we copy dirty buffers to a
735 * temporary inode list, queueing the writes as we go. Then we clean
736 * up, waiting for those writes to complete.
737 *
738 * During this second stage, any subsequent updates to the file may end
739 * up refiling the buffer on the original inode's dirty list again, so
740 * there is a chance we will end up with a buffer queued for write but
741 * not yet completed on that list. So, as a final cleanup we go through
742 * the osync code to catch these locked, dirty buffers without requeuing
743 * any newly dirty buffers for write.
744 */
745static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
746{
747 struct buffer_head *bh;
748 struct list_head tmp;
749 struct address_space *mapping;
750 int err = 0, err2;
751 struct blk_plug plug;
752
753 INIT_LIST_HEAD(&tmp);
754 blk_start_plug(&plug);
755
756 spin_lock(lock);
757 while (!list_empty(list)) {
758 bh = BH_ENTRY(list->next);
759 mapping = bh->b_assoc_map;
760 __remove_assoc_queue(bh);
761		/* Avoid race with mark_buffer_dirty_inode(), which does
762		 * a lockless check; we rely on seeing the dirty bit */
763 smp_mb();
764 if (buffer_dirty(bh) || buffer_locked(bh)) {
765 list_add(&bh->b_assoc_buffers, &tmp);
766 bh->b_assoc_map = mapping;
767 if (buffer_dirty(bh)) {
768 get_bh(bh);
769 spin_unlock(lock);
770 /*
771 * Ensure any pending I/O completes so that
772 * write_dirty_buffer() actually writes the
773 * current contents - it is a noop if I/O is
774 * still in flight on potentially older
775 * contents.
776 */
777 write_dirty_buffer(bh, WRITE_SYNC);
778
779 /*
780 * Kick off IO for the previous mapping. Note
781 * that we will not run the very last mapping,
782 * wait_on_buffer() will do that for us
783 * through sync_buffer().
784 */
785 brelse(bh);
786 spin_lock(lock);
787 }
788 }
789 }
790
791 spin_unlock(lock);
792 blk_finish_plug(&plug);
793 spin_lock(lock);
794
795 while (!list_empty(&tmp)) {
796 bh = BH_ENTRY(tmp.prev);
797 get_bh(bh);
798 mapping = bh->b_assoc_map;
799 __remove_assoc_queue(bh);
800		/* Avoid race with mark_buffer_dirty_inode(), which does
801		 * a lockless check; we rely on seeing the dirty bit */
802 smp_mb();
803 if (buffer_dirty(bh)) {
804 list_add(&bh->b_assoc_buffers,
805 &mapping->private_list);
806 bh->b_assoc_map = mapping;
807 }
808 spin_unlock(lock);
809 wait_on_buffer(bh);
810 if (!buffer_uptodate(bh))
811 err = -EIO;
812 brelse(bh);
813 spin_lock(lock);
814 }
815
816 spin_unlock(lock);
817 err2 = osync_buffers_list(lock, list);
818 if (err)
819 return err;
820 else
821 return err2;
822}
823
824/*
825 * Invalidate any and all dirty buffers on a given inode. We are
826 * probably unmounting the fs, but that doesn't mean we have already
827 * done a sync(). Just drop the buffers from the inode list.
828 *
829 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
830 * assumes that all the buffers are against the blockdev. Not true
831 * for reiserfs.
832 */
833void invalidate_inode_buffers(struct inode *inode)
834{
835 if (inode_has_buffers(inode)) {
836 struct address_space *mapping = &inode->i_data;
837 struct list_head *list = &mapping->private_list;
838 struct address_space *buffer_mapping = mapping->assoc_mapping;
839
840 spin_lock(&buffer_mapping->private_lock);
841 while (!list_empty(list))
842 __remove_assoc_queue(BH_ENTRY(list->next));
843 spin_unlock(&buffer_mapping->private_lock);
844 }
845}
846EXPORT_SYMBOL(invalidate_inode_buffers);
847
848/*
849 * Remove any clean buffers from the inode's buffer list. This is called
850 * when we're trying to free the inode itself. Those buffers can pin it.
851 *
852 * Returns true if all buffers were removed.
853 */
854int remove_inode_buffers(struct inode *inode)
855{
856 int ret = 1;
857
858 if (inode_has_buffers(inode)) {
859 struct address_space *mapping = &inode->i_data;
860 struct list_head *list = &mapping->private_list;
861 struct address_space *buffer_mapping = mapping->assoc_mapping;
862
863 spin_lock(&buffer_mapping->private_lock);
864 while (!list_empty(list)) {
865 struct buffer_head *bh = BH_ENTRY(list->next);
866 if (buffer_dirty(bh)) {
867 ret = 0;
868 break;
869 }
870 __remove_assoc_queue(bh);
871 }
872 spin_unlock(&buffer_mapping->private_lock);
873 }
874 return ret;
875}
876
877/*
878 * Create the appropriate buffers when given a page for the data area and
879 * the size of each buffer.  Use the bh->b_this_page linked list to
880 * follow the buffers created. Return NULL if unable to create more
881 * buffers.
882 *
883 * The retry flag is used to differentiate async IO (paging, swapping)
884 * which may not fail from ordinary buffer allocations.
885 */
886struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
887 int retry)
888{
889 struct buffer_head *bh, *head;
890 long offset;
891
892try_again:
893 head = NULL;
894 offset = PAGE_SIZE;
895 while ((offset -= size) >= 0) {
896 bh = alloc_buffer_head(GFP_NOFS);
897 if (!bh)
898 goto no_grow;
899
900 bh->b_bdev = NULL;
901 bh->b_this_page = head;
902 bh->b_blocknr = -1;
903 head = bh;
904
905 bh->b_state = 0;
906 atomic_set(&bh->b_count, 0);
907 bh->b_size = size;
908
909 /* Link the buffer to its page */
910 set_bh_page(bh, page, offset);
911
912 init_buffer(bh, NULL, NULL);
913 }
914 return head;
915/*
916 * In case anything failed, we just free everything we got.
917 */
918no_grow:
919 if (head) {
920 do {
921 bh = head;
922 head = head->b_this_page;
923 free_buffer_head(bh);
924 } while (head);
925 }
926
927 /*
928 * Return failure for non-async IO requests. Async IO requests
929 * are not allowed to fail, so we have to wait until buffer heads
930 * become available. But we don't want tasks sleeping with
931 * partially complete buffers, so all were released above.
932 */
933 if (!retry)
934 return NULL;
935
936 /* We're _really_ low on memory. Now we just
937 * wait for old buffer heads to become free due to
938 * finishing IO. Since this is an async request and
939 * the reserve list is empty, we're sure there are
940 * async buffer heads in use.
941 */
942 free_more_memory();
943 goto try_again;
944}
945EXPORT_SYMBOL_GPL(alloc_page_buffers);
946
947static inline void
948link_dev_buffers(struct page *page, struct buffer_head *head)
949{
950 struct buffer_head *bh, *tail;
951
952 bh = head;
953 do {
954 tail = bh;
955 bh = bh->b_this_page;
956 } while (bh);
957 tail->b_this_page = head;
958 attach_page_buffers(page, head);
959}
960
961/*
962 * Initialise the state of a blockdev page's buffers.
963 */
964static void
965init_page_buffers(struct page *page, struct block_device *bdev,
966 sector_t block, int size)
967{
968 struct buffer_head *head = page_buffers(page);
969 struct buffer_head *bh = head;
970 int uptodate = PageUptodate(page);
971
972 do {
973 if (!buffer_mapped(bh)) {
974 init_buffer(bh, NULL, NULL);
975 bh->b_bdev = bdev;
976 bh->b_blocknr = block;
977 if (uptodate)
978 set_buffer_uptodate(bh);
979 set_buffer_mapped(bh);
980 }
981 block++;
982 bh = bh->b_this_page;
983 } while (bh != head);
984}
985
986/*
987 * Create the page-cache page that contains the requested block.
988 *
989 * This is used purely for blockdev mappings.
990 */
991static struct page *
992grow_dev_page(struct block_device *bdev, sector_t block,
993 pgoff_t index, int size)
994{
995 struct inode *inode = bdev->bd_inode;
996 struct page *page;
997 struct buffer_head *bh;
998
999 page = find_or_create_page(inode->i_mapping, index,
1000 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1001 if (!page)
1002 return NULL;
1003
1004 BUG_ON(!PageLocked(page));
1005
1006 if (page_has_buffers(page)) {
1007 bh = page_buffers(page);
1008 if (bh->b_size == size) {
1009 init_page_buffers(page, bdev, block, size);
1010 return page;
1011 }
1012 if (!try_to_free_buffers(page))
1013 goto failed;
1014 }
1015
1016 /*
1017 * Allocate some buffers for this page
1018 */
1019 bh = alloc_page_buffers(page, size, 0);
1020 if (!bh)
1021 goto failed;
1022
1023 /*
1024 * Link the page to the buffers and initialise them. Take the
1025 * lock to be atomic wrt __find_get_block(), which does not
1026 * run under the page lock.
1027 */
1028 spin_lock(&inode->i_mapping->private_lock);
1029 link_dev_buffers(page, bh);
1030 init_page_buffers(page, bdev, block, size);
1031 spin_unlock(&inode->i_mapping->private_lock);
1032 return page;
1033
1034failed:
1035 BUG();
1036 unlock_page(page);
1037 page_cache_release(page);
1038 return NULL;
1039}
1040
1041/*
1042 * Create buffers for the specified block device block's page. If
1043 * that page was dirty, the buffers are set dirty also.
1044 */
1045static int
1046grow_buffers(struct block_device *bdev, sector_t block, int size)
1047{
1048 struct page *page;
1049 pgoff_t index;
1050 int sizebits;
1051
1052 sizebits = -1;
1053 do {
1054 sizebits++;
1055 } while ((size << sizebits) < PAGE_SIZE);
1056
1057 index = block >> sizebits;
1058
1059 /*
1060 * Check for a block which wants to lie outside our maximum possible
1061 * pagecache index. (this comparison is done using sector_t types).
1062 */
1063 if (unlikely(index != block >> sizebits)) {
1064 char b[BDEVNAME_SIZE];
1065
1066 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1067 "device %s\n",
1068 __func__, (unsigned long long)block,
1069 bdevname(bdev, b));
1070 return -EIO;
1071 }
1072 block = index << sizebits;
1073	/* Create a page with the proper size buffers. */
1074 page = grow_dev_page(bdev, block, index, size);
1075 if (!page)
1076 return 0;
1077 unlock_page(page);
1078 page_cache_release(page);
1079 return 1;
1080}
1081
1082static struct buffer_head *
1083__getblk_slow(struct block_device *bdev, sector_t block, int size)
1084{
1085	/* Size must be a multiple of the device's logical block size */
1086 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087 (size < 512 || size > PAGE_SIZE))) {
1088 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089 size);
1090 printk(KERN_ERR "logical block size: %d\n",
1091 bdev_logical_block_size(bdev));
1092
1093 dump_stack();
1094 return NULL;
1095 }
1096
1097 for (;;) {
1098 struct buffer_head * bh;
1099 int ret;
1100
1101 bh = __find_get_block(bdev, block, size);
1102 if (bh)
1103 return bh;
1104
1105 ret = grow_buffers(bdev, block, size);
1106 if (ret < 0)
1107 return NULL;
1108 if (ret == 0)
1109 free_more_memory();
1110 }
1111}
1112
1113/*
1114 * The relationship between dirty buffers and dirty pages:
1115 *
1116 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117 * the page is tagged dirty in its radix tree.
1118 *
1119 * At all times, the dirtiness of the buffers represents the dirtiness of
1120 * subsections of the page. If the page has buffers, the page dirty bit is
1121 * merely a hint about the true dirty state.
1122 *
1123 * When a page is set dirty in its entirety, all its buffers are marked dirty
1124 * (if the page has buffers).
1125 *
1126 * When a buffer is marked dirty, its page is dirtied, but the page's other
1127 * buffers are not.
1128 *
1129 * Also. When blockdev buffers are explicitly read with bread(), they
1130 * individually become uptodate. But their backing page remains not
1131 * uptodate - even if all of its buffers are uptodate. A subsequent
1132 * block_read_full_page() against that page will discover all the uptodate
1133 * buffers, will set the page uptodate and will perform no I/O.
1134 */
1135
1136/**
1137 * mark_buffer_dirty - mark a buffer_head as needing writeout
1138 * @bh: the buffer_head to mark dirty
1139 *
1140 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141 * backing page dirty, then tag the page as dirty in its address_space's radix
1142 * tree and then attach the address_space's inode to its superblock's dirty
1143 * inode list.
1144 *
1145 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1146 * mapping->tree_lock and mapping->host->i_lock.
1147 */
1148void mark_buffer_dirty(struct buffer_head *bh)
1149{
1150 WARN_ON_ONCE(!buffer_uptodate(bh));
1151
1152 /*
1153 * Very *carefully* optimize the it-is-already-dirty case.
1154 *
1155 * Don't let the final "is it dirty" escape to before we
1156 * perhaps modified the buffer.
1157 */
1158 if (buffer_dirty(bh)) {
1159 smp_mb();
1160 if (buffer_dirty(bh))
1161 return;
1162 }
1163
1164 if (!test_set_buffer_dirty(bh)) {
1165 struct page *page = bh->b_page;
1166 if (!TestSetPageDirty(page)) {
1167 struct address_space *mapping = page_mapping(page);
1168 if (mapping)
1169 __set_page_dirty(page, mapping, 0);
1170 }
1171 }
1172}
1173EXPORT_SYMBOL(mark_buffer_dirty);
1174
1175/*
1176 * Decrement a buffer_head's reference count. If all buffers against a page
1177 * have zero reference count, are clean and unlocked, and if the page is clean
1178 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1179 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1180 * a page but it ends up not being freed, and buffers may later be reattached).
1181 */
1182void __brelse(struct buffer_head * buf)
1183{
1184 if (atomic_read(&buf->b_count)) {
1185 put_bh(buf);
1186 return;
1187 }
1188 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1189}
1190EXPORT_SYMBOL(__brelse);
1191
1192/*
1193 * bforget() is like brelse(), except it discards any
1194 * potentially dirty data.
1195 */
1196void __bforget(struct buffer_head *bh)
1197{
1198 clear_buffer_dirty(bh);
1199 if (bh->b_assoc_map) {
1200 struct address_space *buffer_mapping = bh->b_page->mapping;
1201
1202 spin_lock(&buffer_mapping->private_lock);
1203 list_del_init(&bh->b_assoc_buffers);
1204 bh->b_assoc_map = NULL;
1205 spin_unlock(&buffer_mapping->private_lock);
1206 }
1207 __brelse(bh);
1208}
1209EXPORT_SYMBOL(__bforget);
1210
1211static struct buffer_head *__bread_slow(struct buffer_head *bh)
1212{
1213 lock_buffer(bh);
1214 if (buffer_uptodate(bh)) {
1215 unlock_buffer(bh);
1216 return bh;
1217 } else {
1218 get_bh(bh);
1219 bh->b_end_io = end_buffer_read_sync;
1220 submit_bh(READ, bh);
1221 wait_on_buffer(bh);
1222 if (buffer_uptodate(bh))
1223 return bh;
1224 }
1225 brelse(bh);
1226 return NULL;
1227}
1228
1229/*
1230 * Per-cpu buffer LRU implementation, used to reduce the cost of
1231 * __find_get_block(). The bhs[] array is sorted - newest buffer is at bhs[0].
1232 * Buffers have their refcount elevated by one when they're in an LRU. A
1233 * buffer can only appear once in a particular CPU's LRU. A single buffer
1234 * can be present in multiple CPUs' LRUs at the same time.
1235 *
1236 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1237 * sb_find_get_block().
1238 *
1239 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1240 * a local interrupt disable for that.
1241 */
1242
1243#define BH_LRU_SIZE 8
1244
1245struct bh_lru {
1246 struct buffer_head *bhs[BH_LRU_SIZE];
1247};
1248
1249static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1250
1251#ifdef CONFIG_SMP
1252#define bh_lru_lock() local_irq_disable()
1253#define bh_lru_unlock() local_irq_enable()
1254#else
1255#define bh_lru_lock() preempt_disable()
1256#define bh_lru_unlock() preempt_enable()
1257#endif
1258
1259static inline void check_irqs_on(void)
1260{
1261#ifdef irqs_disabled
1262 BUG_ON(irqs_disabled());
1263#endif
1264}
1265
1266/*
1267 * The LRU management algorithm is dopey-but-simple. Sorry.
1268 */
1269static void bh_lru_install(struct buffer_head *bh)
1270{
1271 struct buffer_head *evictee = NULL;
1272
1273 check_irqs_on();
1274 bh_lru_lock();
1275 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1276 struct buffer_head *bhs[BH_LRU_SIZE];
1277 int in;
1278 int out = 0;
1279
1280 get_bh(bh);
1281 bhs[out++] = bh;
1282 for (in = 0; in < BH_LRU_SIZE; in++) {
1283 struct buffer_head *bh2 =
1284 __this_cpu_read(bh_lrus.bhs[in]);
1285
1286 if (bh2 == bh) {
1287 __brelse(bh2);
1288 } else {
1289 if (out >= BH_LRU_SIZE) {
1290 BUG_ON(evictee != NULL);
1291 evictee = bh2;
1292 } else {
1293 bhs[out++] = bh2;
1294 }
1295 }
1296 }
1297 while (out < BH_LRU_SIZE)
1298 bhs[out++] = NULL;
1299 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1300 }
1301 bh_lru_unlock();
1302
1303 if (evictee)
1304 __brelse(evictee);
1305}
1306
1307/*
1308 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1309 */
1310static struct buffer_head *
1311lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1312{
1313 struct buffer_head *ret = NULL;
1314 unsigned int i;
1315
1316 check_irqs_on();
1317 bh_lru_lock();
1318 for (i = 0; i < BH_LRU_SIZE; i++) {
1319 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1320
1321 if (bh && bh->b_bdev == bdev &&
1322 bh->b_blocknr == block && bh->b_size == size) {
1323 if (i) {
1324 while (i) {
1325 __this_cpu_write(bh_lrus.bhs[i],
1326 __this_cpu_read(bh_lrus.bhs[i - 1]));
1327 i--;
1328 }
1329 __this_cpu_write(bh_lrus.bhs[0], bh);
1330 }
1331 get_bh(bh);
1332 ret = bh;
1333 break;
1334 }
1335 }
1336 bh_lru_unlock();
1337 return ret;
1338}
1339
1340/*
1341 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1342 * it in the LRU and mark it as accessed. If it is not present then return
1343 * NULL.
1344 */
1345struct buffer_head *
1346__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1347{
1348 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1349
1350 if (bh == NULL) {
1351 bh = __find_get_block_slow(bdev, block);
1352 if (bh)
1353 bh_lru_install(bh);
1354 }
1355 if (bh)
1356 touch_buffer(bh);
1357 return bh;
1358}
1359EXPORT_SYMBOL(__find_get_block);
1360
1361/*
1362 * __getblk will locate (and, if necessary, create) the buffer_head
1363 * which corresponds to the passed block_device, block and size. The
1364 * returned buffer has its reference count incremented.
1365 *
1366 * __getblk() cannot fail - it just keeps trying. If you pass it an
1367 * illegal block number, __getblk() will happily return a buffer_head
1368 * which represents the non-existent block. Very weird.
1369 *
1370 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1371 * attempt is failing. FIXME, perhaps?
1372 */
1373struct buffer_head *
1374__getblk(struct block_device *bdev, sector_t block, unsigned size)
1375{
1376 struct buffer_head *bh = __find_get_block(bdev, block, size);
1377
1378 might_sleep();
1379 if (bh == NULL)
1380 bh = __getblk_slow(bdev, block, size);
1381 return bh;
1382}
1383EXPORT_SYMBOL(__getblk);
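
/*
 * Illustrative sketch (hypothetical caller): initialising a freshly
 * allocated metadata block, where nothing needs to be read from disk
 * first:
 *
 *	struct buffer_head *bh = __getblk(bdev, blocknr, blocksize);
 *
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 *
 * Filesystems normally go through the sb_getblk() wrapper, which passes
 * sb->s_bdev and sb->s_blocksize for them.
 */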
1384
1385/*
1386 * Do async read-ahead on a buffer..
1387 */
1388void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1389{
1390 struct buffer_head *bh = __getblk(bdev, block, size);
1391 if (likely(bh)) {
1392 ll_rw_block(READA, 1, &bh);
1393 brelse(bh);
1394 }
1395}
1396EXPORT_SYMBOL(__breadahead);
1397
1398/**
1399 * __bread() - reads a specified block and returns the bh
1400 * @bdev: the block_device to read from
1401 * @block: number of block
1402 * @size: size (in bytes) to read
1403 *
1404 * Reads a specified block, and returns buffer head that contains it.
1405 * It returns NULL if the block was unreadable.
1406 */
1407struct buffer_head *
1408__bread(struct block_device *bdev, sector_t block, unsigned size)
1409{
1410 struct buffer_head *bh = __getblk(bdev, block, size);
1411
1412 if (likely(bh) && !buffer_uptodate(bh))
1413 bh = __bread_slow(bh);
1414 return bh;
1415}
1416EXPORT_SYMBOL(__bread);
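
/*
 * Illustrative sketch (hypothetical caller): reading an existing
 * metadata block synchronously and inspecting it:
 *
 *	struct buffer_head *bh = __bread(bdev, blocknr, blocksize);
 *
 *	if (!bh)
 *		return -EIO;
 *	... examine bh->b_data ...
 *	brelse(bh);
 *
 * As with __getblk(), most filesystem code uses the sb_bread() wrapper
 * rather than calling this directly.
 */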
1417
1418/*
1419 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1420 * This doesn't race because it runs on each cpu either in irq context
1421 * or with preemption disabled.
1422 */
1423static void invalidate_bh_lru(void *arg)
1424{
1425 struct bh_lru *b = &get_cpu_var(bh_lrus);
1426 int i;
1427
1428 for (i = 0; i < BH_LRU_SIZE; i++) {
1429 brelse(b->bhs[i]);
1430 b->bhs[i] = NULL;
1431 }
1432 put_cpu_var(bh_lrus);
1433}
1434
1435void invalidate_bh_lrus(void)
1436{
1437 on_each_cpu(invalidate_bh_lru, NULL, 1);
1438}
1439EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
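
/*
 * Usage note: invalidate_bdev() above calls this so that no per-cpu LRU
 * keeps a stale reference pinning buffers of the just-invalidated device
 * contents; changing a device's block size (set_blocksize()) needs the
 * same treatment before the old-sized buffers can go away.
 */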
1440
1441void set_bh_page(struct buffer_head *bh,
1442 struct page *page, unsigned long offset)
1443{
1444 bh->b_page = page;
1445 BUG_ON(offset >= PAGE_SIZE);
1446 if (PageHighMem(page))
1447 /*
1448 * This catches illegal uses and preserves the offset:
1449 */
1450 bh->b_data = (char *)(0 + offset);
1451 else
1452 bh->b_data = page_address(page) + offset;
1453}
1454EXPORT_SYMBOL(set_bh_page);
1455
1456/*
1457 * Called when truncating a buffer on a page completely.
1458 */
1459static void discard_buffer(struct buffer_head * bh)
1460{
1461 lock_buffer(bh);
1462 clear_buffer_dirty(bh);
1463 bh->b_bdev = NULL;
1464 clear_buffer_mapped(bh);
1465 clear_buffer_req(bh);
1466 clear_buffer_new(bh);
1467 clear_buffer_delay(bh);
1468 clear_buffer_unwritten(bh);
1469 unlock_buffer(bh);
1470}
1471
1472/**
1473 * block_invalidatepage - invalidate part or all of a buffer-backed page
1474 *
1475 * @page: the page which is affected
1476 * @offset: the index of the truncation point
1477 *
1478 * block_invalidatepage() is called when all or part of the page has become
1479 * invalidated by a truncate operation.
1480 *
1481 * block_invalidatepage() does not have to release all buffers, but it must
1482 * ensure that no dirty buffer is left outside @offset and that no I/O
1483 * is underway against any of the blocks which are outside the truncation
1484 * point, because the caller is about to free (and possibly reuse) those
1485 * blocks on-disk.
1486 */
1487void block_invalidatepage(struct page *page, unsigned long offset)
1488{
1489 struct buffer_head *head, *bh, *next;
1490 unsigned int curr_off = 0;
1491
1492 BUG_ON(!PageLocked(page));
1493 if (!page_has_buffers(page))
1494 goto out;
1495
1496 head = page_buffers(page);
1497 bh = head;
1498 do {
1499 unsigned int next_off = curr_off + bh->b_size;
1500 next = bh->b_this_page;
1501
1502 /*
1503 * is this block fully invalidated?
1504 */
1505 if (offset <= curr_off)
1506 discard_buffer(bh);
1507 curr_off = next_off;
1508 bh = next;
1509 } while (bh != head);
1510
1511 /*
1512 * We release buffers only if the entire page is being invalidated.
1513 * The get_block cached value has been unconditionally invalidated,
1514 * so real IO is not possible anymore.
1515 */
1516 if (offset == 0)
1517 try_to_release_page(page, 0);
1518out:
1519 return;
1520}
1521EXPORT_SYMBOL(block_invalidatepage);
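
/*
 * Usage note: truncation reaches this through do_invalidatepage(), which
 * (roughly, in mm/truncate.c) falls back to block_invalidatepage() when
 * the filesystem does not supply its own method:
 *
 *	void (*invalidatepage)(struct page *, unsigned long);
 *
 *	invalidatepage = page->mapping->a_ops->invalidatepage;
 *	if (!invalidatepage)
 *		invalidatepage = block_invalidatepage;
 *	(*invalidatepage)(page, offset);
 *
 * so plain buffer-backed filesystems rarely reference this symbol
 * directly; journalling filesystems override ->invalidatepage to drop
 * their own per-buffer state first.
 */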
1522
1523/*
1524 * We attach and possibly dirty the buffers atomically wrt
1525 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1526 * is already excluded via the page lock.
1527 */
1528void create_empty_buffers(struct page *page,
1529 unsigned long blocksize, unsigned long b_state)
1530{
1531 struct buffer_head *bh, *head, *tail;
1532
1533 head = alloc_page_buffers(page, blocksize, 1);
1534 bh = head;
1535 do {
1536 bh->b_state |= b_state;
1537 tail = bh;
1538 bh = bh->b_this_page;
1539 } while (bh);
1540 tail->b_this_page = head;
1541
1542 spin_lock(&page->mapping->private_lock);
1543 if (PageUptodate(page) || PageDirty(page)) {
1544 bh = head;
1545 do {
1546 if (PageDirty(page))
1547 set_buffer_dirty(bh);
1548 if (PageUptodate(page))
1549 set_buffer_uptodate(bh);
1550 bh = bh->b_this_page;
1551 } while (bh != head);
1552 }
1553 attach_page_buffers(page, head);
1554 spin_unlock(&page->mapping->private_lock);
1555}
1556EXPORT_SYMBOL(create_empty_buffers);
1557
1558/*
1559 * We are taking a block for data and we don't want any output from any
1560 * buffer-cache aliases from the moment this function returns until
1561 * something explicitly marks the buffer dirty (hopefully that will not
1562 * happen until we free that block ;-)
1563 * We don't even need to mark it not-uptodate - nobody can expect
1564 * anything from a newly allocated buffer anyway. We used to use
1565 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1566 * don't want to mark the alias unmapped, for example - it would confuse
1567 * anyone who might pick it with bread() afterwards...
1568 *
1569 * Also.. Note that bforget() doesn't lock the buffer. So there can
1570 * be writeout I/O going on against recently-freed buffers. We don't
1571 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1572 * only if we really need to. That happens here.
1573 */
1574void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1575{
1576 struct buffer_head *old_bh;
1577
1578 might_sleep();
1579
1580 old_bh = __find_get_block_slow(bdev, block);
1581 if (old_bh) {
1582 clear_buffer_dirty(old_bh);
1583 wait_on_buffer(old_bh);
1584 clear_buffer_req(old_bh);
1585 __brelse(old_bh);
1586 }
1587}
1588EXPORT_SYMBOL(unmap_underlying_metadata);
1589
1590/*
1591 * NOTE! All mapped/uptodate combinations are valid:
1592 *
1593 * Mapped Uptodate Meaning
1594 *
1595 * No No "unknown" - must do get_block()
1596 * No Yes "hole" - zero-filled
1597 * Yes No "allocated" - allocated on disk, not read in
1598 * Yes Yes "valid" - allocated and up-to-date in memory.
1599 *
1600 * "Dirty" is valid only with the last case (mapped+uptodate).
1601 */
1602
1603/*
1604 * While block_write_full_page is writing back the dirty buffers under
1605 * the page lock, whoever dirtied the buffers may decide to clean them
1606 * again at any time. We handle that by only looking at the buffer
1607 * state inside lock_buffer().
1608 *
1609 * If block_write_full_page() is called for regular writeback
1610 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1611 * locked buffer. This only can happen if someone has written the buffer
1612 * directly, with submit_bh(). At the address_space level PageWriteback
1613 * prevents this contention from occurring.
1614 *
1615 * If block_write_full_page() is called with wbc->sync_mode ==
1616 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1617 * causes the writes to be flagged as synchronous writes.
1618 */
1619static int __block_write_full_page(struct inode *inode, struct page *page,
1620 get_block_t *get_block, struct writeback_control *wbc,
1621 bh_end_io_t *handler)
1622{
1623 int err;
1624 sector_t block;
1625 sector_t last_block;
1626 struct buffer_head *bh, *head;
1627 const unsigned blocksize = 1 << inode->i_blkbits;
1628 int nr_underway = 0;
1629 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1630 WRITE_SYNC : WRITE);
1631
1632 BUG_ON(!PageLocked(page));
1633
1634 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1635
1636 if (!page_has_buffers(page)) {
1637 create_empty_buffers(page, blocksize,
1638 (1 << BH_Dirty)|(1 << BH_Uptodate));
1639 }
1640
1641 /*
1642 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1643 * here, and the (potentially unmapped) buffers may become dirty at
1644 * any time. If a buffer becomes dirty here after we've inspected it
1645 * then we just miss that fact, and the page stays dirty.
1646 *
1647 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1648 * handle that here by just cleaning them.
1649 */
1650
1651 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1652 head = page_buffers(page);
1653 bh = head;
1654
1655 /*
1656 * Get all the dirty buffers mapped to disk addresses and
1657 * handle any aliases from the underlying blockdev's mapping.
1658 */
1659 do {
1660 if (block > last_block) {
1661 /*
1662 * mapped buffers outside i_size will occur, because
1663 * this page can be outside i_size when there is a
1664 * truncate in progress.
1665 */
1666 /*
1667 * The buffer was zeroed by block_write_full_page()
1668 */
1669 clear_buffer_dirty(bh);
1670 set_buffer_uptodate(bh);
1671 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1672 buffer_dirty(bh)) {
1673 WARN_ON(bh->b_size != blocksize);
1674 err = get_block(inode, block, bh, 1);
1675 if (err)
1676 goto recover;
1677 clear_buffer_delay(bh);
1678 if (buffer_new(bh)) {
1679 /* blockdev mappings never come here */
1680 clear_buffer_new(bh);
1681 unmap_underlying_metadata(bh->b_bdev,
1682 bh->b_blocknr);
1683 }
1684 }
1685 bh = bh->b_this_page;
1686 block++;
1687 } while (bh != head);
1688
1689 do {
1690 if (!buffer_mapped(bh))
1691 continue;
1692 /*
1693 * If it's a fully non-blocking write attempt and we cannot
1694 * lock the buffer then redirty the page. Note that this can
1695 * potentially cause a busy-wait loop from writeback threads
1696 * and kswapd activity, but those code paths have their own
1697 * higher-level throttling.
1698 */
1699 if (wbc->sync_mode != WB_SYNC_NONE) {
1700 lock_buffer(bh);
1701 } else if (!trylock_buffer(bh)) {
1702 redirty_page_for_writepage(wbc, page);
1703 continue;
1704 }
1705 if (test_clear_buffer_dirty(bh)) {
1706 mark_buffer_async_write_endio(bh, handler);
1707 } else {
1708 unlock_buffer(bh);
1709 }
1710 } while ((bh = bh->b_this_page) != head);
1711
1712 /*
1713 * The page and its buffers are protected by PageWriteback(), so we can
1714 * drop the bh refcounts early.
1715 */
1716 BUG_ON(PageWriteback(page));
1717 set_page_writeback(page);
1718
1719 do {
1720 struct buffer_head *next = bh->b_this_page;
1721 if (buffer_async_write(bh)) {
1722 submit_bh(write_op, bh);
1723 nr_underway++;
1724 }
1725 bh = next;
1726 } while (bh != head);
1727 unlock_page(page);
1728
1729 err = 0;
1730done:
1731 if (nr_underway == 0) {
1732 /*
1733 * The page was marked dirty, but the buffers were
1734 * clean. Someone wrote them back by hand with
1735 * ll_rw_block/submit_bh. A rare case.
1736 */
1737 end_page_writeback(page);
1738
1739 /*
1740 * The page and buffer_heads can be released at any time from
1741 * here on.
1742 */
1743 }
1744 return err;
1745
1746recover:
1747 /*
1748 * ENOSPC, or some other error. We may already have added some
1749 * blocks to the file, so we need to write these out to avoid
1750 * exposing stale data.
1751 * The page is currently locked and not marked for writeback
1752 */
1753 bh = head;
1754 /* Recovery: lock and submit the mapped buffers */
1755 do {
1756 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1757 !buffer_delay(bh)) {
1758 lock_buffer(bh);
1759 mark_buffer_async_write_endio(bh, handler);
1760 } else {
1761 /*
1762 * The buffer may have been set dirty during
1763 * attachment to a dirty page.
1764 */
1765 clear_buffer_dirty(bh);
1766 }
1767 } while ((bh = bh->b_this_page) != head);
1768 SetPageError(page);
1769 BUG_ON(PageWriteback(page));
1770 mapping_set_error(page->mapping, err);
1771 set_page_writeback(page);
1772 do {
1773 struct buffer_head *next = bh->b_this_page;
1774 if (buffer_async_write(bh)) {
1775 clear_buffer_dirty(bh);
1776 submit_bh(write_op, bh);
1777 nr_underway++;
1778 }
1779 bh = next;
1780 } while (bh != head);
1781 unlock_page(page);
1782 goto done;
1783}
1784
1785/*
1786 * If a page has any new buffers, zero them out here, and mark them uptodate
1787 * and dirty so they'll be written out (in order to prevent uninitialised
1788 * block data from leaking). And clear the new bit.
1789 */
1790void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1791{
1792 unsigned int block_start, block_end;
1793 struct buffer_head *head, *bh;
1794
1795 BUG_ON(!PageLocked(page));
1796 if (!page_has_buffers(page))
1797 return;
1798
1799 bh = head = page_buffers(page);
1800 block_start = 0;
1801 do {
1802 block_end = block_start + bh->b_size;
1803
1804 if (buffer_new(bh)) {
1805 if (block_end > from && block_start < to) {
1806 if (!PageUptodate(page)) {
1807 unsigned start, size;
1808
1809 start = max(from, block_start);
1810 size = min(to, block_end) - start;
1811
1812 zero_user(page, start, size);
1813 set_buffer_uptodate(bh);
1814 }
1815
1816 clear_buffer_new(bh);
1817 mark_buffer_dirty(bh);
1818 }
1819 }
1820
1821 block_start = block_end;
1822 bh = bh->b_this_page;
1823 } while (bh != head);
1824}
1825EXPORT_SYMBOL(page_zero_new_buffers);
1826
1827int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1828 get_block_t *get_block)
1829{
1830 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1831 unsigned to = from + len;
1832 struct inode *inode = page->mapping->host;
1833 unsigned block_start, block_end;
1834 sector_t block;
1835 int err = 0;
1836 unsigned blocksize, bbits;
1837 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1838
1839 BUG_ON(!PageLocked(page));
1840 BUG_ON(from > PAGE_CACHE_SIZE);
1841 BUG_ON(to > PAGE_CACHE_SIZE);
1842 BUG_ON(from > to);
1843
1844 blocksize = 1 << inode->i_blkbits;
1845 if (!page_has_buffers(page))
1846 create_empty_buffers(page, blocksize, 0);
1847 head = page_buffers(page);
1848
1849 bbits = inode->i_blkbits;
1850 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1851
1852 for(bh = head, block_start = 0; bh != head || !block_start;
1853 block++, block_start=block_end, bh = bh->b_this_page) {
1854 block_end = block_start + blocksize;
1855 if (block_end <= from || block_start >= to) {
1856 if (PageUptodate(page)) {
1857 if (!buffer_uptodate(bh))
1858 set_buffer_uptodate(bh);
1859 }
1860 continue;
1861 }
1862 if (buffer_new(bh))
1863 clear_buffer_new(bh);
1864 if (!buffer_mapped(bh)) {
1865 WARN_ON(bh->b_size != blocksize);
1866 err = get_block(inode, block, bh, 1);
1867 if (err)
1868 break;
1869 if (buffer_new(bh)) {
1870 unmap_underlying_metadata(bh->b_bdev,
1871 bh->b_blocknr);
1872 if (PageUptodate(page)) {
1873 clear_buffer_new(bh);
1874 set_buffer_uptodate(bh);
1875 mark_buffer_dirty(bh);
1876 continue;
1877 }
1878 if (block_end > to || block_start < from)
1879 zero_user_segments(page,
1880 to, block_end,
1881 block_start, from);
1882 continue;
1883 }
1884 }
1885 if (PageUptodate(page)) {
1886 if (!buffer_uptodate(bh))
1887 set_buffer_uptodate(bh);
1888 continue;
1889 }
1890 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1891 !buffer_unwritten(bh) &&
1892 (block_start < from || block_end > to)) {
1893 ll_rw_block(READ, 1, &bh);
1894 *wait_bh++=bh;
1895 }
1896 }
1897 /*
1898 * If we issued read requests - let them complete.
1899 */
1900 while(wait_bh > wait) {
1901 wait_on_buffer(*--wait_bh);
1902 if (!buffer_uptodate(*wait_bh))
1903 err = -EIO;
1904 }
1905 if (unlikely(err))
1906 page_zero_new_buffers(page, from, to);
1907 return err;
1908}
1909EXPORT_SYMBOL(__block_write_begin);
1910
1911static int __block_commit_write(struct inode *inode, struct page *page,
1912 unsigned from, unsigned to)
1913{
1914 unsigned block_start, block_end;
1915 int partial = 0;
1916 unsigned blocksize;
1917 struct buffer_head *bh, *head;
1918
1919 blocksize = 1 << inode->i_blkbits;
1920
1921 for(bh = head = page_buffers(page), block_start = 0;
1922 bh != head || !block_start;
1923 block_start=block_end, bh = bh->b_this_page) {
1924 block_end = block_start + blocksize;
1925 if (block_end <= from || block_start >= to) {
1926 if (!buffer_uptodate(bh))
1927 partial = 1;
1928 } else {
1929 set_buffer_uptodate(bh);
1930 mark_buffer_dirty(bh);
1931 }
1932 clear_buffer_new(bh);
1933 }
1934
1935 /*
1936 * If this is a partial write which happened to make all buffers
1937 * uptodate then we can optimize away a bogus readpage() for
1938 * the next read(). Here we 'discover' whether the page went
1939 * uptodate as a result of this (potentially partial) write.
1940 */
1941 if (!partial)
1942 SetPageUptodate(page);
1943 return 0;
1944}
1945
1946/*
1947 * block_write_begin takes care of the basic task of block allocation and
1948 * bringing partial write blocks uptodate first.
1949 *
1950 * The filesystem needs to handle block truncation upon failure.
1951 */
1952int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1953 unsigned flags, struct page **pagep, get_block_t *get_block)
1954{
1955 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1956 struct page *page;
1957 int status;
1958
1959 page = grab_cache_page_write_begin(mapping, index, flags);
1960 if (!page)
1961 return -ENOMEM;
1962
1963 status = __block_write_begin(page, pos, len, get_block);
1964 if (unlikely(status)) {
1965 unlock_page(page);
1966 page_cache_release(page);
1967 page = NULL;
1968 }
1969
1970 *pagep = page;
1971 return status;
1972}
1973EXPORT_SYMBOL(block_write_begin);
1974
1975int block_write_end(struct file *file, struct address_space *mapping,
1976 loff_t pos, unsigned len, unsigned copied,
1977 struct page *page, void *fsdata)
1978{
1979 struct inode *inode = mapping->host;
1980 unsigned start;
1981
1982 start = pos & (PAGE_CACHE_SIZE - 1);
1983
1984 if (unlikely(copied < len)) {
1985 /*
1986 * The buffers that were written will now be uptodate, so we
1987 * don't have to worry about a readpage reading them and
1988 * overwriting a partial write. However if we have encountered
1989 * a short write and only partially written into a buffer, it
1990 * will not be marked uptodate, so a readpage might come in and
1991 * destroy our partial write.
1992 *
1993 * Do the simplest thing, and just treat any short write to a
1994 * non uptodate page as a zero-length write, and force the
1995 * caller to redo the whole thing.
1996 */
1997 if (!PageUptodate(page))
1998 copied = 0;
1999
2000 page_zero_new_buffers(page, start+copied, start+len);
2001 }
2002 flush_dcache_page(page);
2003
2004 /* This could be a short (even 0-length) commit */
2005 __block_commit_write(inode, page, start, start+copied);
2006
2007 return copied;
2008}
2009EXPORT_SYMBOL(block_write_end);
2010
2011int generic_write_end(struct file *file, struct address_space *mapping,
2012 loff_t pos, unsigned len, unsigned copied,
2013 struct page *page, void *fsdata)
2014{
2015 struct inode *inode = mapping->host;
2016 int i_size_changed = 0;
2017
2018 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2019
2020 /*
2021 * No need to use i_size_read() here, the i_size
2022 * cannot change under us because we hold i_mutex.
2023 *
2024 * But it's important to update i_size while still holding page lock:
2025 * page writeout could otherwise come in and zero beyond i_size.
2026 */
2027 if (pos+copied > inode->i_size) {
2028 i_size_write(inode, pos+copied);
2029 i_size_changed = 1;
2030 }
2031
2032 unlock_page(page);
2033 page_cache_release(page);
2034
2035 /*
2036 * Don't mark the inode dirty under page lock. First, it unnecessarily
2037 * makes the holding time of page lock longer. Second, it forces lock
2038 * ordering of page lock and transaction start for journaling
2039 * filesystems.
2040 */
2041 if (i_size_changed)
2042 mark_inode_dirty(inode);
2043
2044 return copied;
2045}
2046EXPORT_SYMBOL(generic_write_end);
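
/*
 * Illustrative sketch (hypothetical filesystem): the usual way these
 * helpers are consumed is as thin ->write_begin/->write_end methods
 * wrapped around the filesystem's own get_block callback:
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return block_write_begin(mapping, pos, len, flags, pagep,
 *					 myfs_get_block);
 *	}
 *
 * with generic_write_end() used directly as the ->write_end method.
 * As noted above block_write_begin(), the filesystem is still
 * responsible for trimming any blocks instantiated beyond i_size when
 * the write fails part way through.
 */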
2047
2048/*
2049 * block_is_partially_uptodate checks whether buffers within a page are
2050 * uptodate or not.
2051 *
2052 * Returns true if all buffers which correspond to a file portion
2053 * we want to read are uptodate.
2054 */
2055int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2056 unsigned long from)
2057{
2058 struct inode *inode = page->mapping->host;
2059 unsigned block_start, block_end, blocksize;
2060 unsigned to;
2061 struct buffer_head *bh, *head;
2062 int ret = 1;
2063
2064 if (!page_has_buffers(page))
2065 return 0;
2066
2067 blocksize = 1 << inode->i_blkbits;
2068 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2069 to = from + to;
2070 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2071 return 0;
2072
2073 head = page_buffers(page);
2074 bh = head;
2075 block_start = 0;
2076 do {
2077 block_end = block_start + blocksize;
2078 if (block_end > from && block_start < to) {
2079 if (!buffer_uptodate(bh)) {
2080 ret = 0;
2081 break;
2082 }
2083 if (block_end >= to)
2084 break;
2085 }
2086 block_start = block_end;
2087 bh = bh->b_this_page;
2088 } while (bh != head);
2089
2090 return ret;
2091}
2092EXPORT_SYMBOL(block_is_partially_uptodate);
2093
2094/*
2095 * Generic "read page" function for block devices that have the normal
2096 * get_block functionality. This is most of the block device filesystems.
2097 * Reads the page asynchronously --- the unlock_buffer() and
2098 * set/clear_buffer_uptodate() functions propagate buffer state into the
2099 * page struct once IO has completed.
2100 */
2101int block_read_full_page(struct page *page, get_block_t *get_block)
2102{
2103 struct inode *inode = page->mapping->host;
2104 sector_t iblock, lblock;
2105 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2106 unsigned int blocksize;
2107 int nr, i;
2108 int fully_mapped = 1;
2109
2110 BUG_ON(!PageLocked(page));
2111 blocksize = 1 << inode->i_blkbits;
2112 if (!page_has_buffers(page))
2113 create_empty_buffers(page, blocksize, 0);
2114 head = page_buffers(page);
2115
2116 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2117 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2118 bh = head;
2119 nr = 0;
2120 i = 0;
2121
2122 do {
2123 if (buffer_uptodate(bh))
2124 continue;
2125
2126 if (!buffer_mapped(bh)) {
2127 int err = 0;
2128
2129 fully_mapped = 0;
2130 if (iblock < lblock) {
2131 WARN_ON(bh->b_size != blocksize);
2132 err = get_block(inode, iblock, bh, 0);
2133 if (err)
2134 SetPageError(page);
2135 }
2136 if (!buffer_mapped(bh)) {
2137 zero_user(page, i * blocksize, blocksize);
2138 if (!err)
2139 set_buffer_uptodate(bh);
2140 continue;
2141 }
2142 /*
2143 * get_block() might have updated the buffer
2144 * synchronously
2145 */
2146 if (buffer_uptodate(bh))
2147 continue;
2148 }
2149 arr[nr++] = bh;
2150 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2151
2152 if (fully_mapped)
2153 SetPageMappedToDisk(page);
2154
2155 if (!nr) {
2156 /*
2157 * All buffers are uptodate - we can set the page uptodate
2158 * as well. But not if get_block() returned an error.
2159 */
2160 if (!PageError(page))
2161 SetPageUptodate(page);
2162 unlock_page(page);
2163 return 0;
2164 }
2165
2166 /* Stage two: lock the buffers */
2167 for (i = 0; i < nr; i++) {
2168 bh = arr[i];
2169 lock_buffer(bh);
2170 mark_buffer_async_read(bh);
2171 }
2172
2173 /*
2174 * Stage 3: start the IO. Check for uptodateness
2175 * inside the buffer lock in case another process reading
2176 * the underlying blockdev brought it uptodate (the sct fix).
2177 */
2178 for (i = 0; i < nr; i++) {
2179 bh = arr[i];
2180 if (buffer_uptodate(bh))
2181 end_buffer_async_read(bh, 1);
2182 else
2183 submit_bh(READ, bh);
2184 }
2185 return 0;
2186}
2187EXPORT_SYMBOL(block_read_full_page);
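
/*
 * Example usage (a minimal sketch, not taken from this file): a simple
 * block-mapped filesystem typically implements ->readpage by wrapping
 * block_read_full_page() around its own block-mapping routine.
 * "foo_get_block" is a hypothetical helper with the get_block_t signature.
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, foo_get_block);
 *	}
 */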
2188
2189/* utility function for filesystems that need to do work on expanding
2190 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2191 * deal with the hole.
2192 */
2193int generic_cont_expand_simple(struct inode *inode, loff_t size)
2194{
2195 struct address_space *mapping = inode->i_mapping;
2196 struct page *page;
2197 void *fsdata;
2198 int err;
2199
2200 err = inode_newsize_ok(inode, size);
2201 if (err)
2202 goto out;
2203
2204 err = pagecache_write_begin(NULL, mapping, size, 0,
2205 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2206 &page, &fsdata);
2207 if (err)
2208 goto out;
2209
2210 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2211 BUG_ON(err > 0);
2212
2213out:
2214 return err;
2215}
2216EXPORT_SYMBOL(generic_cont_expand_simple);
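
/*
 * Example usage (sketch, assuming a hypothetical filesystem's ->setattr):
 * when a truncate grows the file, the new tail can be zeroed through the
 * pagecache before i_size is updated.
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		error = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (error)
 *			return error;
 *	}
 */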
2217
2218static int cont_expand_zero(struct file *file, struct address_space *mapping,
2219 loff_t pos, loff_t *bytes)
2220{
2221 struct inode *inode = mapping->host;
2222 unsigned blocksize = 1 << inode->i_blkbits;
2223 struct page *page;
2224 void *fsdata;
2225 pgoff_t index, curidx;
2226 loff_t curpos;
2227 unsigned zerofrom, offset, len;
2228 int err = 0;
2229
2230 index = pos >> PAGE_CACHE_SHIFT;
2231 offset = pos & ~PAGE_CACHE_MASK;
2232
2233 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2234 zerofrom = curpos & ~PAGE_CACHE_MASK;
2235 if (zerofrom & (blocksize-1)) {
2236 *bytes |= (blocksize-1);
2237 (*bytes)++;
2238 }
2239 len = PAGE_CACHE_SIZE - zerofrom;
2240
2241 err = pagecache_write_begin(file, mapping, curpos, len,
2242 AOP_FLAG_UNINTERRUPTIBLE,
2243 &page, &fsdata);
2244 if (err)
2245 goto out;
2246 zero_user(page, zerofrom, len);
2247 err = pagecache_write_end(file, mapping, curpos, len, len,
2248 page, fsdata);
2249 if (err < 0)
2250 goto out;
2251 BUG_ON(err != len);
2252 err = 0;
2253
2254 balance_dirty_pages_ratelimited(mapping);
2255 }
2256
2257 /* page covers the boundary, find the boundary offset */
2258 if (index == curidx) {
2259 zerofrom = curpos & ~PAGE_CACHE_MASK;
2260 /* if we will expand the file, the last block will be filled */
2261 if (offset <= zerofrom) {
2262 goto out;
2263 }
2264 if (zerofrom & (blocksize-1)) {
2265 *bytes |= (blocksize-1);
2266 (*bytes)++;
2267 }
2268 len = offset - zerofrom;
2269
2270 err = pagecache_write_begin(file, mapping, curpos, len,
2271 AOP_FLAG_UNINTERRUPTIBLE,
2272 &page, &fsdata);
2273 if (err)
2274 goto out;
2275 zero_user(page, zerofrom, len);
2276 err = pagecache_write_end(file, mapping, curpos, len, len,
2277 page, fsdata);
2278 if (err < 0)
2279 goto out;
2280 BUG_ON(err != len);
2281 err = 0;
2282 }
2283out:
2284 return err;
2285}
2286
2287/*
2288 * For moronic filesystems that do not allow holes in files.
2289 * We may have to extend the file.
2290 */
2291int cont_write_begin(struct file *file, struct address_space *mapping,
2292 loff_t pos, unsigned len, unsigned flags,
2293 struct page **pagep, void **fsdata,
2294 get_block_t *get_block, loff_t *bytes)
2295{
2296 struct inode *inode = mapping->host;
2297 unsigned blocksize = 1 << inode->i_blkbits;
2298 unsigned zerofrom;
2299 int err;
2300
2301 err = cont_expand_zero(file, mapping, pos, bytes);
2302 if (err)
2303 return err;
2304
2305 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2306 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2307 *bytes |= (blocksize-1);
2308 (*bytes)++;
2309 }
2310
2311 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2312}
2313EXPORT_SYMBOL(cont_write_begin);
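
/*
 * Example usage (sketch): a filesystem that cannot represent holes keeps a
 * "zeroed up to here" watermark in its per-inode data and passes its address
 * to cont_write_begin() from ->write_begin. "foo_get_block", "FOO_I" and the
 * "mmu_private" field are hypothetical names, not defined in this file.
 *
 *	static int foo_write_begin(struct file *file, struct address_space *mapping,
 *			loff_t pos, unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return cont_write_begin(file, mapping, pos, len, flags,
 *					pagep, fsdata, foo_get_block,
 *					&FOO_I(mapping->host)->mmu_private);
 *	}
 */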
2314
2315int block_commit_write(struct page *page, unsigned from, unsigned to)
2316{
2317 struct inode *inode = page->mapping->host;
2318 __block_commit_write(inode,page,from,to);
2319 return 0;
2320}
2321EXPORT_SYMBOL(block_commit_write);
2322
2323/*
2324 * block_page_mkwrite() is not allowed to change the file size as it gets
2325 * called from a page fault handler when a page is first dirtied. Hence we must
2326 * be careful to check for EOF conditions here. We set the page up correctly
2327 * for a written page which means we get ENOSPC checking when writing into
2328 * holes and correct delalloc and unwritten extent mapping on filesystems that
2329 * support these features.
2330 *
2331 * We are not allowed to take the i_mutex here so we have to play games to
2332 * protect against truncate races as the page could now be beyond EOF. Because
2333 * truncate writes the inode size before removing pages, once we have the
2334 * page lock we can determine safely if the page is beyond EOF. If it is not
2335 * beyond EOF, then the page is guaranteed safe against truncation until we
2336 * unlock the page.
2337 *
2338 * Direct callers of this function should call vfs_check_frozen() so that page
2339 * fault does not busyloop until the fs is thawed.
2340 */
2341int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2342 get_block_t get_block)
2343{
2344 struct page *page = vmf->page;
2345 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2346 unsigned long end;
2347 loff_t size;
2348 int ret;
2349
2350 lock_page(page);
2351 size = i_size_read(inode);
2352 if ((page->mapping != inode->i_mapping) ||
2353 (page_offset(page) > size)) {
2354 /* We overload EFAULT to mean page got truncated */
2355 ret = -EFAULT;
2356 goto out_unlock;
2357 }
2358
2359 /* page is wholly or partially inside EOF */
2360 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2361 end = size & ~PAGE_CACHE_MASK;
2362 else
2363 end = PAGE_CACHE_SIZE;
2364
2365 ret = __block_write_begin(page, 0, end, get_block);
2366 if (!ret)
2367 ret = block_commit_write(page, 0, end);
2368
2369 if (unlikely(ret < 0))
2370 goto out_unlock;
2371 /*
2372 * Freezing in progress? We check after the page is marked dirty and
2373 * with page lock held so if the test here fails, we are sure freezing
2374 * code will wait during syncing until the page fault is done - at that
2375 * point page will be dirty and unlocked so freezing code will write it
2376 * and writeprotect it again.
2377 */
2378 set_page_dirty(page);
2379 if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2380 ret = -EAGAIN;
2381 goto out_unlock;
2382 }
2383 wait_on_page_writeback(page);
2384 return 0;
2385out_unlock:
2386 unlock_page(page);
2387 return ret;
2388}
2389EXPORT_SYMBOL(__block_page_mkwrite);
2390
2391int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2392 get_block_t get_block)
2393{
2394 int ret;
2395 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2396
2397 /*
2398 * This check is racy but catches the common case. The check in
2399 * __block_page_mkwrite() is reliable.
2400 */
2401 vfs_check_frozen(sb, SB_FREEZE_WRITE);
2402 ret = __block_page_mkwrite(vma, vmf, get_block);
2403 return block_page_mkwrite_return(ret);
2404}
2405EXPORT_SYMBOL(block_page_mkwrite);
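
/*
 * Example usage (sketch): filesystems usually expose this through their
 * vm_operations_struct for mmap'ed files; "foo_get_block" and
 * foo_page_mkwrite are hypothetical names.
 *
 *	static int foo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		return block_page_mkwrite(vma, vmf, foo_get_block);
 *	}
 *
 *	static const struct vm_operations_struct foo_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.page_mkwrite	= foo_page_mkwrite,
 *	};
 */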
2406
2407/*
2408 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2409 * immediately, while under the page lock. So it needs a special end_io
2410 * handler which does not touch the bh after unlocking it.
2411 */
2412static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2413{
2414 __end_buffer_read_notouch(bh, uptodate);
2415}
2416
2417/*
2418 * Attach the singly-linked list of buffers created by nobh_write_begin to
2419 * the page (converting it to a circular linked list and taking care of page
2420 * dirty races).
2421 */
2422static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2423{
2424 struct buffer_head *bh;
2425
2426 BUG_ON(!PageLocked(page));
2427
2428 spin_lock(&page->mapping->private_lock);
2429 bh = head;
2430 do {
2431 if (PageDirty(page))
2432 set_buffer_dirty(bh);
2433 if (!bh->b_this_page)
2434 bh->b_this_page = head;
2435 bh = bh->b_this_page;
2436 } while (bh != head);
2437 attach_page_buffers(page, head);
2438 spin_unlock(&page->mapping->private_lock);
2439}
2440
2441/*
2442 * On entry, the page is fully not uptodate.
2443 * On exit the page is fully uptodate in the areas outside (from,to)
2444 * The filesystem needs to handle block truncation upon failure.
2445 */
2446int nobh_write_begin(struct address_space *mapping,
2447 loff_t pos, unsigned len, unsigned flags,
2448 struct page **pagep, void **fsdata,
2449 get_block_t *get_block)
2450{
2451 struct inode *inode = mapping->host;
2452 const unsigned blkbits = inode->i_blkbits;
2453 const unsigned blocksize = 1 << blkbits;
2454 struct buffer_head *head, *bh;
2455 struct page *page;
2456 pgoff_t index;
2457 unsigned from, to;
2458 unsigned block_in_page;
2459 unsigned block_start, block_end;
2460 sector_t block_in_file;
2461 int nr_reads = 0;
2462 int ret = 0;
2463 int is_mapped_to_disk = 1;
2464
2465 index = pos >> PAGE_CACHE_SHIFT;
2466 from = pos & (PAGE_CACHE_SIZE - 1);
2467 to = from + len;
2468
2469 page = grab_cache_page_write_begin(mapping, index, flags);
2470 if (!page)
2471 return -ENOMEM;
2472 *pagep = page;
2473 *fsdata = NULL;
2474
2475 if (page_has_buffers(page)) {
2476 ret = __block_write_begin(page, pos, len, get_block);
2477 if (unlikely(ret))
2478 goto out_release;
2479 return ret;
2480 }
2481
2482 if (PageMappedToDisk(page))
2483 return 0;
2484
2485 /*
2486 * Allocate buffers so that we can keep track of state, and potentially
2487 * attach them to the page if an error occurs. In the common case of
2488 * no error, they will just be freed again without ever being attached
2489 * to the page (which is all OK, because we're under the page lock).
2490 *
2491 * Be careful: the buffer linked list is a NULL terminated one, rather
2492 * than the circular one we're used to.
2493 */
2494 head = alloc_page_buffers(page, blocksize, 0);
2495 if (!head) {
2496 ret = -ENOMEM;
2497 goto out_release;
2498 }
2499
2500 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2501
2502 /*
2503 * We loop across all blocks in the page, whether or not they are
2504 * part of the affected region. This is so we can discover if the
2505 * page is fully mapped-to-disk.
2506 */
2507 for (block_start = 0, block_in_page = 0, bh = head;
2508 block_start < PAGE_CACHE_SIZE;
2509 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2510 int create;
2511
2512 block_end = block_start + blocksize;
2513 bh->b_state = 0;
2514 create = 1;
2515 if (block_start >= to)
2516 create = 0;
2517 ret = get_block(inode, block_in_file + block_in_page,
2518 bh, create);
2519 if (ret)
2520 goto failed;
2521 if (!buffer_mapped(bh))
2522 is_mapped_to_disk = 0;
2523 if (buffer_new(bh))
2524 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2525 if (PageUptodate(page)) {
2526 set_buffer_uptodate(bh);
2527 continue;
2528 }
2529 if (buffer_new(bh) || !buffer_mapped(bh)) {
2530 zero_user_segments(page, block_start, from,
2531 to, block_end);
2532 continue;
2533 }
2534 if (buffer_uptodate(bh))
2535 continue; /* reiserfs does this */
2536 if (block_start < from || block_end > to) {
2537 lock_buffer(bh);
2538 bh->b_end_io = end_buffer_read_nobh;
2539 submit_bh(READ, bh);
2540 nr_reads++;
2541 }
2542 }
2543
2544 if (nr_reads) {
2545 /*
2546 * The page is locked, so these buffers are protected from
2547 * any VM or truncate activity. Hence we don't need to care
2548 * for the buffer_head refcounts.
2549 */
2550 for (bh = head; bh; bh = bh->b_this_page) {
2551 wait_on_buffer(bh);
2552 if (!buffer_uptodate(bh))
2553 ret = -EIO;
2554 }
2555 if (ret)
2556 goto failed;
2557 }
2558
2559 if (is_mapped_to_disk)
2560 SetPageMappedToDisk(page);
2561
2562 *fsdata = head; /* to be released by nobh_write_end */
2563
2564 return 0;
2565
2566failed:
2567 BUG_ON(!ret);
2568 /*
2569 * Error recovery is a bit difficult. We need to zero out blocks that
2570 * were newly allocated, and dirty them to ensure they get written out.
2571 * Buffers need to be attached to the page at this point, otherwise
2572 * the handling of potential IO errors during writeout would be hard
2573 * (could try doing synchronous writeout, but what if that fails too?)
2574 */
2575 attach_nobh_buffers(page, head);
2576 page_zero_new_buffers(page, from, to);
2577
2578out_release:
2579 unlock_page(page);
2580 page_cache_release(page);
2581 *pagep = NULL;
2582
2583 return ret;
2584}
2585EXPORT_SYMBOL(nobh_write_begin);
2586
2587int nobh_write_end(struct file *file, struct address_space *mapping,
2588 loff_t pos, unsigned len, unsigned copied,
2589 struct page *page, void *fsdata)
2590{
2591 struct inode *inode = page->mapping->host;
2592 struct buffer_head *head = fsdata;
2593 struct buffer_head *bh;
2594 BUG_ON(fsdata != NULL && page_has_buffers(page));
2595
2596 if (unlikely(copied < len) && head)
2597 attach_nobh_buffers(page, head);
2598 if (page_has_buffers(page))
2599 return generic_write_end(file, mapping, pos, len,
2600 copied, page, fsdata);
2601
2602 SetPageUptodate(page);
2603 set_page_dirty(page);
2604 if (pos+copied > inode->i_size) {
2605 i_size_write(inode, pos+copied);
2606 mark_inode_dirty(inode);
2607 }
2608
2609 unlock_page(page);
2610 page_cache_release(page);
2611
2612 while (head) {
2613 bh = head;
2614 head = head->b_this_page;
2615 free_buffer_head(bh);
2616 }
2617
2618 return copied;
2619}
2620EXPORT_SYMBOL(nobh_write_end);
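
/*
 * Example usage (sketch): the nobh_* helpers are wired up as a pair from a
 * filesystem's "nobh" address_space_operations. "foo_get_block" and the
 * foo_* wrappers are hypothetical names.
 *
 *	static int foo_nobh_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos, unsigned len,
 *			unsigned flags, struct page **pagep, void **fsdata)
 *	{
 *		return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
 *					foo_get_block);
 *	}
 *
 *	static const struct address_space_operations foo_nobh_aops = {
 *		.write_begin	= foo_nobh_write_begin,
 *		.write_end	= nobh_write_end,
 *	};
 */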
2621
2622/*
2623 * nobh_writepage() - based on block_write_full_page() except
2624 * that it tries to operate without attaching bufferheads to
2625 * the page.
2626 */
2627int nobh_writepage(struct page *page, get_block_t *get_block,
2628 struct writeback_control *wbc)
2629{
2630 struct inode * const inode = page->mapping->host;
2631 loff_t i_size = i_size_read(inode);
2632 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2633 unsigned offset;
2634 int ret;
2635
2636 /* Is the page fully inside i_size? */
2637 if (page->index < end_index)
2638 goto out;
2639
2640 /* Is the page fully outside i_size? (truncate in progress) */
2641 offset = i_size & (PAGE_CACHE_SIZE-1);
2642 if (page->index >= end_index+1 || !offset) {
2643 /*
2644 * The page may have dirty, unmapped buffers. For example,
2645 * they may have been added in ext3_writepage(). Make them
2646 * freeable here, so the page does not leak.
2647 */
2648#if 0
2649 /* Not really sure about this - do we need this ? */
2650 if (page->mapping->a_ops->invalidatepage)
2651 page->mapping->a_ops->invalidatepage(page, offset);
2652#endif
2653 unlock_page(page);
2654 return 0; /* don't care */
2655 }
2656
2657 /*
2658 * The page straddles i_size. It must be zeroed out on each and every
2659 * writepage invocation because it may be mmapped. "A file is mapped
2660 * in multiples of the page size. For a file that is not a multiple of
2661 * the page size, the remaining memory is zeroed when mapped, and
2662 * writes to that region are not written out to the file."
2663 */
2664 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2665out:
2666 ret = mpage_writepage(page, get_block, wbc);
2667 if (ret == -EAGAIN)
2668 ret = __block_write_full_page(inode, page, get_block, wbc,
2669 end_buffer_async_write);
2670 return ret;
2671}
2672EXPORT_SYMBOL(nobh_writepage);
2673
2674int nobh_truncate_page(struct address_space *mapping,
2675 loff_t from, get_block_t *get_block)
2676{
2677 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2678 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2679 unsigned blocksize;
2680 sector_t iblock;
2681 unsigned length, pos;
2682 struct inode *inode = mapping->host;
2683 struct page *page;
2684 struct buffer_head map_bh;
2685 int err;
2686
2687 blocksize = 1 << inode->i_blkbits;
2688 length = offset & (blocksize - 1);
2689
2690 /* Block boundary? Nothing to do */
2691 if (!length)
2692 return 0;
2693
2694 length = blocksize - length;
2695 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2696
2697 page = grab_cache_page(mapping, index);
2698 err = -ENOMEM;
2699 if (!page)
2700 goto out;
2701
2702 if (page_has_buffers(page)) {
2703has_buffers:
2704 unlock_page(page);
2705 page_cache_release(page);
2706 return block_truncate_page(mapping, from, get_block);
2707 }
2708
2709 /* Find the buffer that contains "offset" */
2710 pos = blocksize;
2711 while (offset >= pos) {
2712 iblock++;
2713 pos += blocksize;
2714 }
2715
2716 map_bh.b_size = blocksize;
2717 map_bh.b_state = 0;
2718 err = get_block(inode, iblock, &map_bh, 0);
2719 if (err)
2720 goto unlock;
2721 /* unmapped? It's a hole - nothing to do */
2722 if (!buffer_mapped(&map_bh))
2723 goto unlock;
2724
2725 /* Ok, it's mapped. Make sure it's up-to-date */
2726 if (!PageUptodate(page)) {
2727 err = mapping->a_ops->readpage(NULL, page);
2728 if (err) {
2729 page_cache_release(page);
2730 goto out;
2731 }
2732 lock_page(page);
2733 if (!PageUptodate(page)) {
2734 err = -EIO;
2735 goto unlock;
2736 }
2737 if (page_has_buffers(page))
2738 goto has_buffers;
2739 }
2740 zero_user(page, offset, length);
2741 set_page_dirty(page);
2742 err = 0;
2743
2744unlock:
2745 unlock_page(page);
2746 page_cache_release(page);
2747out:
2748 return err;
2749}
2750EXPORT_SYMBOL(nobh_truncate_page);
2751
2752int block_truncate_page(struct address_space *mapping,
2753 loff_t from, get_block_t *get_block)
2754{
2755 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2756 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2757 unsigned blocksize;
2758 sector_t iblock;
2759 unsigned length, pos;
2760 struct inode *inode = mapping->host;
2761 struct page *page;
2762 struct buffer_head *bh;
2763 int err;
2764
2765 blocksize = 1 << inode->i_blkbits;
2766 length = offset & (blocksize - 1);
2767
2768 /* Block boundary? Nothing to do */
2769 if (!length)
2770 return 0;
2771
2772 length = blocksize - length;
2773 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2774
2775 page = grab_cache_page(mapping, index);
2776 err = -ENOMEM;
2777 if (!page)
2778 goto out;
2779
2780 if (!page_has_buffers(page))
2781 create_empty_buffers(page, blocksize, 0);
2782
2783 /* Find the buffer that contains "offset" */
2784 bh = page_buffers(page);
2785 pos = blocksize;
2786 while (offset >= pos) {
2787 bh = bh->b_this_page;
2788 iblock++;
2789 pos += blocksize;
2790 }
2791
2792 err = 0;
2793 if (!buffer_mapped(bh)) {
2794 WARN_ON(bh->b_size != blocksize);
2795 err = get_block(inode, iblock, bh, 0);
2796 if (err)
2797 goto unlock;
2798 /* unmapped? It's a hole - nothing to do */
2799 if (!buffer_mapped(bh))
2800 goto unlock;
2801 }
2802
2803 /* Ok, it's mapped. Make sure it's up-to-date */
2804 if (PageUptodate(page))
2805 set_buffer_uptodate(bh);
2806
2807 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2808 err = -EIO;
2809 ll_rw_block(READ, 1, &bh);
2810 wait_on_buffer(bh);
2811 /* Uhhuh. Read error. Complain and punt. */
2812 if (!buffer_uptodate(bh))
2813 goto unlock;
2814 }
2815
2816 zero_user(page, offset, length);
2817 mark_buffer_dirty(bh);
2818 err = 0;
2819
2820unlock:
2821 unlock_page(page);
2822 page_cache_release(page);
2823out:
2824 return err;
2825}
2826EXPORT_SYMBOL(block_truncate_page);
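
/*
 * Example usage (sketch): a filesystem's truncate path typically zeroes the
 * tail of the last partial block before shrinking i_size. "foo_get_block"
 * and "newsize" are hypothetical names standing in for the caller's values.
 *
 *	err = block_truncate_page(inode->i_mapping, newsize, foo_get_block);
 *	if (err)
 *		return err;
 *	truncate_setsize(inode, newsize);
 */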
2827
2828/*
2829 * The generic ->writepage function for buffer-backed address_spaces;
2830 * this form passes in the end_io handler used to finish the IO.
2831 */
2832int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2833 struct writeback_control *wbc, bh_end_io_t *handler)
2834{
2835 struct inode * const inode = page->mapping->host;
2836 loff_t i_size = i_size_read(inode);
2837 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2838 unsigned offset;
2839
2840 /* Is the page fully inside i_size? */
2841 if (page->index < end_index)
2842 return __block_write_full_page(inode, page, get_block, wbc,
2843 handler);
2844
2845 /* Is the page fully outside i_size? (truncate in progress) */
2846 offset = i_size & (PAGE_CACHE_SIZE-1);
2847 if (page->index >= end_index+1 || !offset) {
2848 /*
2849 * The page may have dirty, unmapped buffers. For example,
2850 * they may have been added in ext3_writepage(). Make them
2851 * freeable here, so the page does not leak.
2852 */
2853 do_invalidatepage(page, 0);
2854 unlock_page(page);
2855 return 0; /* don't care */
2856 }
2857
2858 /*
2859 * The page straddles i_size. It must be zeroed out on each and every
2860 * writepage invocation because it may be mmapped. "A file is mapped
2861 * in multiples of the page size. For a file that is not a multiple of
2862 * the page size, the remaining memory is zeroed when mapped, and
2863 * writes to that region are not written out to the file."
2864 */
2865 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2866 return __block_write_full_page(inode, page, get_block, wbc, handler);
2867}
2868EXPORT_SYMBOL(block_write_full_page_endio);
2869
2870/*
2871 * The generic ->writepage function for buffer-backed address_spaces
2872 */
2873int block_write_full_page(struct page *page, get_block_t *get_block,
2874 struct writeback_control *wbc)
2875{
2876 return block_write_full_page_endio(page, get_block, wbc,
2877 end_buffer_async_write);
2878}
2879EXPORT_SYMBOL(block_write_full_page);
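
/*
 * Example usage (sketch): the common ->writepage for a block-mapped
 * filesystem; "foo_get_block" is a hypothetical helper.
 *
 *	static int foo_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, foo_get_block, wbc);
 *	}
 */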
2880
2881sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2882 get_block_t *get_block)
2883{
2884 struct buffer_head tmp;
2885 struct inode *inode = mapping->host;
2886 tmp.b_state = 0;
2887 tmp.b_blocknr = 0;
2888 tmp.b_size = 1 << inode->i_blkbits;
2889 get_block(inode, block, &tmp, 0);
2890 return tmp.b_blocknr;
2891}
2892EXPORT_SYMBOL(generic_block_bmap);
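
/*
 * Example usage (sketch): ->bmap is usually just this helper wrapped around
 * the filesystem's block-mapping routine ("foo_get_block" is hypothetical).
 *
 *	static sector_t foo_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, foo_get_block);
 *	}
 */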
2893
2894static void end_bio_bh_io_sync(struct bio *bio, int err)
2895{
2896 struct buffer_head *bh = bio->bi_private;
2897
2898 if (err == -EOPNOTSUPP) {
2899 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2900 }
2901
2902 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2903 set_bit(BH_Quiet, &bh->b_state);
2904
2905 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2906 bio_put(bio);
2907}
2908
2909int submit_bh(int rw, struct buffer_head * bh)
2910{
2911 struct bio *bio;
2912 int ret = 0;
2913
2914 BUG_ON(!buffer_locked(bh));
2915 BUG_ON(!buffer_mapped(bh));
2916 BUG_ON(!bh->b_end_io);
2917 BUG_ON(buffer_delay(bh));
2918 BUG_ON(buffer_unwritten(bh));
2919
2920 /*
2921 * Only clear out a write error when rewriting
2922 */
2923 if (test_set_buffer_req(bh) && (rw & WRITE))
2924 clear_buffer_write_io_error(bh);
2925
2926 /*
2927 * from here on down, it's all bio -- do the initial mapping,
2928 * submit_bio -> generic_make_request may further map this bio around
2929 */
2930 bio = bio_alloc(GFP_NOIO, 1);
2931
2932 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2933 bio->bi_bdev = bh->b_bdev;
2934 bio->bi_io_vec[0].bv_page = bh->b_page;
2935 bio->bi_io_vec[0].bv_len = bh->b_size;
2936 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2937
2938 bio->bi_vcnt = 1;
2939 bio->bi_idx = 0;
2940 bio->bi_size = bh->b_size;
2941
2942 bio->bi_end_io = end_bio_bh_io_sync;
2943 bio->bi_private = bh;
2944
2945 bio_get(bio);
2946 submit_bio(rw, bio);
2947
2948 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2949 ret = -EOPNOTSUPP;
2950
2951 bio_put(bio);
2952 return ret;
2953}
2954EXPORT_SYMBOL(submit_bh);
2955
2956/**
2957 * ll_rw_block: low-level access to block devices (DEPRECATED)
2958 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2959 * @nr: number of &struct buffer_heads in the array
2960 * @bhs: array of pointers to &struct buffer_head
2961 *
2962 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2963 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2964 * %READA option is described in the documentation for generic_make_request()
2965 * which ll_rw_block() calls.
2966 *
2967 * This function drops any buffer that it cannot get a lock on (with the
2968 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2969 * request, and any buffer that appears to be up-to-date when doing a read
2970 * request. Further, it marks as clean any buffers that are processed for
2971 * writing (the buffer cache won't assume that they are actually clean
2972 * until the buffer gets unlocked).
2973 *
2974 * ll_rw_block sets b_end_io to a simple completion handler that marks
2975 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2976 * any waiters.
2977 *
2978 * All of the buffers must be for the same device, and must also be a
2979 * multiple of the current approved size for the device.
2980 */
2981void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2982{
2983 int i;
2984
2985 for (i = 0; i < nr; i++) {
2986 struct buffer_head *bh = bhs[i];
2987
2988 if (!trylock_buffer(bh))
2989 continue;
2990 if (rw == WRITE) {
2991 if (test_clear_buffer_dirty(bh)) {
2992 bh->b_end_io = end_buffer_write_sync;
2993 get_bh(bh);
2994 submit_bh(WRITE, bh);
2995 continue;
2996 }
2997 } else {
2998 if (!buffer_uptodate(bh)) {
2999 bh->b_end_io = end_buffer_read_sync;
3000 get_bh(bh);
3001 submit_bh(rw, bh);
3002 continue;
3003 }
3004 }
3005 unlock_buffer(bh);
3006 }
3007}
3008EXPORT_SYMBOL(ll_rw_block);
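
/*
 * Example usage (sketch): reading one metadata buffer synchronously with
 * ll_rw_block(); the caller is assumed to already hold a reference on bh.
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			return -EIO;	/* read failed */
 *	}
 */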
3009
3010void write_dirty_buffer(struct buffer_head *bh, int rw)
3011{
3012 lock_buffer(bh);
3013 if (!test_clear_buffer_dirty(bh)) {
3014 unlock_buffer(bh);
3015 return;
3016 }
3017 bh->b_end_io = end_buffer_write_sync;
3018 get_bh(bh);
3019 submit_bh(rw, bh);
3020}
3021EXPORT_SYMBOL(write_dirty_buffer);
3022
3023/*
3024 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3025 * and then start new I/O and then wait upon it. The caller must have a ref on
3026 * the buffer_head.
3027 */
3028int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3029{
3030 int ret = 0;
3031
3032 WARN_ON(atomic_read(&bh->b_count) < 1);
3033 lock_buffer(bh);
3034 if (test_clear_buffer_dirty(bh)) {
3035 get_bh(bh);
3036 bh->b_end_io = end_buffer_write_sync;
3037 ret = submit_bh(rw, bh);
3038 wait_on_buffer(bh);
3039 if (!ret && !buffer_uptodate(bh))
3040 ret = -EIO;
3041 } else {
3042 unlock_buffer(bh);
3043 }
3044 return ret;
3045}
3046EXPORT_SYMBOL(__sync_dirty_buffer);
3047
3048int sync_dirty_buffer(struct buffer_head *bh)
3049{
3050 return __sync_dirty_buffer(bh, WRITE_SYNC);
3051}
3052EXPORT_SYMBOL(sync_dirty_buffer);
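
/*
 * Example usage (sketch): modifying an on-disk metadata block and writing it
 * out synchronously. sb_bread() is the usual way to obtain the buffer;
 * "block_nr", "offset", "data" and "size" stand in for the caller's values.
 *
 *	struct buffer_head *bh = sb_bread(sb, block_nr);
 *
 *	if (!bh)
 *		return -EIO;
 *	memcpy(bh->b_data + offset, data, size);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);	/* submit the write and wait for it */
 *	brelse(bh);
 */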
3053
3054/*
3055 * try_to_free_buffers() checks if all the buffers on this particular page
3056 * are unused, and releases them if so.
3057 *
3058 * Exclusion against try_to_free_buffers may be obtained by either
3059 * locking the page or by holding its mapping's private_lock.
3060 *
3061 * If the page is dirty but all the buffers are clean then we need to
3062 * be sure to mark the page clean as well. This is because the page
3063 * may be against a block device, and a later reattachment of buffers
3064 * to a dirty page will set *all* buffers dirty. Which would corrupt
3065 * filesystem data on the same device.
3066 *
3067 * The same applies to regular filesystem pages: if all the buffers are
3068 * clean then we set the page clean and proceed. To do that, we require
3069 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3070 * private_lock.
3071 *
3072 * try_to_free_buffers() is non-blocking.
3073 */
3074static inline int buffer_busy(struct buffer_head *bh)
3075{
3076 return atomic_read(&bh->b_count) |
3077 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3078}
3079
3080static int
3081drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3082{
3083 struct buffer_head *head = page_buffers(page);
3084 struct buffer_head *bh;
3085
3086 bh = head;
3087 do {
3088 if (buffer_write_io_error(bh) && page->mapping)
3089 set_bit(AS_EIO, &page->mapping->flags);
3090 if (buffer_busy(bh))
3091 goto failed;
3092 bh = bh->b_this_page;
3093 } while (bh != head);
3094
3095 do {
3096 struct buffer_head *next = bh->b_this_page;
3097
3098 if (bh->b_assoc_map)
3099 __remove_assoc_queue(bh);
3100 bh = next;
3101 } while (bh != head);
3102 *buffers_to_free = head;
3103 __clear_page_buffers(page);
3104 return 1;
3105failed:
3106 return 0;
3107}
3108
3109int try_to_free_buffers(struct page *page)
3110{
3111 struct address_space * const mapping = page->mapping;
3112 struct buffer_head *buffers_to_free = NULL;
3113 int ret = 0;
3114
3115 BUG_ON(!PageLocked(page));
3116 if (PageWriteback(page))
3117 return 0;
3118
3119 if (mapping == NULL) { /* can this still happen? */
3120 ret = drop_buffers(page, &buffers_to_free);
3121 goto out;
3122 }
3123
3124 spin_lock(&mapping->private_lock);
3125 ret = drop_buffers(page, &buffers_to_free);
3126
3127 /*
3128 * If the filesystem writes its buffers by hand (eg ext3)
3129 * then we can have clean buffers against a dirty page. We
3130 * clean the page here; otherwise the VM will never notice
3131 * that the filesystem did any IO at all.
3132 *
3133 * Also, during truncate, discard_buffer will have marked all
3134 * the page's buffers clean. We discover that here and clean
3135 * the page also.
3136 *
3137 * private_lock must be held over this entire operation in order
3138 * to synchronise against __set_page_dirty_buffers and prevent the
3139 * dirty bit from being lost.
3140 */
3141 if (ret)
3142 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3143 spin_unlock(&mapping->private_lock);
3144out:
3145 if (buffers_to_free) {
3146 struct buffer_head *bh = buffers_to_free;
3147
3148 do {
3149 struct buffer_head *next = bh->b_this_page;
3150 free_buffer_head(bh);
3151 bh = next;
3152 } while (bh != buffers_to_free);
3153 }
3154 return ret;
3155}
3156EXPORT_SYMBOL(try_to_free_buffers);
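
/*
 * Example usage (sketch): a filesystem with no extra per-page state can make
 * its ->releasepage a thin wrapper around try_to_free_buffers();
 * foo_releasepage is a hypothetical name.
 *
 *	static int foo_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */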
3157
3158/*
3159 * There are no bdflush tunables left. But distributions are
3160 * still running obsolete flush daemons, so we terminate them here.
3161 *
3162 * Use of bdflush() is deprecated and will be removed in a future kernel.
3163 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3164 */
3165SYSCALL_DEFINE2(bdflush, int, func, long, data)
3166{
3167 static int msg_count;
3168
3169 if (!capable(CAP_SYS_ADMIN))
3170 return -EPERM;
3171
3172 if (msg_count < 5) {
3173 msg_count++;
3174 printk(KERN_INFO
3175 "warning: process `%s' used the obsolete bdflush"
3176 " system call\n", current->comm);
3177 printk(KERN_INFO "Fix your initscripts?\n");
3178 }
3179
3180 if (func == 1)
3181 do_exit(0);
3182 return 0;
3183}
3184
3185/*
3186 * Buffer-head allocation
3187 */
3188static struct kmem_cache *bh_cachep;
3189
3190/*
3191 * Once the number of bh's in the machine exceeds this level, we start
3192 * stripping them in writeback.
3193 */
3194static int max_buffer_heads;
3195
3196int buffer_heads_over_limit;
3197
3198struct bh_accounting {
3199 int nr; /* Number of live bh's */
3200 int ratelimit; /* Limit cacheline bouncing */
3201};
3202
3203static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3204
3205static void recalc_bh_state(void)
3206{
3207 int i;
3208 int tot = 0;
3209
3210 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3211 return;
3212 __this_cpu_write(bh_accounting.ratelimit, 0);
3213 for_each_online_cpu(i)
3214 tot += per_cpu(bh_accounting, i).nr;
3215 buffer_heads_over_limit = (tot > max_buffer_heads);
3216}
3217
3218struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3219{
3220 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3221 if (ret) {
3222 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3223 preempt_disable();
3224 __this_cpu_inc(bh_accounting.nr);
3225 recalc_bh_state();
3226 preempt_enable();
3227 }
3228 return ret;
3229}
3230EXPORT_SYMBOL(alloc_buffer_head);
3231
3232void free_buffer_head(struct buffer_head *bh)
3233{
3234 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3235 kmem_cache_free(bh_cachep, bh);
3236 preempt_disable();
3237 __this_cpu_dec(bh_accounting.nr);
3238 recalc_bh_state();
3239 preempt_enable();
3240}
3241EXPORT_SYMBOL(free_buffer_head);
3242
3243static void buffer_exit_cpu(int cpu)
3244{
3245 int i;
3246 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3247
3248 for (i = 0; i < BH_LRU_SIZE; i++) {
3249 brelse(b->bhs[i]);
3250 b->bhs[i] = NULL;
3251 }
3252 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3253 per_cpu(bh_accounting, cpu).nr = 0;
3254}
3255
3256static int buffer_cpu_notify(struct notifier_block *self,
3257 unsigned long action, void *hcpu)
3258{
3259 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3260 buffer_exit_cpu((unsigned long)hcpu);
3261 return NOTIFY_OK;
3262}
3263
3264/**
3265 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3266 * @bh: struct buffer_head
3267 *
3268 * Return true if the buffer is up-to-date; otherwise return false
3269 * with the buffer locked.
3270 */
3271int bh_uptodate_or_lock(struct buffer_head *bh)
3272{
3273 if (!buffer_uptodate(bh)) {
3274 lock_buffer(bh);
3275 if (!buffer_uptodate(bh))
3276 return 0;
3277 unlock_buffer(bh);
3278 }
3279 return 1;
3280}
3281EXPORT_SYMBOL(bh_uptodate_or_lock);
3282
3283/**
3284 * bh_submit_read - Submit a locked buffer for reading
3285 * @bh: struct buffer_head
3286 *
3287 * Returns zero on success and -EIO on error.
3288 */
3289int bh_submit_read(struct buffer_head *bh)
3290{
3291 BUG_ON(!buffer_locked(bh));
3292
3293 if (buffer_uptodate(bh)) {
3294 unlock_buffer(bh);
3295 return 0;
3296 }
3297
3298 get_bh(bh);
3299 bh->b_end_io = end_buffer_read_sync;
3300 submit_bh(READ, bh);
3301 wait_on_buffer(bh);
3302 if (buffer_uptodate(bh))
3303 return 0;
3304 return -EIO;
3305}
3306EXPORT_SYMBOL(bh_submit_read);
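
/*
 * Example usage (sketch): bh_uptodate_or_lock() and bh_submit_read() are
 * meant to be used together when a caller already holds a buffer_head and
 * needs its contents brought up to date.
 *
 *	if (!bh_uptodate_or_lock(bh)) {
 *		if (bh_submit_read(bh) < 0) {
 *			brelse(bh);
 *			return -EIO;
 *		}
 *	}
 */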
3307
3308void __init buffer_init(void)
3309{
3310 int nrpages;
3311
3312 bh_cachep = kmem_cache_create("buffer_head",
3313 sizeof(struct buffer_head), 0,
3314 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3315 SLAB_MEM_SPREAD),
3316 NULL);
3317
3318 /*
3319 * Limit the bh occupancy to 10% of ZONE_NORMAL
3320 */
3321 nrpages = (nr_free_buffer_pages() * 10) / 100;
3322 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3323 hotcpu_notifier(buffer_cpu_notify, 0);
3324}