buffer.c - fs/buffer.c - Linux diff v6.13.7 - Bootlin Elixir Cross Referencer

   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/fs/buffer.c
   4 *
   5 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   6 */
   7
   8/*
   9 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
  10 *
  11 * Removed a lot of unnecessary code and simplified things now that
  12 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  13 *
  14 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  15 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  16 *
  17 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  18 *
  19 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  20 */
  21
  22#include <linux/kernel.h>
  23#include <linux/sched/signal.h>
  24#include <linux/syscalls.h>
  25#include <linux/fs.h>
  26#include <linux/iomap.h>
  27#include <linux/mm.h>
  28#include <linux/percpu.h>
  29#include <linux/slab.h>
  30#include <linux/capability.h>
  31#include <linux/blkdev.h>
  32#include <linux/file.h>
  33#include <linux/quotaops.h>
  34#include <linux/highmem.h>
  35#include <linux/export.h>
  36#include <linux/backing-dev.h>
  37#include <linux/writeback.h>
  38#include <linux/hash.h>
  39#include <linux/suspend.h>
  40#include <linux/buffer_head.h>
  41#include <linux/task_io_accounting_ops.h>
  42#include <linux/bio.h>
 
  43#include <linux/cpu.h>
  44#include <linux/bitops.h>
  45#include <linux/mpage.h>
  46#include <linux/bit_spinlock.h>
  47#include <linux/pagevec.h>
  48#include <linux/sched/mm.h>
  49#include <trace/events/block.h>
  50#include <linux/fscrypt.h>
  51#include <linux/fsverity.h>
  52#include <linux/sched/isolation.h>
  53
  54#include "internal.h"
  55
/* Forward declarations of file-local helpers used before their definitions. */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
			  enum rw_hint hint, struct writeback_control *wbc);

/*
 * Map a list node that is embedded in a buffer_head as ->b_assoc_buffers
 * back to the buffer_head that contains it.
 */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  61
  62inline void touch_buffer(struct buffer_head *bh)
 
 
 
 
 
 
 
 
  63{
  64	trace_block_touch_buffer(bh);
  65	folio_mark_accessed(bh->b_folio);
  66}
  67EXPORT_SYMBOL(touch_buffer);
  68
/*
 * Take the BH_Lock bit in bh->b_state through wait_on_bit_lock_io(),
 * waiting in TASK_UNINTERRUPTIBLE state.  The return value of the wait
 * helper is not examined.
 */
void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
  74
  75void unlock_buffer(struct buffer_head *bh)
  76{
  77	clear_bit_unlock(BH_Lock, &bh->b_state);
  78	smp_mb__after_atomic();
  79	wake_up_bit(&bh->b_state, BH_Lock);
  80}
  81EXPORT_SYMBOL(unlock_buffer);
  82
  83/*
  84 * Returns if the folio has dirty or writeback buffers. If all the buffers
  85 * are unlocked and clean then the folio_test_dirty information is stale. If
  86 * any of the buffers are locked, it is assumed they are locked for IO.
  87 */
  88void buffer_check_dirty_writeback(struct folio *folio,
  89				     bool *dirty, bool *writeback)
  90{
  91	struct buffer_head *head, *bh;
  92	*dirty = false;
  93	*writeback = false;
  94
  95	BUG_ON(!folio_test_locked(folio));
  96
  97	head = folio_buffers(folio);
  98	if (!head)
  99		return;
 100
 101	if (folio_test_writeback(folio))
 102		*writeback = true;
 103
 104	bh = head;
 105	do {
 106		if (buffer_locked(bh))
 107			*writeback = true;
 108
 109		if (buffer_dirty(bh))
 110			*dirty = true;
 111
 112		bh = bh->b_this_page;
 113	} while (bh != head);
 114}
 115
/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 *
 * The wait is on the BH_Lock bit of bh->b_state, in TASK_UNINTERRUPTIBLE
 * state, via wait_on_bit_io(); its return value is not examined.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
 126
 127static void buffer_io_error(struct buffer_head *bh, char *msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 128{
 129	if (!test_bit(BH_Quiet, &bh->b_state))
 130		printk_ratelimited(KERN_ERR
 131			"Buffer I/O error on dev %pg, logical block %llu%s\n",
 132			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 133}
 134
 135/*
 136 * End-of-IO handler helper function which does not touch the bh after
 137 * unlocking it.
 138 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 139 * a race there is benign: unlock_buffer() only use the bh's address for
 140 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 141 * itself.
 142 */
 143static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 144{
 145	if (uptodate) {
 146		set_buffer_uptodate(bh);
 147	} else {
 148		/* This happens, due to failed read-ahead attempts. */
 149		clear_buffer_uptodate(bh);
 150	}
 151	unlock_buffer(bh);
 152}
 153
/*
 * Default synchronous end-of-IO handler for reads.  Records @uptodate in
 * the buffer's uptodate bit and unlocks the buffer (both done by
 * __end_buffer_read_notouch()), then drops one reference with put_bh().
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
 164
 165void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 166{
 
 
 167	if (uptodate) {
 168		set_buffer_uptodate(bh);
 169	} else {
 170		buffer_io_error(bh, ", lost sync page write");
 171		mark_buffer_write_io_error(bh);
 
 
 
 
 
 172		clear_buffer_uptodate(bh);
 173	}
 174	unlock_buffer(bh);
 175	put_bh(bh);
 176}
 177EXPORT_SYMBOL(end_buffer_write_sync);
 178
 179/*
 180 * Various filesystems appear to want __find_get_block to be non-blocking.
 181 * But it's the page lock which protects the buffers.  To get around this,
 182 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 183 * i_private_lock.
 184 *
 185 * Hack idea: for the blockdev mapping, i_private_lock contention
 186 * may be quite high.  This code could TryLock the page, and if that
 187 * succeeds, there is no need to take i_private_lock.
 
 188 */
 189static struct buffer_head *
 190__find_get_block_slow(struct block_device *bdev, sector_t block)
 191{
 192	struct address_space *bd_mapping = bdev->bd_mapping;
 193	const int blkbits = bd_mapping->host->i_blkbits;
 194	struct buffer_head *ret = NULL;
 195	pgoff_t index;
 196	struct buffer_head *bh;
 197	struct buffer_head *head;
 198	struct folio *folio;
 199	int all_mapped = 1;
 200	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 201
 202	index = ((loff_t)block << blkbits) / PAGE_SIZE;
 203	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
 204	if (IS_ERR(folio))
 205		goto out;
 206
 207	spin_lock(&bd_mapping->i_private_lock);
 208	head = folio_buffers(folio);
 209	if (!head)
 210		goto out_unlock;
 
 211	bh = head;
 212	do {
 213		if (!buffer_mapped(bh))
 214			all_mapped = 0;
 215		else if (bh->b_blocknr == block) {
 216			ret = bh;
 217			get_bh(bh);
 218			goto out_unlock;
 219		}
 220		bh = bh->b_this_page;
 221	} while (bh != head);
 222
 223	/* we might be here because some of the buffers on this page are
 224	 * not mapped.  This is due to various races between
 225	 * file io on the block device and getblk.  It gets dealt with
 226	 * elsewhere, don't buffer_error if we had some unmapped buffers
 227	 */
 228	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
 229	if (all_mapped && __ratelimit(&last_warned)) {
 230		printk("__find_get_block_slow() failed. block=%llu, "
 231		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
 232		       "device %pg blocksize: %d\n",
 233		       (unsigned long long)block,
 234		       (unsigned long long)bh->b_blocknr,
 235		       bh->b_state, bh->b_size, bdev,
 236		       1 << blkbits);
 237	}
 238out_unlock:
 239	spin_unlock(&bd_mapping->i_private_lock);
 240	folio_put(folio);
 241out:
 242	return ret;
 243}
 244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Finish an async read on one buffer of a folio.
 *
 * @bh must have BH_Async_Read set (BUG otherwise).  The per-buffer result
 * is recorded in the uptodate bit; a failure is also logged.  Then, under
 * the b_uptodate_lock of the folio's first buffer, BH_Async_Read is
 * cleared, the buffer is unlocked and the ring of buffers starting at @bh
 * is scanned.  If a buffer is found that still has BH_Async_Read set the
 * function just returns; otherwise folio_end_read() is called with the
 * combined uptodate state of the buffers.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;
	int folio_uptodate = 1;	/* cleared if any scanned buffer is !uptodate */

	BUG_ON(!buffer_async_read(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		buffer_io_error(bh, ", async page read");
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			folio_uptodate = 0;
		if (buffer_async_read(tmp)) {
			/* A buffer still under async read must be locked. */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

	folio_end_read(folio, folio_uptodate);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	return;
}
 291
/*
 * Context for deferring post-read processing (decryption and/or verity
 * verification) of a buffer to a workqueue.  Allocated in
 * end_buffer_async_read_io() and freed by the work function that finishes
 * the read.
 */
struct postprocess_bh_ctx {
	struct work_struct work;	/* work item; reinitialised per stage */
	struct buffer_head *bh;		/* buffer being post-processed */
};
 296
 297static void verify_bh(struct work_struct *work)
 298{
 299	struct postprocess_bh_ctx *ctx =
 300		container_of(work, struct postprocess_bh_ctx, work);
 301	struct buffer_head *bh = ctx->bh;
 302	bool valid;
 303
 304	valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
 305	end_buffer_async_read(bh, valid);
 306	kfree(ctx);
 307}
 308
 309static bool need_fsverity(struct buffer_head *bh)
 310{
 311	struct folio *folio = bh->b_folio;
 312	struct inode *inode = folio->mapping->host;
 313
 314	return fsverity_active(inode) &&
 315		/* needed by ext4 */
 316		folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
 317}
 318
 319static void decrypt_bh(struct work_struct *work)
 320{
 321	struct postprocess_bh_ctx *ctx =
 322		container_of(work, struct postprocess_bh_ctx, work);
 323	struct buffer_head *bh = ctx->bh;
 324	int err;
 325
 326	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
 327					       bh_offset(bh));
 328	if (err == 0 && need_fsverity(bh)) {
 329		/*
 330		 * We use different work queues for decryption and for verity
 331		 * because verity may require reading metadata pages that need
 332		 * decryption, and we shouldn't recurse to the same workqueue.
 333		 */
 334		INIT_WORK(&ctx->work, verify_bh);
 335		fsverity_enqueue_verify_work(&ctx->work);
 336		return;
 337	}
 338	end_buffer_async_read(bh, err == 0);
 339	kfree(ctx);
 340}
 341
 342/*
 343 * I/O completion handler for block_read_full_folio() - pages
 344 * which come unlocked at the end of I/O.
 345 */
 346static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 347{
 348	struct inode *inode = bh->b_folio->mapping->host;
 349	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
 350	bool verify = need_fsverity(bh);
 351
 352	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
 353	if (uptodate && (decrypt || verify)) {
 354		struct postprocess_bh_ctx *ctx =
 355			kmalloc(sizeof(*ctx), GFP_ATOMIC);
 356
 357		if (ctx) {
 358			ctx->bh = bh;
 359			if (decrypt) {
 360				INIT_WORK(&ctx->work, decrypt_bh);
 361				fscrypt_enqueue_decrypt_work(&ctx->work);
 362			} else {
 363				INIT_WORK(&ctx->work, verify_bh);
 364				fsverity_enqueue_verify_work(&ctx->work);
 365			}
 366			return;
 367		}
 368		uptodate = 0;
 369	}
 370	end_buffer_async_read(bh, uptodate);
 371}
 372
/*
 * Completion handler for block_write_full_folio() - folios which are unlocked
 * during I/O, and which have the writeback flag cleared upon I/O completion.
 *
 * @bh must have BH_Async_Write set (BUG otherwise).  A failed write is
 * logged, recorded with mark_buffer_write_io_error() and the uptodate bit
 * is cleared.  Then, under the b_uptodate_lock of the folio's first
 * buffer, BH_Async_Write is cleared, the buffer is unlocked and the other
 * buffers on the folio are scanned.  folio_end_writeback() is called only
 * when none of them still has BH_Async_Write set.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;

	BUG_ON(!buffer_async_write(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost async page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}

	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	/* Scan every other buffer in the ring; @bh itself was just cleared. */
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			/* A buffer still under async write must be locked. */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	folio_end_writeback(folio);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	return;
}
 
 416
/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads on any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 *
 * mark_buffer_async_read() itself installs end_buffer_async_read_io() as
 * the buffer's completion handler and sets BH_Async_Read.
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read_io;
	set_buffer_async_read(bh);
}
 443
/*
 * Install @handler as the completion handler of @bh and set
 * BH_Async_Write on it.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}
 450
/*
 * Mark @bh as being under async write, with end_buffer_async_write() as
 * its completion handler.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);
 456
 457
 458/*
 459 * fs/buffer.c contains helper functions for buffer-backed address space's
 460 * fsync functions.  A common requirement for buffer-based filesystems is
 461 * that certain data from the backing blockdev needs to be written out for
 462 * a successful fsync().  For example, ext2 indirect blocks need to be
 463 * written back and waited upon before fsync() returns.
 464 *
 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 468 *
 469 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 470 * from their controlling inode's queue when they are being freed.  But
 471 * try_to_free_buffers() will be operating against the *blockdev* mapping
 472 * at the time, not against the S_ISREG file which depends on those buffers.
 473 * So the locking for i_private_list is via the i_private_lock in the address_space
 474 * which backs the buffers.  Which is different from the address_space 
 475 * against which the buffers are listed.  So for a particular address_space,
 476 * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
 477 * mapping->i_private_list will always be protected by the backing blockdev's
 478 * ->i_private_lock.
 479 *
 480 * Which introduces a requirement: all buffers on an address_space's
 481 * ->i_private_list must be from the same address_space: the blockdev's.
 482 *
 483 * address_spaces which do not place buffers at ->i_private_list via these
 484 * utility functions are free to use i_private_lock and i_private_list for
 485 * whatever they want.  The only requirement is that list_empty(i_private_list)
 486 * be true at clear_inode() time.
 487 *
 488 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 489 * filesystems should do that.  invalidate_inode_buffers() should just go
 490 * BUG_ON(!list_empty).
 491 *
 492 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 493 * take an address_space, not an inode.  And it should be called
 494 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 495 * queued up.
 496 *
 497 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 498 * list if it is already on a list.  Because if the buffer is on a list,
 499 * it *must* already be on the right one.  If not, the filesystem is being
 500 * silly.  This will save a ton of locking.  But first we have to ensure
 501 * that buffers are taken *off* the old inode's list when they are freed
 502 * (presumably in truncate).  That requires careful auditing of all
 503 * filesystems (do it inside bforget()).  It could also be done by bringing
 504 * b_inode back.
 505 */
 506
/*
 * Take @bh off whatever association list it is on and clear its
 * ->b_assoc_map.  Warns if the buffer had no ->b_assoc_map set.
 *
 * The buffer's backing address_space's i_private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	bh->b_assoc_map = NULL;
}
 516
 517int inode_has_buffers(struct inode *inode)
 518{
 519	return !list_empty(&inode->i_data.i_private_list);
 520}
 521
 522/*
 523 * osync is designed to support O_SYNC io.  It waits synchronously for
 524 * all already-submitted IO to complete, but does not queue any new
 525 * writes to the disk.
 526 *
 527 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 528 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 529 * completion.  Any other dirty buffers which are not yet queued for
 530 * write will not be flushed to disk by the osync.
 531 */
 532static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 533{
 534	struct buffer_head *bh;
 535	struct list_head *p;
 536	int err = 0;
 537
 538	spin_lock(lock);
 539repeat:
 540	list_for_each_prev(p, list) {
 541		bh = BH_ENTRY(p);
 542		if (buffer_locked(bh)) {
 543			get_bh(bh);
 544			spin_unlock(lock);
 545			wait_on_buffer(bh);
 546			if (!buffer_uptodate(bh))
 547				err = -EIO;
 548			brelse(bh);
 549			spin_lock(lock);
 550			goto repeat;
 551		}
 552	}
 553	spin_unlock(lock);
 554	return err;
 555}
 556
 557/**
 558 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 559 * @mapping: the mapping which wants those buffers written
 560 *
 561 * Starts I/O against the buffers at mapping->i_private_list, and waits upon
 562 * that I/O.
 563 *
 564 * Basically, this is a convenience function for fsync().
 565 * @mapping is a file or directory which needs those buffers to be written for
 566 * a successful fsync().
 567 */
 568int sync_mapping_buffers(struct address_space *mapping)
 569{
 570	struct address_space *buffer_mapping = mapping->i_private_data;
 571
 572	if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
 573		return 0;
 
 574
 575	return fsync_buffers_list(&buffer_mapping->i_private_lock,
 576					&mapping->i_private_list);
 
 
 
 577}
 578EXPORT_SYMBOL(sync_mapping_buffers);
 579
 580/**
 581 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 582 * for simple filesystems with no inode lock
 583 *
 584 * @file:	file to synchronize
 585 * @start:	start offset in bytes
 586 * @end:	end offset in bytes (inclusive)
 587 * @datasync:	only synchronize essential metadata if true
 588 *
 589 * This is a generic implementation of the fsync method for simple
 590 * filesystems which track all non-inode metadata in the buffers list
 591 * hanging off the address_space structure.
 592 */
 593int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
 594				  bool datasync)
 595{
 596	struct inode *inode = file->f_mapping->host;
 597	int err;
 598	int ret;
 599
 600	err = file_write_and_wait_range(file, start, end);
 601	if (err)
 602		return err;
 603
 604	ret = sync_mapping_buffers(inode->i_mapping);
 605	if (!(inode->i_state & I_DIRTY_ALL))
 606		goto out;
 607	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 608		goto out;
 609
 610	err = sync_inode_metadata(inode, 1);
 611	if (ret == 0)
 612		ret = err;
 613
 614out:
 615	/* check and advance again to catch errors after syncing out buffers */
 616	err = file_check_and_advance_wb_err(file);
 617	if (ret == 0)
 618		ret = err;
 619	return ret;
 620}
 621EXPORT_SYMBOL(generic_buffers_fsync_noflush);
 622
 623/**
 624 * generic_buffers_fsync - generic buffer fsync implementation
 625 * for simple filesystems with no inode lock
 
 
 
 626 *
 627 * @file:	file to synchronize
 628 * @start:	start offset in bytes
 629 * @end:	end offset in bytes (inclusive)
 630 * @datasync:	only synchronize essential metadata if true
 631 *
 632 * This is a generic implementation of the fsync method for simple
 633 * filesystems which track all non-inode metadata in the buffers list
 634 * hanging off the address_space structure. This also makes sure that
 635 * a device cache flush operation is called at the end.
 636 */
 637int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
 638			  bool datasync)
 639{
 640	struct inode *inode = file->f_mapping->host;
 641	int ret;
 642
 643	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
 644	if (!ret)
 645		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 646	return ret;
 
 647}
 648EXPORT_SYMBOL(generic_buffers_fsync);
 649
 650/*
 651 * Called when we've recently written block `bblock', and it is known that
 652 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 653 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 654 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 655 */
 656void write_boundary_block(struct block_device *bdev,
 657			sector_t bblock, unsigned blocksize)
 658{
 659	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 660	if (bh) {
 661		if (buffer_dirty(bh))
 662			write_dirty_buffer(bh, 0);
 663		put_bh(bh);
 664	}
 665}
 666
 667void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 668{
 669	struct address_space *mapping = inode->i_mapping;
 670	struct address_space *buffer_mapping = bh->b_folio->mapping;
 671
 672	mark_buffer_dirty(bh);
 673	if (!mapping->i_private_data) {
 674		mapping->i_private_data = buffer_mapping;
 675	} else {
 676		BUG_ON(mapping->i_private_data != buffer_mapping);
 677	}
 678	if (!bh->b_assoc_map) {
 679		spin_lock(&buffer_mapping->i_private_lock);
 680		list_move_tail(&bh->b_assoc_buffers,
 681				&mapping->i_private_list);
 682		bh->b_assoc_map = mapping;
 683		spin_unlock(&buffer_mapping->i_private_lock);
 684	}
 685}
 686EXPORT_SYMBOL(mark_buffer_dirty_inode);
 687
 688/**
 689 * block_dirty_folio - Mark a folio as dirty.
 690 * @mapping: The address space containing this folio.
 691 * @folio: The folio to mark dirty.
 692 *
 693 * Filesystems which use buffer_heads can use this function as their
 694 * ->dirty_folio implementation.  Some filesystems need to do a little
 695 * work before calling this function.  Filesystems which do not use
 696 * buffer_heads should call filemap_dirty_folio() instead.
 697 *
 698 * If the folio has buffers, the uptodate buffers are set dirty, to
 699 * preserve dirty-state coherency between the folio and the buffers.
 700 * Buffers added to a dirty folio are created dirty.
 701 *
 702 * The buffers are dirtied before the folio is dirtied.  There's a small
 703 * race window in which writeback may see the folio cleanness but not the
 704 * buffer dirtiness.  That's fine.  If this code were to set the folio
 705 * dirty before the buffers, writeback could clear the folio dirty flag,
 706 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
 707 * folio on the dirty folio list.
 708 *
 709 * We use i_private_lock to lock against try_to_free_buffers() while
 710 * using the folio's buffer list.  This also prevents clean buffers
 711 * being added to the folio after it was set dirty.
 712 *
 713 * Context: May only be called from process context.  Does not sleep.
 714 * Caller must ensure that @folio cannot be truncated during this call,
 715 * typically by holding the folio lock or having a page in the folio
 716 * mapped and holding the page table lock.
 717 *
 718 * Return: True if the folio was dirtied; false if it was already dirtied.
 
 719 */
 720bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
 
 721{
 722	struct buffer_head *head;
 723	bool newly_dirty;
 
 
 
 
 
 
 
 
 724
 725	spin_lock(&mapping->i_private_lock);
 726	head = folio_buffers(folio);
 727	if (head) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 728		struct buffer_head *bh = head;
 729
 730		do {
 731			set_buffer_dirty(bh);
 732			bh = bh->b_this_page;
 733		} while (bh != head);
 734	}
 735	/*
 736	 * Lock out page's memcg migration to keep PageDirty
 737	 * synchronized with per-memcg dirty page counters.
 738	 */
 739	newly_dirty = !folio_test_set_dirty(folio);
 740	spin_unlock(&mapping->i_private_lock);
 741
 742	if (newly_dirty)
 743		__folio_mark_dirty(folio, mapping, 1);
 744
 745	if (newly_dirty)
 746		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 747
 748	return newly_dirty;
 749}
 750EXPORT_SYMBOL(block_dirty_folio);
 751
/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 *
 * @lock protects @list and is dropped around every call that submits or
 * waits for I/O.  Returns -EIO if a buffer waited upon in the second
 * stage is not uptodate, otherwise the result of osync_buffers_list().
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct address_space *mapping;
	int err = 0, err2;
	struct blk_plug plug;
	LIST_HEAD(tmp);

	blk_start_plug(&plug);

	/* Stage 1: drain @list; dirty or locked buffers move to @tmp. */
	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				/* Pin the buffer while the lock is dropped. */
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, REQ_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
				 */
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	spin_unlock(lock);
	blk_finish_plug(&plug);
	spin_lock(lock);

	/* Stage 2: wait for each buffer on @tmp, taken from the tail. */
	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			/* Dirtied again meanwhile: put it back on its list. */
			list_add(&bh->b_assoc_buffers,
				 &mapping->i_private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	/* Final pass: wait for buffers that were refiled and are locked. */
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}
 848
 849/*
 850 * Invalidate any and all dirty buffers on a given inode.  We are
 851 * probably unmounting the fs, but that doesn't mean we have already
 852 * done a sync().  Just drop the buffers from the inode list.
 853 *
 854 * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
 855 * assumes that all the buffers are against the blockdev.
 
 856 */
 857void invalidate_inode_buffers(struct inode *inode)
 858{
 859	if (inode_has_buffers(inode)) {
 860		struct address_space *mapping = &inode->i_data;
 861		struct list_head *list = &mapping->i_private_list;
 862		struct address_space *buffer_mapping = mapping->i_private_data;
 863
 864		spin_lock(&buffer_mapping->i_private_lock);
 865		while (!list_empty(list))
 866			__remove_assoc_queue(BH_ENTRY(list->next));
 867		spin_unlock(&buffer_mapping->i_private_lock);
 868	}
 869}
 870EXPORT_SYMBOL(invalidate_inode_buffers);
 871
 872/*
 873 * Remove any clean buffers from the inode's buffer list.  This is called
 874 * when we're trying to free the inode itself.  Those buffers can pin it.
 875 *
 876 * Returns true if all buffers were removed.
 877 */
 878int remove_inode_buffers(struct inode *inode)
 879{
 880	int ret = 1;
 881
 882	if (inode_has_buffers(inode)) {
 883		struct address_space *mapping = &inode->i_data;
 884		struct list_head *list = &mapping->i_private_list;
 885		struct address_space *buffer_mapping = mapping->i_private_data;
 886
 887		spin_lock(&buffer_mapping->i_private_lock);
 888		while (!list_empty(list)) {
 889			struct buffer_head *bh = BH_ENTRY(list->next);
 890			if (buffer_dirty(bh)) {
 891				ret = 0;
 892				break;
 893			}
 894			__remove_assoc_queue(bh);
 895		}
 896		spin_unlock(&buffer_mapping->i_private_lock);
 897	}
 898	return ret;
 899}
 900
 901/*
 902 * Create the appropriate buffers when given a folio for data area and
 903 * the size of each buffer.. Use the bh->b_this_page linked list to
 904 * follow the buffers created.  Return NULL if unable to create more
 905 * buffers.
 906 *
 907 * The retry flag is used to differentiate async IO (paging, swapping)
 908 * which may not fail from ordinary buffer allocations.
 909 */
 910struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 911					gfp_t gfp)
 912{
 913	struct buffer_head *bh, *head;
 914	long offset;
 915	struct mem_cgroup *memcg, *old_memcg;
 916
 917	/* The folio lock pins the memcg */
 918	memcg = folio_memcg(folio);
 919	old_memcg = set_active_memcg(memcg);
 920
 
 921	head = NULL;
 922	offset = folio_size(folio);
 923	while ((offset -= size) >= 0) {
 924		bh = alloc_buffer_head(gfp);
 925		if (!bh)
 926			goto no_grow;
 927
 
 928		bh->b_this_page = head;
 929		bh->b_blocknr = -1;
 930		head = bh;
 931
 
 
 932		bh->b_size = size;
 933
 934		/* Link the buffer to its folio */
 935		folio_set_bh(bh, folio, offset);
 
 
 936	}
 937out:
 938	set_active_memcg(old_memcg);
 939	return head;
 940/*
 941 * In case anything failed, we just free everything we got.
 942 */
 943no_grow:
 944	if (head) {
 945		do {
 946			bh = head;
 947			head = head->b_this_page;
 948			free_buffer_head(bh);
 949		} while (head);
 950	}
 951
 952	goto out;
 953}
 954EXPORT_SYMBOL_GPL(folio_alloc_buffers);
 955
 956struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
 957{
 958	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 
 959
 960	return folio_alloc_buffers(page_folio(page), size, gfp);
 
 
 
 
 
 
 
 961}
 962EXPORT_SYMBOL_GPL(alloc_page_buffers);
 963
 964static inline void link_dev_buffers(struct folio *folio,
 965		struct buffer_head *head)
 966{
 967	struct buffer_head *bh, *tail;
 968
 969	bh = head;
 970	do {
 971		tail = bh;
 972		bh = bh->b_this_page;
 973	} while (bh);
 974	tail->b_this_page = head;
 975	folio_attach_private(folio, head);
 976}
 977
 978static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 979{
 980	sector_t retval = ~((sector_t)0);
 981	loff_t sz = bdev_nr_bytes(bdev);
 982
 983	if (sz) {
 984		unsigned int sizebits = blksize_bits(size);
 985		retval = (sz >> sizebits);
 986	}
 987	return retval;
 988}
 989
 990/*
 991 * Initialise the state of a blockdev folio's buffers.
 992 */ 
 993static sector_t folio_init_buffers(struct folio *folio,
 994		struct block_device *bdev, unsigned size)
 
 995{
 996	struct buffer_head *head = folio_buffers(folio);
 997	struct buffer_head *bh = head;
 998	bool uptodate = folio_test_uptodate(folio);
 999	sector_t block = div_u64(folio_pos(folio), size);
1000	sector_t end_block = blkdev_max_block(bdev, size);
1001
1002	do {
1003		if (!buffer_mapped(bh)) {
1004			bh->b_end_io = NULL;
1005			bh->b_private = NULL;
1006			bh->b_bdev = bdev;
1007			bh->b_blocknr = block;
1008			if (uptodate)
1009				set_buffer_uptodate(bh);
1010			if (block < end_block)
1011				set_buffer_mapped(bh);
1012		}
1013		block++;
1014		bh = bh->b_this_page;
1015	} while (bh != head);
1016
1017	/*
1018	 * Caller needs to validate requested block against end of device.
1019	 */
1020	return end_block;
1021}
1022
1023/*
1024 * Create the page-cache folio that contains the requested block.
1025 *
1026 * This is used purely for blockdev mappings.
1027 *
1028 * Returns false if we have a failure which cannot be cured by retrying
1029 * without sleeping.  Returns true if we succeeded, or the caller should retry.
1030 */
1031static bool grow_dev_folio(struct block_device *bdev, sector_t block,
1032		pgoff_t index, unsigned size, gfp_t gfp)
 
1033{
1034	struct address_space *mapping = bdev->bd_mapping;
1035	struct folio *folio;
1036	struct buffer_head *bh;
1037	sector_t end_block = 0;
1038
1039	folio = __filemap_get_folio(mapping, index,
1040			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
1041	if (IS_ERR(folio))
1042		return false;
1043
1044	bh = folio_buffers(folio);
1045	if (bh) {
1046		if (bh->b_size == size) {
1047			end_block = folio_init_buffers(folio, bdev, size);
1048			goto unlock;
1049		}
1050
1051		/*
1052		 * Retrying may succeed; for example the folio may finish
1053		 * writeback, or buffers may be cleaned.  This should not
1054		 * happen very often; maybe we have old buffers attached to
1055		 * this blockdev's page cache and we're trying to change
1056		 * the block size?
1057		 */
1058		if (!try_to_free_buffers(folio)) {
1059			end_block = ~0ULL;
1060			goto unlock;
1061		}
 
 
1062	}
1063
1064	bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
 
 
 
1065	if (!bh)
1066		goto unlock;
1067
1068	/*
1069	 * Link the folio to the buffers and initialise them.  Take the
1070	 * lock to be atomic wrt __find_get_block(), which does not
1071	 * run under the folio lock.
1072	 */
1073	spin_lock(&mapping->i_private_lock);
1074	link_dev_buffers(folio, bh);
1075	end_block = folio_init_buffers(folio, bdev, size);
1076	spin_unlock(&mapping->i_private_lock);
1077unlock:
1078	folio_unlock(folio);
1079	folio_put(folio);
1080	return block < end_block;
 
 
 
1081}
1082
1083/*
1084 * Create buffers for the specified block device block's folio.  If
1085 * that folio was dirty, the buffers are set dirty also.  Returns false
1086 * if we've hit a permanent error.
1087 */
1088static bool grow_buffers(struct block_device *bdev, sector_t block,
1089		unsigned size, gfp_t gfp)
1090{
1091	loff_t pos;
 
 
 
 
 
 
 
 
 
1092
1093	/*
1094	 * Check for a block which lies outside our maximum possible
1095	 * pagecache index.
1096	 */
1097	if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
1098		printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
 
 
 
1099			__func__, (unsigned long long)block,
1100			bdev);
1101		return false;
1102	}
1103
1104	/* Create a folio with the proper size buffers */
1105	return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
 
 
 
 
 
1106}
1107
1108static struct buffer_head *
1109__getblk_slow(struct block_device *bdev, sector_t block,
1110	     unsigned size, gfp_t gfp)
1111{
1112	/* Size must be multiple of hard sectorsize */
1113	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1114			(size < 512 || size > PAGE_SIZE))) {
1115		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1116					size);
1117		printk(KERN_ERR "logical block size: %d\n",
1118					bdev_logical_block_size(bdev));
1119
1120		dump_stack();
1121		return NULL;
1122	}
1123
1124	for (;;) {
1125		struct buffer_head *bh;
 
1126
1127		bh = __find_get_block(bdev, block, size);
1128		if (bh)
1129			return bh;
1130
1131		if (!grow_buffers(bdev, block, size, gfp))
 
1132			return NULL;
 
 
1133	}
1134}
1135
1136/*
1137 * The relationship between dirty buffers and dirty pages:
1138 *
1139 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1140 * the page is tagged dirty in the page cache.
1141 *
1142 * At all times, the dirtiness of the buffers represents the dirtiness of
1143 * subsections of the page.  If the page has buffers, the page dirty bit is
1144 * merely a hint about the true dirty state.
1145 *
1146 * When a page is set dirty in its entirety, all its buffers are marked dirty
1147 * (if the page has buffers).
1148 *
1149 * When a buffer is marked dirty, its page is dirtied, but the page's other
1150 * buffers are not.
1151 *
1152 * Also.  When blockdev buffers are explicitly read with bread(), they
1153 * individually become uptodate.  But their backing page remains not
1154 * uptodate - even if all of its buffers are uptodate.  A subsequent
1155 * block_read_full_folio() against that folio will discover all the uptodate
1156 * buffers, will set the folio uptodate and will perform no I/O.
1157 */
1158
1159/**
1160 * mark_buffer_dirty - mark a buffer_head as needing writeout
1161 * @bh: the buffer_head to mark dirty
1162 *
1163 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1164 * its backing page dirty, then tag the page as dirty in the page cache
1165 * and then attach the address_space's inode to its superblock's dirty
1166 * inode list.
1167 *
1168 * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
1169 * i_pages lock and mapping->host->i_lock.
1170 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	/* Dirtying a buffer that is not uptodate triggers a one-time warning. */
	WARN_ON_ONCE(!buffer_uptodate(bh));

	trace_block_dirty_buffer(bh);

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	/*
	 * NOTE(review): relies on test_set_buffer_dirty() reporting the
	 * previous state of the dirty bit, so the body runs only for the
	 * caller that actually set it - confirm against buffer_head.h.
	 */
	if (!test_set_buffer_dirty(bh)) {
		struct folio *folio = bh->b_folio;
		struct address_space *mapping = NULL;

		/*
		 * mapping is only filled in when folio_test_set_dirty()
		 * returned false, so the inode is marked dirty below only
		 * in that case and only if the folio has a mapping.
		 */
		if (!folio_test_set_dirty(folio)) {
			mapping = folio->mapping;
			if (mapping)
				__folio_mark_dirty(folio, mapping, 0);
		}
		if (mapping)
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty);
1203
1204void mark_buffer_write_io_error(struct buffer_head *bh)
1205{
1206	set_buffer_write_io_error(bh);
1207	/* FIXME: do we need to set this in both places? */
1208	if (bh->b_folio && bh->b_folio->mapping)
1209		mapping_set_error(bh->b_folio->mapping, -EIO);
1210	if (bh->b_assoc_map) {
1211		mapping_set_error(bh->b_assoc_map, -EIO);
1212		errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
1213	}
1214}
1215EXPORT_SYMBOL(mark_buffer_write_io_error);
1216
1217/**
1218 * __brelse - Release a buffer.
1219 * @bh: The buffer to release.
1220 *
1221 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
1222 */
1223void __brelse(struct buffer_head *bh)
1224{
1225	if (atomic_read(&bh->b_count)) {
1226		put_bh(bh);
1227		return;
1228	}
1229	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1230}
1231EXPORT_SYMBOL(__brelse);
1232
1233/**
1234 * __bforget - Discard any dirty data in a buffer.
1235 * @bh: The buffer to forget.
1236 *
1237 * This variant of bforget() can be called if @bh is guaranteed to not
1238 * be NULL.
1239 */
1240void __bforget(struct buffer_head *bh)
1241{
1242	clear_buffer_dirty(bh);
1243	if (bh->b_assoc_map) {
1244		struct address_space *buffer_mapping = bh->b_folio->mapping;
1245
1246		spin_lock(&buffer_mapping->i_private_lock);
1247		list_del_init(&bh->b_assoc_buffers);
1248		bh->b_assoc_map = NULL;
1249		spin_unlock(&buffer_mapping->i_private_lock);
1250	}
1251	__brelse(bh);
1252}
1253EXPORT_SYMBOL(__bforget);
1254
1255static struct buffer_head *__bread_slow(struct buffer_head *bh)
1256{
1257	lock_buffer(bh);
1258	if (buffer_uptodate(bh)) {
1259		unlock_buffer(bh);
1260		return bh;
1261	} else {
1262		get_bh(bh);
1263		bh->b_end_io = end_buffer_read_sync;
1264		submit_bh(REQ_OP_READ, bh);
1265		wait_on_buffer(bh);
1266		if (buffer_uptodate(bh))
1267			return bh;
1268	}
1269	brelse(bh);
1270	return NULL;
1271}
1272
1273/*
1274 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1275 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1276 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1277 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1278 * CPU's LRUs at the same time.
1279 *
1280 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1281 * sb_find_get_block().
1282 *
1283 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1284 * a local interrupt disable for that.
1285 */
1286
/* Number of buffer_head slots in each CPU's LRU array. */
#define BH_LRU_SIZE	16

/* One CPU's LRU: a fixed-size array of buffer_head pointers. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-CPU LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to the local CPU's LRU.
 * With CONFIG_SMP they disable/enable local interrupts; otherwise they
 * disable/enable preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1302
/*
 * Sanity check used on entry to the LRU helpers: BUG if interrupts are
 * disabled.  The check is compiled in only when irqs_disabled is defined
 * as a preprocessor macro; otherwise this function is empty.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
1309
1310/*
1311 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
1312 * inserted at the front, and the buffer_head at the back if any is evicted.
1313 * Or, if already in the LRU it is moved to the front.
1314 */
1315static void bh_lru_install(struct buffer_head *bh)
1316{
1317	struct buffer_head *evictee = bh;
1318	struct bh_lru *b;
1319	int i;
1320
1321	check_irqs_on();
1322	bh_lru_lock();
 
 
 
 
1323
1324	/*
1325	 * the refcount of buffer_head in bh_lru prevents dropping the
1326	 * attached page(i.e., try_to_free_buffers) so it could cause
1327	 * failing page migration.
1328	 * Skip putting upcoming bh into bh_lru until migration is done.
1329	 */
1330	if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
1331		bh_lru_unlock();
1332		return;
1333	}
1334
1335	b = this_cpu_ptr(&bh_lrus);
1336	for (i = 0; i < BH_LRU_SIZE; i++) {
1337		swap(evictee, b->bhs[i]);
1338		if (evictee == bh) {
1339			bh_lru_unlock();
1340			return;
 
 
 
 
1341		}
 
 
 
1342	}
1343
1344	get_bh(bh);
1345	bh_lru_unlock();
1346	brelse(evictee);
 
 
1347}
1348
1349/*
1350 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1351 */
1352static struct buffer_head *
1353lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1354{
1355	struct buffer_head *ret = NULL;
1356	unsigned int i;
1357
1358	check_irqs_on();
1359	bh_lru_lock();
1360	if (cpu_is_isolated(smp_processor_id())) {
1361		bh_lru_unlock();
1362		return NULL;
1363	}
1364	for (i = 0; i < BH_LRU_SIZE; i++) {
1365		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1366
1367		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1368		    bh->b_size == size) {
1369			if (i) {
1370				while (i) {
1371					__this_cpu_write(bh_lrus.bhs[i],
1372						__this_cpu_read(bh_lrus.bhs[i - 1]));
1373					i--;
1374				}
1375				__this_cpu_write(bh_lrus.bhs[0], bh);
1376			}
1377			get_bh(bh);
1378			ret = bh;
1379			break;
1380		}
1381	}
1382	bh_lru_unlock();
1383	return ret;
1384}
1385
1386/*
1387 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1388 * it in the LRU and mark it as accessed.  If it is not present then return
1389 * NULL
1390 */
1391struct buffer_head *
1392__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1393{
1394	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1395
1396	if (bh == NULL) {
1397		/* __find_get_block_slow will mark the page accessed */
1398		bh = __find_get_block_slow(bdev, block);
1399		if (bh)
1400			bh_lru_install(bh);
1401	} else
 
1402		touch_buffer(bh);
1403
1404	return bh;
1405}
1406EXPORT_SYMBOL(__find_get_block);
1407
1408/**
1409 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
1410 * @bdev: The block device.
1411 * @block: The block number.
1412 * @size: The size of buffer_heads for this @bdev.
1413 * @gfp: The memory allocation flags to use.
1414 *
1415 * The returned buffer head has its reference count incremented, but is
1416 * not locked.  The caller should call brelse() when it has finished
1417 * with the buffer.  The buffer may not be uptodate.  If needed, the
1418 * caller can bring it uptodate either by reading it or overwriting it.
1419 *
1420 * Return: The buffer head, or NULL if memory could not be allocated.
 
1421 */
1422struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
1423		unsigned size, gfp_t gfp)
1424{
1425	struct buffer_head *bh = __find_get_block(bdev, block, size);
1426
1427	might_alloc(gfp);
1428	if (bh)
1429		return bh;
1430
1431	return __getblk_slow(bdev, block, size, gfp);
1432}
1433EXPORT_SYMBOL(bdev_getblk);
1434
1435/*
 * Do async read-ahead on a buffer.
1437 */
1438void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1439{
1440	struct buffer_head *bh = bdev_getblk(bdev, block, size,
1441			GFP_NOWAIT | __GFP_MOVABLE);
1442
1443	if (likely(bh)) {
1444		bh_readahead(bh, REQ_RAHEAD);
1445		brelse(bh);
1446	}
1447}
1448EXPORT_SYMBOL(__breadahead);
1449
1450/**
1451 * __bread_gfp() - Read a block.
1452 * @bdev: The block device to read from.
1453 * @block: Block number in units of block size.
1454 * @size: The block size of this device in bytes.
1455 * @gfp: Not page allocation flags; see below.
1456 *
1457 * You are not expected to call this function.  You should use one of
1458 * sb_bread(), sb_bread_unmovable() or __bread().
1459 *
1460 * Read a specified block, and return the buffer head that refers to it.
1461 * If @gfp is 0, the memory will be allocated using the block device's
1462 * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
1463 * allocated from a movable area.  Do not pass in a complete set of
1464 * GFP flags.
1465 *
1466 * The returned buffer head has its refcount increased.  The caller should
1467 * call brelse() when it has finished with the buffer.
1468 *
1469 * Context: May sleep waiting for I/O.
1470 * Return: NULL if the block was unreadable.
1471 */
1472struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
1473		unsigned size, gfp_t gfp)
1474{
1475	struct buffer_head *bh;
1476
1477	gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
1478
1479	/*
1480	 * Prefer looping in the allocator rather than here, at least that
1481	 * code knows what it's doing.
1482	 */
1483	gfp |= __GFP_NOFAIL;
1484
1485	bh = bdev_getblk(bdev, block, size, gfp);
1486
1487	if (likely(bh) && !buffer_uptodate(bh))
1488		bh = __bread_slow(bh);
1489	return bh;
1490}
1491EXPORT_SYMBOL(__bread_gfp);
1492
1493static void __invalidate_bh_lrus(struct bh_lru *b)
1494{
1495	int i;
1496
1497	for (i = 0; i < BH_LRU_SIZE; i++) {
1498		brelse(b->bhs[i]);
1499		b->bhs[i] = NULL;
1500	}
1501}
1502/*
1503 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1504 * This doesn't race because it runs in each cpu either in irq
1505 * or with preempt disabled.
1506 */
1507static void invalidate_bh_lru(void *arg)
1508{
1509	struct bh_lru *b = &get_cpu_var(bh_lrus);
1510
1511	__invalidate_bh_lrus(b);
1512	put_cpu_var(bh_lrus);
1513}
1514
1515bool has_bh_in_lru(int cpu, void *dummy)
1516{
1517	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1518	int i;
1519	
1520	for (i = 0; i < BH_LRU_SIZE; i++) {
1521		if (b->bhs[i])
1522			return true;
1523	}
1524
1525	return false;
1526}
1527
/*
 * Empty the buffer LRUs: invalidate_bh_lru() is run on each CPU for which
 * has_bh_in_lru() returns true.
 * NOTE(review): the trailing 1 is presumably the "wait for completion"
 * argument of on_each_cpu_cond() - confirm against its declaration.
 */
void invalidate_bh_lrus(void)
{
	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1533
1534/*
1535 * It's called from workqueue context so we need a bh_lru_lock to close
1536 * the race with preemption/irq.
1537 */
1538void invalidate_bh_lrus_cpu(void)
1539{
1540	struct bh_lru *b;
1541
1542	bh_lru_lock();
1543	b = this_cpu_ptr(&bh_lrus);
1544	__invalidate_bh_lrus(b);
1545	bh_lru_unlock();
1546}
1547
1548void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1549		  unsigned long offset)
1550{
1551	bh->b_folio = folio;
1552	BUG_ON(offset >= folio_size(folio));
1553	if (folio_test_highmem(folio))
1554		/*
1555		 * This catches illegal uses and preserves the offset:
1556		 */
1557		bh->b_data = (char *)(0 + offset);
1558	else
1559		bh->b_data = folio_address(folio) + offset;
1560}
1561EXPORT_SYMBOL(folio_set_bh);
1562
1563/*
1564 * Called when truncating a buffer on a page completely.
1565 */
1566
1567/* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
	 1 << BH_Delay | 1 << BH_Unwritten)

/*
 * Invalidate one buffer: under the buffer lock, clear its dirty bit,
 * drop its block device pointer and clear every BUFFER_FLAGS_DISCARD bit
 * from b_state.
 */
static void discard_buffer(struct buffer_head * bh)
{
	unsigned long b_state;

	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	b_state = READ_ONCE(bh->b_state);
	/*
	 * Clear the discard bits with a compare-and-exchange so that other
	 * bits changed concurrently are not overwritten.  The loop body is
	 * empty; NOTE(review): this relies on try_cmpxchg() refreshing
	 * b_state with the current value when it fails - confirm.
	 */
	do {
	} while (!try_cmpxchg(&bh->b_state, &b_state,
			      b_state & ~BUFFER_FLAGS_DISCARD));
	unlock_buffer(bh);
}
1585
1586/**
1587 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1588 * @folio: The folio which is affected.
1589 * @offset: start of the range to invalidate
1590 * @length: length of the range to invalidate
1591 *
1592 * block_invalidate_folio() is called when all or part of the folio has been
1593 * invalidated by a truncate operation.
1594 *
1595 * block_invalidate_folio() does not have to release all buffers, but it must
 
 
 
1596 * ensure that no dirty buffer is left outside @offset and that no I/O
1597 * is underway against any of the blocks which are outside the truncation
1598 * point.  Because the caller is about to free (and possibly reuse) those
1599 * blocks on-disk.
1600 */
1601void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1602{
1603	struct buffer_head *head, *bh, *next;
1604	size_t curr_off = 0;
1605	size_t stop = length + offset;
1606
1607	BUG_ON(!folio_test_locked(folio));
1608
1609	/*
1610	 * Check for overflow
1611	 */
1612	BUG_ON(stop > folio_size(folio) || stop < length);
1613
1614	head = folio_buffers(folio);
1615	if (!head)
1616		return;
1617
 
1618	bh = head;
1619	do {
1620		size_t next_off = curr_off + bh->b_size;
1621		next = bh->b_this_page;
1622
1623		/*
1624		 * Are we still fully in range ?
1625		 */
1626		if (next_off > stop)
1627			goto out;
1628
1629		/*
1630		 * is this block fully invalidated?
1631		 */
1632		if (offset <= curr_off)
1633			discard_buffer(bh);
1634		curr_off = next_off;
1635		bh = next;
1636	} while (bh != head);
1637
1638	/*
1639	 * We release buffers only if the entire folio is being invalidated.
1640	 * The get_block cached value has been unconditionally invalidated,
1641	 * so real IO is not possible anymore.
1642	 */
1643	if (length == folio_size(folio))
1644		filemap_release_folio(folio, 0);
1645out:
1646	folio_clear_mappedtodisk(folio);
1647	return;
1648}
1649EXPORT_SYMBOL(block_invalidate_folio);
1650
1651/*
1652 * We attach and possibly dirty the buffers atomically wrt
1653 * block_dirty_folio() via i_private_lock.  try_to_free_buffers
1654 * is already excluded via the folio lock.
1655 */
1656struct buffer_head *create_empty_buffers(struct folio *folio,
1657		unsigned long blocksize, unsigned long b_state)
1658{
1659	struct buffer_head *bh, *head, *tail;
1660	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
1661
1662	head = folio_alloc_buffers(folio, blocksize, gfp);
1663	bh = head;
1664	do {
1665		bh->b_state |= b_state;
1666		tail = bh;
1667		bh = bh->b_this_page;
1668	} while (bh);
1669	tail->b_this_page = head;
1670
1671	spin_lock(&folio->mapping->i_private_lock);
1672	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
1673		bh = head;
1674		do {
1675			if (folio_test_dirty(folio))
1676				set_buffer_dirty(bh);
1677			if (folio_test_uptodate(folio))
1678				set_buffer_uptodate(bh);
1679			bh = bh->b_this_page;
1680		} while (bh != head);
1681	}
1682	folio_attach_private(folio, head);
1683	spin_unlock(&folio->mapping->i_private_lock);
1684
1685	return head;
1686}
1687EXPORT_SYMBOL(create_empty_buffers);
1688
1689/**
1690 * clean_bdev_aliases: clean a range of buffers in block device
1691 * @bdev: Block device to clean buffers in
1692 * @block: Start of a range of blocks to clean
1693 * @len: Number of blocks to clean
1694 *
1695 * We are taking a range of blocks for data and we don't want writeback of any
1696 * buffer-cache aliases starting from return from this function and until the
1697 * moment when something will explicitly mark the buffer dirty (hopefully that
1698 * will not happen until we will free that block ;-) We don't even need to mark
1699 * it not-uptodate - nobody can expect anything from a newly allocated buffer
1700 * anyway. We used to use unmap_buffer() for such invalidation, but that was
1701 * wrong. We definitely don't want to mark the alias unmapped, for example - it
1702 * would confuse anyone who might pick it with bread() afterwards...
1703 *
1704 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
1705 * writeout I/O going on against recently-freed buffers.  We don't wait on that
1706 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1707 * need to.  That happens here.
1708 */
1709void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1710{
1711	struct address_space *bd_mapping = bdev->bd_mapping;
1712	const int blkbits = bd_mapping->host->i_blkbits;
1713	struct folio_batch fbatch;
1714	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
1715	pgoff_t end;
1716	int i, count;
1717	struct buffer_head *bh;
1718	struct buffer_head *head;
1719
1720	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
1721	folio_batch_init(&fbatch);
1722	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1723		count = folio_batch_count(&fbatch);
1724		for (i = 0; i < count; i++) {
1725			struct folio *folio = fbatch.folios[i];
1726
1727			if (!folio_buffers(folio))
1728				continue;
1729			/*
1730			 * We use folio lock instead of bd_mapping->i_private_lock
1731			 * to pin buffers here since we can afford to sleep and
1732			 * it scales better than a global spinlock lock.
1733			 */
1734			folio_lock(folio);
1735			/* Recheck when the folio is locked which pins bhs */
1736			head = folio_buffers(folio);
1737			if (!head)
1738				goto unlock_page;
1739			bh = head;
1740			do {
1741				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1742					goto next;
1743				if (bh->b_blocknr >= block + len)
1744					break;
1745				clear_buffer_dirty(bh);
1746				wait_on_buffer(bh);
1747				clear_buffer_req(bh);
1748next:
1749				bh = bh->b_this_page;
1750			} while (bh != head);
1751unlock_page:
1752			folio_unlock(folio);
1753		}
1754		folio_batch_release(&fbatch);
1755		cond_resched();
1756		/* End of range already reached? */
1757		if (index > end || !index)
1758			break;
1759	}
1760}
1761EXPORT_SYMBOL(clean_bdev_aliases);
1762
1763static struct buffer_head *folio_create_buffers(struct folio *folio,
1764						struct inode *inode,
1765						unsigned int b_state)
1766{
1767	struct buffer_head *bh;
1768
1769	BUG_ON(!folio_test_locked(folio));
1770
1771	bh = folio_buffers(folio);
1772	if (!bh)
1773		bh = create_empty_buffers(folio,
1774				1 << READ_ONCE(inode->i_blkbits), b_state);
1775	return bh;
1776}
1777
1778/*
1779 * NOTE! All mapped/uptodate combinations are valid:
1780 *
1781 *	Mapped	Uptodate	Meaning
1782 *
1783 *	No	No		"unknown" - must do get_block()
1784 *	No	Yes		"hole" - zero-filled
1785 *	Yes	No		"allocated" - allocated on disk, not read in
1786 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1787 *
1788 * "Dirty" is valid only with the last case (mapped+uptodate).
1789 */
1790
1791/*
1792 * While block_write_full_folio is writing back the dirty buffers under
1793 * the page lock, whoever dirtied the buffers may decide to clean them
1794 * again at any time.  We handle that by only looking at the buffer
1795 * state inside lock_buffer().
1796 *
1797 * If block_write_full_folio() is called for regular writeback
1798 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1799 * locked buffer.   This only can happen if someone has written the buffer
1800 * directly, with submit_bh().  At the address_space level PageWriteback
1801 * prevents this contention from occurring.
1802 *
1803 * If block_write_full_folio() is called with wbc->sync_mode ==
1804 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1805 * causes the writes to be flagged as synchronous writes.
1806 */
/*
 * Write out the dirty buffers of @folio.
 *
 * @inode:     inode the folio belongs to
 * @folio:     folio to write; it is unlocked on both the normal and the
 *             error path before this function returns
 * @get_block: callback used to map dirty buffers that are unmapped or delayed
 * @wbc:       writeback control; selects blocking vs. trylock behaviour and
 *             the write flags
 *
 * Returns 0 on the normal path, or the error returned by @get_block after
 * the buffers that were already mapped have been submitted.
 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	size_t blocksize;
	int nr_underway = 0;
	blk_opf_t write_flags = wbc_to_write_flags(wbc);

	/* Buffers created here start out dirty and uptodate. */
	head = folio_create_buffers(folio, inode,
				    (1 << BH_Dirty) | (1 << BH_Uptodate));

	/*
	 * Be very careful.  We have no exclusion from block_dirty_folio
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the folio stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by block_dirty_folio;
	 * handle that here by just cleaning them.
	 */

	bh = head;
	blocksize = bh->b_size;

	/* First block covered by this folio and last block inside i_size. */
	block = div_u64(folio_pos(folio), blocksize);
	last_block = div_u64(i_size_read(inode) - 1, blocksize);

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this folio can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_folio()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				clean_bdev_bh_alias(bh);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * The loop above ends with bh == head, so this pass starts from the
	 * first buffer again.  Note that "continue" in a do-while jumps to
	 * the loop condition, which is what advances bh.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the folio.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			folio_redirty_for_writepage(wbc, folio);
			continue;
		}
		/* Dirty buffers stay locked and are tagged for async write. */
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The folio and its buffers are protected by the writeback flag,
	 * so we can drop the bh refcounts early.
	 */
	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	/* Submit every buffer tagged above; next is read before submission. */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The folio was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * write_dirty_buffer/submit_bh.  A rare case.
		 */
		folio_end_writeback(folio);

		/*
		 * The folio and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The folio is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty folio.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	BUG_ON(folio_test_writeback(folio));
	/* Record the get_block() failure on the mapping before writing. */
	mapping_set_error(folio->mapping, err);
	folio_start_writeback(folio);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);
	/* err still holds the get_block() error and is returned via done. */
	goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);
1969
1970/*
1971 * If a folio has any new buffers, zero them out here, and mark them uptodate
1972 * and dirty so they'll be written out (in order to prevent uninitialised
1973 * block data from leaking). And clear the new bit.
1974 */
1975void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1976{
1977	size_t block_start, block_end;
1978	struct buffer_head *head, *bh;
1979
1980	BUG_ON(!folio_test_locked(folio));
1981	head = folio_buffers(folio);
1982	if (!head)
1983		return;
1984
1985	bh = head;
1986	block_start = 0;
1987	do {
1988		block_end = block_start + bh->b_size;
1989
1990		if (buffer_new(bh)) {
1991			if (block_end > from && block_start < to) {
1992				if (!folio_test_uptodate(folio)) {
1993					size_t start, xend;
1994
1995					start = max(from, block_start);
1996					xend = min(to, block_end);
1997
1998					folio_zero_segment(folio, start, xend);
1999					set_buffer_uptodate(bh);
2000				}
2001
2002				clear_buffer_new(bh);
2003				mark_buffer_dirty(bh);
2004			}
2005		}
2006
2007		block_start = block_end;
2008		bh = bh->b_this_page;
2009	} while (bh != head);
2010}
2011EXPORT_SYMBOL(folio_zero_new_buffers);
2012
2013static int
2014iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
2015		const struct iomap *iomap)
2016{
2017	loff_t offset = (loff_t)block << inode->i_blkbits;
2018
2019	bh->b_bdev = iomap->bdev;
2020
2021	/*
2022	 * Block points to offset in file we need to map, iomap contains
2023	 * the offset at which the map starts. If the map ends before the
2024	 * current block, then do not map the buffer and let the caller
2025	 * handle it.
2026	 */
2027	if (offset >= iomap->offset + iomap->length)
2028		return -EIO;
2029
2030	switch (iomap->type) {
2031	case IOMAP_HOLE:
2032		/*
2033		 * If the buffer is not up to date or beyond the current EOF,
2034		 * we need to mark it as new to ensure sub-block zeroing is
2035		 * executed if necessary.
2036		 */
2037		if (!buffer_uptodate(bh) ||
2038		    (offset >= i_size_read(inode)))
2039			set_buffer_new(bh);
2040		return 0;
2041	case IOMAP_DELALLOC:
2042		if (!buffer_uptodate(bh) ||
2043		    (offset >= i_size_read(inode)))
2044			set_buffer_new(bh);
2045		set_buffer_uptodate(bh);
2046		set_buffer_mapped(bh);
2047		set_buffer_delay(bh);
2048		return 0;
2049	case IOMAP_UNWRITTEN:
2050		/*
2051		 * For unwritten regions, we always need to ensure that regions
2052		 * in the block we are not writing to are zeroed. Mark the
2053		 * buffer as new to ensure this.
2054		 */
2055		set_buffer_new(bh);
2056		set_buffer_unwritten(bh);
2057		fallthrough;
2058	case IOMAP_MAPPED:
2059		if ((iomap->flags & IOMAP_F_NEW) ||
2060		    offset >= i_size_read(inode)) {
2061			/*
2062			 * This can happen if truncating the block device races
2063			 * with the check in the caller as i_size updates on
2064			 * block devices aren't synchronized by i_rwsem for
2065			 * block devices.
2066			 */
2067			if (S_ISBLK(inode->i_mode))
2068				return -EIO;
2069			set_buffer_new(bh);
2070		}
2071		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2072				inode->i_blkbits;
2073		set_buffer_mapped(bh);
2074		return 0;
2075	default:
2076		WARN_ON_ONCE(1);
2077		return -EIO;
2078	}
2079}
2080
2081int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
2082		get_block_t *get_block, const struct iomap *iomap)
2083{
2084	size_t from = offset_in_folio(folio, pos);
2085	size_t to = from + len;
2086	struct inode *inode = folio->mapping->host;
2087	size_t block_start, block_end;
2088	sector_t block;
2089	int err = 0;
2090	size_t blocksize;
2091	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2092
2093	BUG_ON(!folio_test_locked(folio));
2094	BUG_ON(to > folio_size(folio));
 
2095	BUG_ON(from > to);
2096
2097	head = folio_create_buffers(folio, inode, 0);
2098	blocksize = head->b_size;
2099	block = div_u64(folio_pos(folio), blocksize);
 
 
 
 
2100
2101	for (bh = head, block_start = 0; bh != head || !block_start;
2102	    block++, block_start=block_end, bh = bh->b_this_page) {
2103		block_end = block_start + blocksize;
2104		if (block_end <= from || block_start >= to) {
2105			if (folio_test_uptodate(folio)) {
2106				if (!buffer_uptodate(bh))
2107					set_buffer_uptodate(bh);
2108			}
2109			continue;
2110		}
2111		if (buffer_new(bh))
2112			clear_buffer_new(bh);
2113		if (!buffer_mapped(bh)) {
2114			WARN_ON(bh->b_size != blocksize);
2115			if (get_block)
2116				err = get_block(inode, block, bh, 1);
2117			else
2118				err = iomap_to_bh(inode, block, bh, iomap);
2119			if (err)
2120				break;
2121
2122			if (buffer_new(bh)) {
2123				clean_bdev_bh_alias(bh);
2124				if (folio_test_uptodate(folio)) {
 
2125					clear_buffer_new(bh);
2126					set_buffer_uptodate(bh);
2127					mark_buffer_dirty(bh);
2128					continue;
2129				}
2130				if (block_end > to || block_start < from)
2131					folio_zero_segments(folio,
2132						to, block_end,
2133						block_start, from);
2134				continue;
2135			}
2136		}
2137		if (folio_test_uptodate(folio)) {
2138			if (!buffer_uptodate(bh))
2139				set_buffer_uptodate(bh);
2140			continue; 
2141		}
2142		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2143		    !buffer_unwritten(bh) &&
2144		     (block_start < from || block_end > to)) {
2145			bh_read_nowait(bh, 0);
2146			*wait_bh++=bh;
2147		}
2148	}
2149	/*
2150	 * If we issued read requests - let them complete.
2151	 */
2152	while(wait_bh > wait) {
2153		wait_on_buffer(*--wait_bh);
2154		if (!buffer_uptodate(*wait_bh))
2155			err = -EIO;
2156	}
2157	if (unlikely(err))
2158		folio_zero_new_buffers(folio, from, to);
2159	return err;
2160}
2161
2162int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
2163		get_block_t *get_block)
2164{
2165	return __block_write_begin_int(folio, pos, len, get_block, NULL);
2166}
2167EXPORT_SYMBOL(__block_write_begin);
2168
2169static void __block_commit_write(struct folio *folio, size_t from, size_t to)
 
2170{
2171	size_t block_start, block_end;
2172	bool partial = false;
2173	unsigned blocksize;
2174	struct buffer_head *bh, *head;
2175
2176	bh = head = folio_buffers(folio);
2177	if (!bh)
2178		return;
2179	blocksize = bh->b_size;
2180
2181	block_start = 0;
2182	do {
 
2183		block_end = block_start + blocksize;
2184		if (block_end <= from || block_start >= to) {
2185			if (!buffer_uptodate(bh))
2186				partial = true;
2187		} else {
2188			set_buffer_uptodate(bh);
2189			mark_buffer_dirty(bh);
2190		}
2191		if (buffer_new(bh))
2192			clear_buffer_new(bh);
2193
2194		block_start = block_end;
2195		bh = bh->b_this_page;
2196	} while (bh != head);
2197
2198	/*
2199	 * If this is a partial write which happened to make all buffers
2200	 * uptodate then we can optimize away a bogus read_folio() for
2201	 * the next read(). Here we 'discover' whether the folio went
2202	 * uptodate as a result of this (potentially partial) write.
2203	 */
2204	if (!partial)
2205		folio_mark_uptodate(folio);
 
2206}
2207
2208/*
2209 * block_write_begin takes care of the basic task of block allocation and
2210 * bringing partial write blocks uptodate first.
2211 *
2212 * The filesystem needs to handle block truncation upon failure.
2213 */
2214int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2215		struct folio **foliop, get_block_t *get_block)
2216{
2217	pgoff_t index = pos >> PAGE_SHIFT;
2218	struct folio *folio;
2219	int status;
2220
2221	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2222			mapping_gfp_mask(mapping));
2223	if (IS_ERR(folio))
2224		return PTR_ERR(folio);
2225
2226	status = __block_write_begin_int(folio, pos, len, get_block, NULL);
2227	if (unlikely(status)) {
2228		folio_unlock(folio);
2229		folio_put(folio);
2230		folio = NULL;
2231	}
2232
2233	*foliop = folio;
2234	return status;
2235}
2236EXPORT_SYMBOL(block_write_begin);
2237
2238int block_write_end(struct file *file, struct address_space *mapping,
2239			loff_t pos, unsigned len, unsigned copied,
2240			struct folio *folio, void *fsdata)
2241{
2242	size_t start = pos - folio_pos(folio);
 
 
 
2243
2244	if (unlikely(copied < len)) {
2245		/*
2246		 * The buffers that were written will now be uptodate, so
2247		 * we don't have to worry about a read_folio reading them
2248		 * and overwriting a partial write. However if we have
2249		 * encountered a short write and only partially written
2250		 * into a buffer, it will not be marked uptodate, so a
2251		 * read_folio might come in and destroy our partial write.
2252		 *
2253		 * Do the simplest thing, and just treat any short write to a
2254		 * non uptodate folio as a zero-length write, and force the
2255		 * caller to redo the whole thing.
2256		 */
2257		if (!folio_test_uptodate(folio))
2258			copied = 0;
2259
2260		folio_zero_new_buffers(folio, start+copied, start+len);
2261	}
2262	flush_dcache_folio(folio);
2263
2264	/* This could be a short (even 0-length) commit */
2265	__block_commit_write(folio, start, start + copied);
2266
2267	return copied;
2268}
2269EXPORT_SYMBOL(block_write_end);
2270
2271int generic_write_end(struct file *file, struct address_space *mapping,
2272			loff_t pos, unsigned len, unsigned copied,
2273			struct folio *folio, void *fsdata)
2274{
2275	struct inode *inode = mapping->host;
2276	loff_t old_size = inode->i_size;
2277	bool i_size_changed = false;
2278
2279	copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
2280
2281	/*
2282	 * No need to use i_size_read() here, the i_size cannot change under us
2283	 * because we hold i_rwsem.
2284	 *
2285	 * But it's important to update i_size while still holding folio lock:
2286	 * page writeout could otherwise come in and zero beyond i_size.
2287	 */
2288	if (pos + copied > inode->i_size) {
2289		i_size_write(inode, pos + copied);
2290		i_size_changed = true;
2291	}
2292
2293	folio_unlock(folio);
2294	folio_put(folio);
2295
2296	if (old_size < pos)
2297		pagecache_isize_extended(inode, old_size, pos);
2298	/*
2299	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2300	 * makes the holding time of page lock longer. Second, it forces lock
2301	 * ordering of page lock and transaction start for journaling
2302	 * filesystems.
2303	 */
2304	if (i_size_changed)
2305		mark_inode_dirty(inode);
 
2306	return copied;
2307}
2308EXPORT_SYMBOL(generic_write_end);
2309
2310/*
2311 * block_is_partially_uptodate checks whether buffers within a folio are
2312 * uptodate or not.
2313 *
2314 * Returns true if all buffers which correspond to the specified part
2315 * of the folio are uptodate.
2316 */
2317bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
 
2318{
 
2319	unsigned block_start, block_end, blocksize;
2320	unsigned to;
2321	struct buffer_head *bh, *head;
2322	bool ret = true;
2323
2324	head = folio_buffers(folio);
2325	if (!head)
2326		return false;
2327	blocksize = head->b_size;
2328	to = min_t(unsigned, folio_size(folio) - from, count);
2329	to = from + to;
2330	if (from < blocksize && to > folio_size(folio) - blocksize)
2331		return false;
2332
 
2333	bh = head;
2334	block_start = 0;
2335	do {
2336		block_end = block_start + blocksize;
2337		if (block_end > from && block_start < to) {
2338			if (!buffer_uptodate(bh)) {
2339				ret = false;
2340				break;
2341			}
2342			if (block_end >= to)
2343				break;
2344		}
2345		block_start = block_end;
2346		bh = bh->b_this_page;
2347	} while (bh != head);
2348
2349	return ret;
2350}
2351EXPORT_SYMBOL(block_is_partially_uptodate);
2352
2353/*
2354 * Generic "read_folio" function for block devices that have the normal
2355 * get_block functionality. This is most of the block device filesystems.
2356 * Reads the folio asynchronously --- the unlock_buffer() and
2357 * set/clear_buffer_uptodate() functions propagate buffer state into the
2358 * folio once IO has completed.
2359 */
2360int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2361{
2362	struct inode *inode = folio->mapping->host;
2363	sector_t iblock, lblock;
2364	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2365	size_t blocksize;
2366	int nr, i;
2367	int fully_mapped = 1;
2368	bool page_error = false;
2369	loff_t limit = i_size_read(inode);
2370
2371	/* This is needed for ext4. */
2372	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2373		limit = inode->i_sb->s_maxbytes;
2374
2375	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2376
2377	head = folio_create_buffers(folio, inode, 0);
2378	blocksize = head->b_size;
 
 
 
2379
2380	iblock = div_u64(folio_pos(folio), blocksize);
2381	lblock = div_u64(limit + blocksize - 1, blocksize);
2382	bh = head;
2383	nr = 0;
2384	i = 0;
2385
2386	do {
2387		if (buffer_uptodate(bh))
2388			continue;
2389
2390		if (!buffer_mapped(bh)) {
2391			int err = 0;
2392
2393			fully_mapped = 0;
2394			if (iblock < lblock) {
2395				WARN_ON(bh->b_size != blocksize);
2396				err = get_block(inode, iblock, bh, 0);
2397				if (err)
2398					page_error = true;
2399			}
2400			if (!buffer_mapped(bh)) {
2401				folio_zero_range(folio, i * blocksize,
2402						blocksize);
2403				if (!err)
2404					set_buffer_uptodate(bh);
2405				continue;
2406			}
2407			/*
2408			 * get_block() might have updated the buffer
2409			 * synchronously
2410			 */
2411			if (buffer_uptodate(bh))
2412				continue;
2413		}
2414		arr[nr++] = bh;
2415	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2416
2417	if (fully_mapped)
2418		folio_set_mappedtodisk(folio);
2419
2420	if (!nr) {
2421		/*
2422		 * All buffers are uptodate or get_block() returned an
2423		 * error when trying to map them - we can finish the read.
2424		 */
2425		folio_end_read(folio, !page_error);
 
 
2426		return 0;
2427	}
2428
2429	/* Stage two: lock the buffers */
2430	for (i = 0; i < nr; i++) {
2431		bh = arr[i];
2432		lock_buffer(bh);
2433		mark_buffer_async_read(bh);
2434	}
2435
2436	/*
2437	 * Stage 3: start the IO.  Check for uptodateness
2438	 * inside the buffer lock in case another process reading
2439	 * the underlying blockdev brought it uptodate (the sct fix).
2440	 */
2441	for (i = 0; i < nr; i++) {
2442		bh = arr[i];
2443		if (buffer_uptodate(bh))
2444			end_buffer_async_read(bh, 1);
2445		else
2446			submit_bh(REQ_OP_READ, bh);
2447	}
2448	return 0;
2449}
2450EXPORT_SYMBOL(block_read_full_folio);
2451
2452/* utility function for filesystems that need to do work on expanding
2453 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2454 * deal with the hole.  
2455 */
2456int generic_cont_expand_simple(struct inode *inode, loff_t size)
2457{
2458	struct address_space *mapping = inode->i_mapping;
2459	const struct address_space_operations *aops = mapping->a_ops;
2460	struct folio *folio;
2461	void *fsdata = NULL;
2462	int err;
2463
2464	err = inode_newsize_ok(inode, size);
2465	if (err)
2466		goto out;
2467
2468	err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
 
 
2469	if (err)
2470		goto out;
2471
2472	err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
2473	BUG_ON(err > 0);
2474
2475out:
2476	return err;
2477}
2478EXPORT_SYMBOL(generic_cont_expand_simple);
2479
2480static int cont_expand_zero(struct file *file, struct address_space *mapping,
2481			    loff_t pos, loff_t *bytes)
2482{
2483	struct inode *inode = mapping->host;
2484	const struct address_space_operations *aops = mapping->a_ops;
2485	unsigned int blocksize = i_blocksize(inode);
2486	struct folio *folio;
2487	void *fsdata = NULL;
2488	pgoff_t index, curidx;
2489	loff_t curpos;
2490	unsigned zerofrom, offset, len;
2491	int err = 0;
2492
2493	index = pos >> PAGE_SHIFT;
2494	offset = pos & ~PAGE_MASK;
2495
2496	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2497		zerofrom = curpos & ~PAGE_MASK;
2498		if (zerofrom & (blocksize-1)) {
2499			*bytes |= (blocksize-1);
2500			(*bytes)++;
2501		}
2502		len = PAGE_SIZE - zerofrom;
2503
2504		err = aops->write_begin(file, mapping, curpos, len,
2505					    &folio, &fsdata);
 
2506		if (err)
2507			goto out;
2508		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
2509		err = aops->write_end(file, mapping, curpos, len, len,
2510						folio, fsdata);
2511		if (err < 0)
2512			goto out;
2513		BUG_ON(err != len);
2514		err = 0;
2515
2516		balance_dirty_pages_ratelimited(mapping);
2517
2518		if (fatal_signal_pending(current)) {
2519			err = -EINTR;
2520			goto out;
2521		}
2522	}
2523
2524	/* page covers the boundary, find the boundary offset */
2525	if (index == curidx) {
2526		zerofrom = curpos & ~PAGE_MASK;
2527		/* if we will expand the thing last block will be filled */
2528		if (offset <= zerofrom) {
2529			goto out;
2530		}
2531		if (zerofrom & (blocksize-1)) {
2532			*bytes |= (blocksize-1);
2533			(*bytes)++;
2534		}
2535		len = offset - zerofrom;
2536
2537		err = aops->write_begin(file, mapping, curpos, len,
2538					    &folio, &fsdata);
 
2539		if (err)
2540			goto out;
2541		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
2542		err = aops->write_end(file, mapping, curpos, len, len,
2543						folio, fsdata);
2544		if (err < 0)
2545			goto out;
2546		BUG_ON(err != len);
2547		err = 0;
2548	}
2549out:
2550	return err;
2551}
2552
2553/*
2554 * For moronic filesystems that do not allow holes in file.
2555 * We may have to extend the file.
2556 */
2557int cont_write_begin(struct file *file, struct address_space *mapping,
2558			loff_t pos, unsigned len,
2559			struct folio **foliop, void **fsdata,
2560			get_block_t *get_block, loff_t *bytes)
2561{
2562	struct inode *inode = mapping->host;
2563	unsigned int blocksize = i_blocksize(inode);
2564	unsigned int zerofrom;
2565	int err;
2566
2567	err = cont_expand_zero(file, mapping, pos, bytes);
2568	if (err)
2569		return err;
2570
2571	zerofrom = *bytes & ~PAGE_MASK;
2572	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2573		*bytes |= (blocksize-1);
2574		(*bytes)++;
2575	}
2576
2577	return block_write_begin(mapping, pos, len, foliop, get_block);
2578}
2579EXPORT_SYMBOL(cont_write_begin);
2580
/*
 * Page-based entry point: commit bytes [from, to) of the folio that
 * contains @page.
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
	__block_commit_write(page_folio(page), from, to);
}
EXPORT_SYMBOL(block_commit_write);
2587
2588/*
2589 * block_page_mkwrite() is not allowed to change the file size as it gets
2590 * called from a page fault handler when a page is first dirtied. Hence we must
2591 * be careful to check for EOF conditions here. We set the page up correctly
2592 * for a written page which means we get ENOSPC checking when writing into
2593 * holes and correct delalloc and unwritten extent mapping on filesystems that
2594 * support these features.
2595 *
2596 * We are not allowed to take the i_mutex here so we have to play games to
2597 * protect against truncate races as the page could now be beyond EOF.  Because
2598 * truncate writes the inode size before removing pages, once we have the
2599 * page lock we can determine safely if the page is beyond EOF. If it is not
2600 * beyond EOF, then the page is guaranteed safe against truncation until we
2601 * unlock the page.
2602 *
2603 * Direct callers of this function should protect against filesystem freezing
2604 * using sb_start_pagefault() - sb_end_pagefault() functions.
2605 */
2606int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2607			 get_block_t get_block)
2608{
2609	struct folio *folio = page_folio(vmf->page);
2610	struct inode *inode = file_inode(vma->vm_file);
2611	unsigned long end;
2612	loff_t size;
2613	int ret;
2614
2615	folio_lock(folio);
2616	size = i_size_read(inode);
2617	if ((folio->mapping != inode->i_mapping) ||
2618	    (folio_pos(folio) >= size)) {
2619		/* We overload EFAULT to mean page got truncated */
2620		ret = -EFAULT;
2621		goto out_unlock;
2622	}
2623
2624	end = folio_size(folio);
2625	/* folio is wholly or partially inside EOF */
2626	if (folio_pos(folio) + end > size)
2627		end = size - folio_pos(folio);
2628
2629	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
2630	if (unlikely(ret))
2631		goto out_unlock;
2632
2633	__block_commit_write(folio, 0, end);
 
 
2634
2635	folio_mark_dirty(folio);
2636	folio_wait_stable(folio);
 
 
 
 
 
 
 
 
 
 
 
 
 
2637	return 0;
2638out_unlock:
2639	folio_unlock(folio);
2640	return ret;
2641}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2642EXPORT_SYMBOL(block_page_mkwrite);
2643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2644int block_truncate_page(struct address_space *mapping,
2645			loff_t from, get_block_t *get_block)
2646{
2647	pgoff_t index = from >> PAGE_SHIFT;
 
2648	unsigned blocksize;
2649	sector_t iblock;
2650	size_t offset, length, pos;
2651	struct inode *inode = mapping->host;
2652	struct folio *folio;
2653	struct buffer_head *bh;
2654	int err = 0;
2655
2656	blocksize = i_blocksize(inode);
2657	length = from & (blocksize - 1);
2658
2659	/* Block boundary? Nothing to do */
2660	if (!length)
2661		return 0;
2662
2663	length = blocksize - length;
2664	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
 
 
 
 
 
2665
2666	folio = filemap_grab_folio(mapping, index);
2667	if (IS_ERR(folio))
2668		return PTR_ERR(folio);
2669
2670	bh = folio_buffers(folio);
2671	if (!bh)
2672		bh = create_empty_buffers(folio, blocksize, 0);
2673
2674	/* Find the buffer that contains "offset" */
2675	offset = offset_in_folio(folio, from);
2676	pos = blocksize;
2677	while (offset >= pos) {
2678		bh = bh->b_this_page;
2679		iblock++;
2680		pos += blocksize;
2681	}
2682
 
2683	if (!buffer_mapped(bh)) {
2684		WARN_ON(bh->b_size != blocksize);
2685		err = get_block(inode, iblock, bh, 0);
2686		if (err)
2687			goto unlock;
2688		/* unmapped? It's a hole - nothing to do */
2689		if (!buffer_mapped(bh))
2690			goto unlock;
2691	}
2692
2693	/* Ok, it's mapped. Make sure it's up-to-date */
2694	if (folio_test_uptodate(folio))
2695		set_buffer_uptodate(bh);
2696
2697	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2698		err = bh_read(bh, 0);
 
 
2699		/* Uhhuh. Read error. Complain and punt. */
2700		if (err < 0)
2701			goto unlock;
2702	}
2703
2704	folio_zero_range(folio, offset, length);
2705	mark_buffer_dirty(bh);
 
2706
2707unlock:
2708	folio_unlock(folio);
2709	folio_put(folio);
2710
2711	return err;
2712}
2713EXPORT_SYMBOL(block_truncate_page);
2714
2715/*
2716 * The generic ->writepage function for buffer-backed address_spaces
 
2717 */
2718int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
2719		void *get_block)
2720{
2721	struct inode * const inode = folio->mapping->host;
2722	loff_t i_size = i_size_read(inode);
 
 
2723
2724	/* Is the folio fully inside i_size? */
2725	if (folio_pos(folio) + folio_size(folio) <= i_size)
2726		return __block_write_full_folio(inode, folio, get_block, wbc);
2727
2728	/* Is the folio fully outside i_size? (truncate in progress) */
2729	if (folio_pos(folio) >= i_size) {
2730		folio_unlock(folio);
 
 
 
 
 
 
 
 
2731		return 0; /* don't care */
2732	}
2733
2734	/*
2735	 * The folio straddles i_size.  It must be zeroed out on each and every
2736	 * writepage invocation because it may be mmapped.  "A file is mapped
2737	 * in multiples of the page size.  For a file that is not a multiple of
2738	 * the page size, the remaining memory is zeroed when mapped, and
2739	 * writes to that region are not written out to the file."
2740	 */
2741	folio_zero_segment(folio, offset_in_folio(folio, i_size),
2742			folio_size(folio));
2743	return __block_write_full_folio(inode, folio, get_block, wbc);
 
 
 
 
 
 
 
 
 
 
2744}
 
2745
2746sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2747			    get_block_t *get_block)
2748{
 
2749	struct inode *inode = mapping->host;
2750	struct buffer_head tmp = {
2751		.b_size = i_blocksize(inode),
2752	};
2753
2754	get_block(inode, block, &tmp, 0);
2755	return tmp.b_blocknr;
2756}
2757EXPORT_SYMBOL(generic_block_bmap);
2758
2759static void end_bio_bh_io_sync(struct bio *bio)
2760{
2761	struct buffer_head *bh = bio->bi_private;
2762
2763	if (unlikely(bio_flagged(bio, BIO_QUIET)))
 
 
 
 
2764		set_bit(BH_Quiet, &bh->b_state);
2765
2766	bh->b_end_io(bh, !bio->bi_status);
2767	bio_put(bio);
2768}
2769
2770static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2771			  enum rw_hint write_hint,
2772			  struct writeback_control *wbc)
2773{
2774	const enum req_op op = opf & REQ_OP_MASK;
2775	struct bio *bio;
 
2776
2777	BUG_ON(!buffer_locked(bh));
2778	BUG_ON(!buffer_mapped(bh));
2779	BUG_ON(!bh->b_end_io);
2780	BUG_ON(buffer_delay(bh));
2781	BUG_ON(buffer_unwritten(bh));
2782
2783	/*
2784	 * Only clear out a write error when rewriting
2785	 */
2786	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2787		clear_buffer_write_io_error(bh);
2788
2789	if (buffer_meta(bh))
2790		opf |= REQ_META;
2791	if (buffer_prio(bh))
2792		opf |= REQ_PRIO;
2793
2794	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2795
2796	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2797
2798	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2799	bio->bi_write_hint = write_hint;
2800
2801	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
 
 
 
 
 
 
 
 
2802
2803	bio->bi_end_io = end_bio_bh_io_sync;
2804	bio->bi_private = bh;
2805
2806	/* Take care of bh's that straddle the end of the device */
2807	guard_bio_eod(bio);
2808
2809	if (wbc) {
2810		wbc_init_bio(wbc, bio);
2811		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
2812	}
2813
2814	submit_bio(bio);
 
2815}
 
2816
2817void submit_bh(blk_opf_t opf, struct buffer_head *bh)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2818{
2819	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2820}
2821EXPORT_SYMBOL(submit_bh);
2822
2823void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2824{
2825	lock_buffer(bh);
2826	if (!test_clear_buffer_dirty(bh)) {
2827		unlock_buffer(bh);
2828		return;
2829	}
2830	bh->b_end_io = end_buffer_write_sync;
2831	get_bh(bh);
2832	submit_bh(REQ_OP_WRITE | op_flags, bh);
2833}
2834EXPORT_SYMBOL(write_dirty_buffer);
2835
2836/*
2837 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2838 * and then start new I/O and then wait upon it.  The caller must have a ref on
2839 * the buffer_head.
2840 */
2841int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2842{
 
 
2843	WARN_ON(atomic_read(&bh->b_count) < 1);
2844	lock_buffer(bh);
2845	if (test_clear_buffer_dirty(bh)) {
2846		/*
2847		 * The bh should be mapped, but it might not be if the
2848		 * device was hot-removed. Not much we can do but fail the I/O.
2849		 */
2850		if (!buffer_mapped(bh)) {
2851			unlock_buffer(bh);
2852			return -EIO;
2853		}
2854
2855		get_bh(bh);
2856		bh->b_end_io = end_buffer_write_sync;
2857		submit_bh(REQ_OP_WRITE | op_flags, bh);
2858		wait_on_buffer(bh);
2859		if (!buffer_uptodate(bh))
2860			return -EIO;
2861	} else {
2862		unlock_buffer(bh);
2863	}
2864	return 0;
2865}
2866EXPORT_SYMBOL(__sync_dirty_buffer);
2867
2868int sync_dirty_buffer(struct buffer_head *bh)
2869{
2870	return __sync_dirty_buffer(bh, REQ_SYNC);
2871}
2872EXPORT_SYMBOL(sync_dirty_buffer);
2873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2874static inline int buffer_busy(struct buffer_head *bh)
2875{
2876	return atomic_read(&bh->b_count) |
2877		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2878}
2879
2880static bool
2881drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2882{
2883	struct buffer_head *head = folio_buffers(folio);
2884	struct buffer_head *bh;
2885
2886	bh = head;
2887	do {
 
 
2888		if (buffer_busy(bh))
2889			goto failed;
2890		bh = bh->b_this_page;
2891	} while (bh != head);
2892
2893	do {
2894		struct buffer_head *next = bh->b_this_page;
2895
2896		if (bh->b_assoc_map)
2897			__remove_assoc_queue(bh);
2898		bh = next;
2899	} while (bh != head);
2900	*buffers_to_free = head;
2901	folio_detach_private(folio);
2902	return true;
2903failed:
2904	return false;
2905}
2906
2907/**
2908 * try_to_free_buffers - Release buffers attached to this folio.
2909 * @folio: The folio.
2910 *
2911 * If any buffers are in use (dirty, under writeback, elevated refcount),
2912 * no buffers will be freed.
2913 *
2914 * If the folio is dirty but all the buffers are clean then we need to
2915 * be sure to mark the folio clean as well.  This is because the folio
2916 * may be against a block device, and a later reattachment of buffers
2917 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
2918 * filesystem data on the same device.
2919 *
2920 * The same applies to regular filesystem folios: if all the buffers are
2921 * clean then we set the folio clean and proceed.  To do that, we require
2922 * total exclusion from block_dirty_folio().  That is obtained with
2923 * i_private_lock.
2924 *
2925 * Exclusion against try_to_free_buffers may be obtained by either
2926 * locking the folio or by holding its mapping's i_private_lock.
2927 *
2928 * Context: Process context.  @folio must be locked.  Will not sleep.
2929 * Return: true if all buffers attached to this folio were freed.
2930 */
2931bool try_to_free_buffers(struct folio *folio)
2932{
2933	struct address_space * const mapping = folio->mapping;
2934	struct buffer_head *buffers_to_free = NULL;
2935	bool ret = 0;
2936
2937	BUG_ON(!folio_test_locked(folio));
2938	if (folio_test_writeback(folio))
2939		return false;
2940
2941	if (mapping == NULL) {		/* can this still happen? */
2942		ret = drop_buffers(folio, &buffers_to_free);
2943		goto out;
2944	}
2945
2946	spin_lock(&mapping->i_private_lock);
2947	ret = drop_buffers(folio, &buffers_to_free);
2948
2949	/*
2950	 * If the filesystem writes its buffers by hand (eg ext3)
2951	 * then we can have clean buffers against a dirty folio.  We
2952	 * clean the folio here; otherwise the VM will never notice
2953	 * that the filesystem did any IO at all.
2954	 *
2955	 * Also, during truncate, discard_buffer will have marked all
2956	 * the folio's buffers clean.  We discover that here and clean
2957	 * the folio also.
2958	 *
2959	 * i_private_lock must be held over this entire operation in order
2960	 * to synchronise against block_dirty_folio and prevent the
2961	 * dirty bit from being lost.
2962	 */
2963	if (ret)
2964		folio_cancel_dirty(folio);
2965	spin_unlock(&mapping->i_private_lock);
2966out:
2967	if (buffers_to_free) {
2968		struct buffer_head *bh = buffers_to_free;
2969
2970		do {
2971			struct buffer_head *next = bh->b_this_page;
2972			free_buffer_head(bh);
2973			bh = next;
2974		} while (bh != buffers_to_free);
2975	}
2976	return ret;
2977}
2978EXPORT_SYMBOL(try_to_free_buffers);
2979
2980/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2981 * Buffer-head allocation
2982 */
2983static struct kmem_cache *bh_cachep __ro_after_init;
2984
2985/*
2986 * Once the number of bh's in the machine exceeds this level, we start
2987 * stripping them in writeback.
2988 */
2989static unsigned long max_buffer_heads __ro_after_init;
2990
2991int buffer_heads_over_limit;
2992
2993struct bh_accounting {
2994	int nr;			/* Number of live bh's */
2995	int ratelimit;		/* Limit cacheline bouncing */
2996};
2997
2998static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2999
3000static void recalc_bh_state(void)
3001{
3002	int i;
3003	int tot = 0;
3004
3005	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3006		return;
3007	__this_cpu_write(bh_accounting.ratelimit, 0);
3008	for_each_online_cpu(i)
3009		tot += per_cpu(bh_accounting, i).nr;
3010	buffer_heads_over_limit = (tot > max_buffer_heads);
3011}
3012
3013struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3014{
3015	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3016	if (ret) {
3017		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3018		spin_lock_init(&ret->b_uptodate_lock);
3019		preempt_disable();
3020		__this_cpu_inc(bh_accounting.nr);
3021		recalc_bh_state();
3022		preempt_enable();
3023	}
3024	return ret;
3025}
3026EXPORT_SYMBOL(alloc_buffer_head);
3027
3028void free_buffer_head(struct buffer_head *bh)
3029{
3030	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3031	kmem_cache_free(bh_cachep, bh);
3032	preempt_disable();
3033	__this_cpu_dec(bh_accounting.nr);
3034	recalc_bh_state();
3035	preempt_enable();
3036}
3037EXPORT_SYMBOL(free_buffer_head);
3038
3039static int buffer_exit_cpu_dead(unsigned int cpu)
3040{
3041	int i;
3042	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3043
3044	for (i = 0; i < BH_LRU_SIZE; i++) {
3045		brelse(b->bhs[i]);
3046		b->bhs[i] = NULL;
3047	}
3048	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3049	per_cpu(bh_accounting, cpu).nr = 0;
3050	return 0;
 
 
 
 
 
 
 
3051}
3052
3053/**
3054 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3055 * @bh: struct buffer_head
3056 *
3057 * Return true if the buffer is up-to-date and false,
3058 * with the buffer locked, if not.
3059 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: already up to date, no locking needed. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh))
		return 0;	/* still stale: return with the buffer locked */

	/* Became up to date while we were taking the lock. */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3071
3072/**
3073 * __bh_read - Submit read for a locked buffer
3074 * @bh: struct buffer_head
3075 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3076 * @wait: wait until reading finish
3077 *
3078 * Returns zero on success or don't wait, and -EIO on error.
3079 */
3080int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3081{
3082	int ret = 0;
3083
3084	BUG_ON(!buffer_locked(bh));
3085
3086	get_bh(bh);
3087	bh->b_end_io = end_buffer_read_sync;
3088	submit_bh(REQ_OP_READ | op_flags, bh);
3089	if (wait) {
3090		wait_on_buffer(bh);
3091		if (!buffer_uptodate(bh))
3092			ret = -EIO;
3093	}
3094	return ret;
3095}
3096EXPORT_SYMBOL(__bh_read);
3097
/**
 * __bh_read_batch - Submit read for a batch of unlocked buffers
 * @nr: entry number of the buffer batch
 * @bhs: a batch of struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @force_lock: force to get a lock on the buffer if set, otherwise drops any
 *              buffer that cannot lock.
 *
 * Submits the reads without waiting for them to complete.  Returns nothing.
 */
3108void __bh_read_batch(int nr, struct buffer_head *bhs[],
3109		     blk_opf_t op_flags, bool force_lock)
3110{
3111	int i;
3112
3113	for (i = 0; i < nr; i++) {
3114		struct buffer_head *bh = bhs[i];
3115
3116		if (buffer_uptodate(bh))
3117			continue;
3118
3119		if (force_lock)
3120			lock_buffer(bh);
3121		else
3122			if (!trylock_buffer(bh))
3123				continue;
3124
3125		if (buffer_uptodate(bh)) {
3126			unlock_buffer(bh);
3127			continue;
3128		}
3129
3130		bh->b_end_io = end_buffer_read_sync;
3131		get_bh(bh);
3132		submit_bh(REQ_OP_READ | op_flags, bh);
3133	}
3134}
3135EXPORT_SYMBOL(__bh_read_batch);
3136
3137void __init buffer_init(void)
3138{
3139	unsigned long nrpages;
3140	int ret;
 
 
 
 
 
3141
3142	bh_cachep = KMEM_CACHE(buffer_head,
3143				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
3144	/*
3145	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3146	 */
3147	nrpages = (nr_free_buffer_pages() * 10) / 100;
3148	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3149	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3150					NULL, buffer_exit_cpu_dead);
3151	WARN_ON(ret < 0);
3152}

 
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
 
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
 
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/module.h>
 
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <linux/bitops.h>
  42#include <linux/mpage.h>
  43#include <linux/bit_spinlock.h>
  44#include <linux/cleancache.h>
 
 
 
 
 
 
 
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 
  47
  48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50inline void
  51init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  52{
  53	bh->b_end_io = handler;
  54	bh->b_private = private;
  55}
  56EXPORT_SYMBOL(init_buffer);
  57
  58static int sleep_on_buffer(void *word)
  59{
  60	io_schedule();
  61	return 0;
  62}
 
  63
  64void __lock_buffer(struct buffer_head *bh)
  65{
  66	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
  67							TASK_UNINTERRUPTIBLE);
  68}
  69EXPORT_SYMBOL(__lock_buffer);
  70
  71void unlock_buffer(struct buffer_head *bh)
  72{
  73	clear_bit_unlock(BH_Lock, &bh->b_state);
  74	smp_mb__after_clear_bit();
  75	wake_up_bit(&bh->b_state, BH_Lock);
  76}
  77EXPORT_SYMBOL(unlock_buffer);
  78
  79/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  80 * Block until a buffer comes unlocked.  This doesn't stop it
  81 * from becoming locked again - you have to lock it yourself
  82 * if you want to preserve its state.
  83 */
  84void __wait_on_buffer(struct buffer_head * bh)
  85{
  86	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
  87}
  88EXPORT_SYMBOL(__wait_on_buffer);
  89
  90static void
  91__clear_page_buffers(struct page *page)
  92{
  93	ClearPagePrivate(page);
  94	set_page_private(page, 0);
  95	page_cache_release(page);
  96}
  97
  98
  99static int quiet_error(struct buffer_head *bh)
 100{
 101	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
 102		return 0;
 103	return 1;
 104}
 105
 106
 107static void buffer_io_error(struct buffer_head *bh)
 108{
 109	char b[BDEVNAME_SIZE];
 110	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 111			bdevname(bh->b_bdev, b),
 112			(unsigned long long)bh->b_blocknr);
 113}
 114
 115/*
 116 * End-of-IO handler helper function which does not touch the bh after
 117 * unlocking it.
 118 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 119 * a race there is benign: unlock_buffer() only use the bh's address for
 120 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 121 * itself.
 122 */
 123static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 124{
 125	if (uptodate) {
 126		set_buffer_uptodate(bh);
 127	} else {
 128		/* This happens, due to failed READA attempts. */
 129		clear_buffer_uptodate(bh);
 130	}
 131	unlock_buffer(bh);
 132}
 133
 134/*
 135 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 136 * unlock the buffer. This is what ll_rw_block uses too.
 137 */
 138void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 139{
 140	__end_buffer_read_notouch(bh, uptodate);
 141	put_bh(bh);
 142}
 143EXPORT_SYMBOL(end_buffer_read_sync);
 144
 145void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 146{
 147	char b[BDEVNAME_SIZE];
 148
 149	if (uptodate) {
 150		set_buffer_uptodate(bh);
 151	} else {
 152		if (!quiet_error(bh)) {
 153			buffer_io_error(bh);
 154			printk(KERN_WARNING "lost page write due to "
 155					"I/O error on %s\n",
 156				       bdevname(bh->b_bdev, b));
 157		}
 158		set_buffer_write_io_error(bh);
 159		clear_buffer_uptodate(bh);
 160	}
 161	unlock_buffer(bh);
 162	put_bh(bh);
 163}
 164EXPORT_SYMBOL(end_buffer_write_sync);
 165
 166/*
 167 * Various filesystems appear to want __find_get_block to be non-blocking.
 168 * But it's the page lock which protects the buffers.  To get around this,
 169 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 170 * private_lock.
 171 *
 172 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 173 * may be quite high.  This code could TryLock the page, and if that
 174 * succeeds, there is no need to take private_lock. (But if
 175 * private_lock is contended then so is mapping->tree_lock).
 176 */
 177static struct buffer_head *
 178__find_get_block_slow(struct block_device *bdev, sector_t block)
 179{
 180	struct inode *bd_inode = bdev->bd_inode;
 181	struct address_space *bd_mapping = bd_inode->i_mapping;
 182	struct buffer_head *ret = NULL;
 183	pgoff_t index;
 184	struct buffer_head *bh;
 185	struct buffer_head *head;
 186	struct page *page;
 187	int all_mapped = 1;
 
 188
 189	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 190	page = find_get_page(bd_mapping, index);
 191	if (!page)
 192		goto out;
 193
 194	spin_lock(&bd_mapping->private_lock);
 195	if (!page_has_buffers(page))
 
 196		goto out_unlock;
 197	head = page_buffers(page);
 198	bh = head;
 199	do {
 200		if (!buffer_mapped(bh))
 201			all_mapped = 0;
 202		else if (bh->b_blocknr == block) {
 203			ret = bh;
 204			get_bh(bh);
 205			goto out_unlock;
 206		}
 207		bh = bh->b_this_page;
 208	} while (bh != head);
 209
 210	/* we might be here because some of the buffers on this page are
 211	 * not mapped.  This is due to various races between
 212	 * file io on the block device and getblk.  It gets dealt with
 213	 * elsewhere, don't buffer_error if we had some unmapped buffers
 214	 */
 215	if (all_mapped) {
 216		printk("__find_get_block_slow() failed. "
 217			"block=%llu, b_blocknr=%llu\n",
 218			(unsigned long long)block,
 219			(unsigned long long)bh->b_blocknr);
 220		printk("b_state=0x%08lx, b_size=%zu\n",
 221			bh->b_state, bh->b_size);
 222		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 
 223	}
 224out_unlock:
 225	spin_unlock(&bd_mapping->private_lock);
 226	page_cache_release(page);
 227out:
 228	return ret;
 229}
 230
 231/* If invalidate_buffers() will trash dirty buffers, it means some kind
 232   of fs corruption is going on. Trashing dirty data always imply losing
 233   information that was supposed to be just stored on the physical layer
 234   by the user.
 235
   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.
 239  
 240   We also skip buffers which are still in use.  For example this can
 241   happen if a userspace program is reading the block device.
 242
 243   NOTE: In the case where the user removed a removable-media-disk even if
 244   there's still dirty data not synced on disk (due a bug in the device driver
 245   or due an error of the user), by not destroying the dirty buffers we could
 246   generate corruption also on the next media inserted, thus a parameter is
 247   necessary to handle this case in the most safe way possible (trying
 248   to not corrupt also the new disk inserted with the data belonging to
 249   the old now corrupted disk). Also for the ramdisk the natural thing
 250   to do in order to release the ramdisk memory is to destroy dirty buffers.
 251
 252   These are two special cases. Normal usage imply the device driver
 253   to issue a sync on the device (without waiting I/O completion) and
 254   then an invalidate_buffers call that doesn't trash dirty buffers.
 255
   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
 263void invalidate_bdev(struct block_device *bdev)
 264{
 265	struct address_space *mapping = bdev->bd_inode->i_mapping;
 266
 267	if (mapping->nrpages == 0)
 268		return;
 269
 270	invalidate_bh_lrus();
 271	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 272	invalidate_mapping_pages(mapping, 0, -1);
 273	/* 99% of the time, we don't need to flush the cleancache on the bdev.
 274	 * But, for the strange corners, lets be cautious
 275	 */
 276	cleancache_flush_inode(mapping);
 277}
 278EXPORT_SYMBOL(invalidate_bdev);
 279
 280/*
 281 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 282 */
 283static void free_more_memory(void)
 284{
 285	struct zone *zone;
 286	int nid;
 287
 288	wakeup_flusher_threads(1024);
 289	yield();
 290
 291	for_each_online_node(nid) {
 292		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 293						gfp_zone(GFP_NOFS), NULL,
 294						&zone);
 295		if (zone)
 296			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 297						GFP_NOFS, NULL);
 298	}
 299}
 300
 301/*
 302 * I/O completion handler for block_read_full_page() - pages
 303 * which come unlocked at the end of I/O.
 304 */
 305static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 306{
 307	unsigned long flags;
 308	struct buffer_head *first;
 309	struct buffer_head *tmp;
 310	struct page *page;
 311	int page_uptodate = 1;
 312
 313	BUG_ON(!buffer_async_read(bh));
 314
 315	page = bh->b_page;
 316	if (uptodate) {
 317		set_buffer_uptodate(bh);
 318	} else {
 319		clear_buffer_uptodate(bh);
 320		if (!quiet_error(bh))
 321			buffer_io_error(bh);
 322		SetPageError(page);
 323	}
 324
 325	/*
 326	 * Be _very_ careful from here on. Bad things can happen if
 327	 * two buffer heads end IO at almost the same time and both
 328	 * decide that the page is now completely done.
 329	 */
 330	first = page_buffers(page);
 331	local_irq_save(flags);
 332	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 333	clear_buffer_async_read(bh);
 334	unlock_buffer(bh);
 335	tmp = bh;
 336	do {
 337		if (!buffer_uptodate(tmp))
 338			page_uptodate = 0;
 339		if (buffer_async_read(tmp)) {
 340			BUG_ON(!buffer_locked(tmp));
 341			goto still_busy;
 342		}
 343		tmp = tmp->b_this_page;
 344	} while (tmp != bh);
 345	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 346	local_irq_restore(flags);
 347
 348	/*
 349	 * If none of the buffers had errors and they are all
 350	 * uptodate then we can set the page uptodate.
 351	 */
 352	if (page_uptodate && !PageError(page))
 353		SetPageUptodate(page);
 354	unlock_page(page);
 355	return;
 356
 357still_busy:
 358	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 359	local_irq_restore(flags);
 360	return;
 361}
 362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 363/*
 364 * Completion handler for block_write_full_page() - pages which are unlocked
 365 * during I/O, and which have PageWriteback cleared upon I/O completion.
 366 */
 367void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 368{
 369	char b[BDEVNAME_SIZE];
 370	unsigned long flags;
 371	struct buffer_head *first;
 372	struct buffer_head *tmp;
 373	struct page *page;
 374
 375	BUG_ON(!buffer_async_write(bh));
 376
 377	page = bh->b_page;
 378	if (uptodate) {
 379		set_buffer_uptodate(bh);
 380	} else {
 381		if (!quiet_error(bh)) {
 382			buffer_io_error(bh);
 383			printk(KERN_WARNING "lost page write due to "
 384					"I/O error on %s\n",
 385			       bdevname(bh->b_bdev, b));
 386		}
 387		set_bit(AS_EIO, &page->mapping->flags);
 388		set_buffer_write_io_error(bh);
 389		clear_buffer_uptodate(bh);
 390		SetPageError(page);
 391	}
 392
 393	first = page_buffers(page);
 394	local_irq_save(flags);
 395	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 396
 397	clear_buffer_async_write(bh);
 398	unlock_buffer(bh);
 399	tmp = bh->b_this_page;
 400	while (tmp != bh) {
 401		if (buffer_async_write(tmp)) {
 402			BUG_ON(!buffer_locked(tmp));
 403			goto still_busy;
 404		}
 405		tmp = tmp->b_this_page;
 406	}
 407	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 408	local_irq_restore(flags);
 409	end_page_writeback(page);
 410	return;
 411
 412still_busy:
 413	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 414	local_irq_restore(flags);
 415	return;
 416}
 417EXPORT_SYMBOL(end_buffer_async_write);
 418
 419/*
 420 * If a page's buffers are under async readin (end_buffer_async_read
 421 * completion) then there is a possibility that another thread of
 422 * control could lock one of the buffers after it has completed
 423 * but while some of the other buffers have not completed.  This
 424 * locked buffer would confuse end_buffer_async_read() into not unlocking
 425 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 426 * that this buffer is not under async I/O.
 427 *
 428 * The page comes unlocked when it has no locked buffer_async buffers
 429 * left.
 430 *
 431 * PageLocked prevents anyone starting new async I/O reads any of
 432 * the buffers.
 433 *
 434 * PageWriteback is used to prevent simultaneous writeout of the same
 435 * page.
 436 *
 437 * PageLocked prevents anyone from starting writeback of a page which is
 438 * under read I/O (PageWriteback is only ever set against a locked page).
 439 */
 440static void mark_buffer_async_read(struct buffer_head *bh)
 441{
 442	bh->b_end_io = end_buffer_async_read;
 443	set_buffer_async_read(bh);
 444}
 445
 446static void mark_buffer_async_write_endio(struct buffer_head *bh,
 447					  bh_end_io_t *handler)
 448{
 449	bh->b_end_io = handler;
 450	set_buffer_async_write(bh);
 451}
 452
 453void mark_buffer_async_write(struct buffer_head *bh)
 454{
 455	mark_buffer_async_write_endio(bh, end_buffer_async_write);
 456}
 457EXPORT_SYMBOL(mark_buffer_async_write);
 458
 459
 460/*
 461 * fs/buffer.c contains helper functions for buffer-backed address space's
 462 * fsync functions.  A common requirement for buffer-based filesystems is
 463 * that certain data from the backing blockdev needs to be written out for
 464 * a successful fsync().  For example, ext2 indirect blocks need to be
 465 * written back and waited upon before fsync() returns.
 466 *
 467 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 468 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 469 * management of a list of dependent buffers at ->i_mapping->private_list.
 470 *
 471 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 472 * from their controlling inode's queue when they are being freed.  But
 473 * try_to_free_buffers() will be operating against the *blockdev* mapping
 474 * at the time, not against the S_ISREG file which depends on those buffers.
 475 * So the locking for private_list is via the private_lock in the address_space
 476 * which backs the buffers.  Which is different from the address_space 
 477 * against which the buffers are listed.  So for a particular address_space,
 478 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 479 * mapping->private_list will always be protected by the backing blockdev's
 480 * ->private_lock.
 481 *
 482 * Which introduces a requirement: all buffers on an address_space's
 483 * ->private_list must be from the same address_space: the blockdev's.
 484 *
 485 * address_spaces which do not place buffers at ->private_list via these
 486 * utility functions are free to use private_lock and private_list for
 487 * whatever they want.  The only requirement is that list_empty(private_list)
 488 * be true at clear_inode() time.
 489 *
 490 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 491 * filesystems should do that.  invalidate_inode_buffers() should just go
 492 * BUG_ON(!list_empty).
 493 *
 494 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 495 * take an address_space, not an inode.  And it should be called
 496 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 497 * queued up.
 498 *
 499 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 500 * list if it is already on a list.  Because if the buffer is on a list,
 501 * it *must* already be on the right one.  If not, the filesystem is being
 502 * silly.  This will save a ton of locking.  But first we have to ensure
 503 * that buffers are taken *off* the old inode's list when they are freed
 504 * (presumably in truncate).  That requires careful auditing of all
 505 * filesystems (do it inside bforget()).  It could also be done by bringing
 506 * b_inode back.
 507 */
 508
 509/*
 510 * The buffer's backing address_space's private_lock must be held
 511 */
 512static void __remove_assoc_queue(struct buffer_head *bh)
 513{
 514	list_del_init(&bh->b_assoc_buffers);
 515	WARN_ON(!bh->b_assoc_map);
 516	if (buffer_write_io_error(bh))
 517		set_bit(AS_EIO, &bh->b_assoc_map->flags);
 518	bh->b_assoc_map = NULL;
 519}
 520
 521int inode_has_buffers(struct inode *inode)
 522{
 523	return !list_empty(&inode->i_data.private_list);
 524}
 525
 526/*
 527 * osync is designed to support O_SYNC io.  It waits synchronously for
 528 * all already-submitted IO to complete, but does not queue any new
 529 * writes to the disk.
 530 *
 531 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 532 * you dirty the buffers, and then use osync_inode_buffers to wait for
 533 * completion.  Any other dirty buffers which are not yet queued for
 534 * write will not be flushed to disk by the osync.
 535 */
 536static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 537{
 538	struct buffer_head *bh;
 539	struct list_head *p;
 540	int err = 0;
 541
 542	spin_lock(lock);
 543repeat:
 544	list_for_each_prev(p, list) {
 545		bh = BH_ENTRY(p);
 546		if (buffer_locked(bh)) {
 547			get_bh(bh);
 548			spin_unlock(lock);
 549			wait_on_buffer(bh);
 550			if (!buffer_uptodate(bh))
 551				err = -EIO;
 552			brelse(bh);
 553			spin_lock(lock);
 554			goto repeat;
 555		}
 556	}
 557	spin_unlock(lock);
 558	return err;
 559}
 560
 561static void do_thaw_one(struct super_block *sb, void *unused)
 
 
 
 
 
 
 
 
 
 
 
 562{
 563	char b[BDEVNAME_SIZE];
 564	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 565		printk(KERN_WARNING "Emergency Thaw on %s\n",
 566		       bdevname(sb->s_bdev, b));
 567}
 568
 569static void do_thaw_all(struct work_struct *work)
 570{
 571	iterate_supers(do_thaw_one, NULL);
 572	kfree(work);
 573	printk(KERN_WARNING "Emergency Thaw complete\n");
 574}
 
 575
 576/**
 577 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 
 578 *
 579 * Used for emergency unfreeze of all filesystems via SysRq
 
 
 
 
 
 
 
 580 */
 581void emergency_thaw_all(void)
 
 582{
 583	struct work_struct *work;
 
 
 
 
 
 
 584
 585	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 586	if (work) {
 587		INIT_WORK(work, do_thaw_all);
 588		schedule_work(work);
 589	}
 
 
 
 
 
 
 
 
 
 
 
 590}
 
 591
 592/**
 593 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 594 * @mapping: the mapping which wants those buffers written
 595 *
 596 * Starts I/O against the buffers at mapping->private_list, and waits upon
 597 * that I/O.
 598 *
 599 * Basically, this is a convenience function for fsync().
 600 * @mapping is a file or directory which needs those buffers to be written for
 601 * a successful fsync().
 
 
 
 
 
 
 602 */
 603int sync_mapping_buffers(struct address_space *mapping)
 
 604{
 605	struct address_space *buffer_mapping = mapping->assoc_mapping;
 
 606
 607	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 608		return 0;
 609
 610	return fsync_buffers_list(&buffer_mapping->private_lock,
 611					&mapping->private_list);
 612}
 613EXPORT_SYMBOL(sync_mapping_buffers);
 614
 615/*
 616 * Called when we've recently written block `bblock', and it is known that
 617 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 618 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 619 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 620 */
 621void write_boundary_block(struct block_device *bdev,
 622			sector_t bblock, unsigned blocksize)
 623{
 624	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 625	if (bh) {
 626		if (buffer_dirty(bh))
 627			ll_rw_block(WRITE, 1, &bh);
 628		put_bh(bh);
 629	}
 630}
 631
 632void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 633{
 634	struct address_space *mapping = inode->i_mapping;
 635	struct address_space *buffer_mapping = bh->b_page->mapping;
 636
 637	mark_buffer_dirty(bh);
 638	if (!mapping->assoc_mapping) {
 639		mapping->assoc_mapping = buffer_mapping;
 640	} else {
 641		BUG_ON(mapping->assoc_mapping != buffer_mapping);
 642	}
 643	if (!bh->b_assoc_map) {
 644		spin_lock(&buffer_mapping->private_lock);
 645		list_move_tail(&bh->b_assoc_buffers,
 646				&mapping->private_list);
 647		bh->b_assoc_map = mapping;
 648		spin_unlock(&buffer_mapping->private_lock);
 649	}
 650}
 651EXPORT_SYMBOL(mark_buffer_dirty_inode);
 652
 653/*
 654 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 655 * dirty.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 656 *
 657 * If warn is true, then emit a warning if the page is not uptodate and has
 658 * not been truncated.
 659 */
 660static void __set_page_dirty(struct page *page,
 661		struct address_space *mapping, int warn)
 662{
 663	spin_lock_irq(&mapping->tree_lock);
 664	if (page->mapping) {	/* Race with truncate? */
 665		WARN_ON_ONCE(warn && !PageUptodate(page));
 666		account_page_dirtied(page, mapping);
 667		radix_tree_tag_set(&mapping->page_tree,
 668				page_index(page), PAGECACHE_TAG_DIRTY);
 669	}
 670	spin_unlock_irq(&mapping->tree_lock);
 671	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 672}
 673
 674/*
 675 * Add a page to the dirty page list.
 676 *
 677 * It is a sad fact of life that this function is called from several places
 678 * deeply under spinlocking.  It may not sleep.
 679 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 684 *
 685 * The buffers are dirtied before the page is dirtied.  There's a small race
 686 * window in which a writepage caller may see the page cleanness but not the
 687 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 688 * before the buffers, a concurrent writepage caller could clear the page dirty
 689 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 690 * page on the dirty page list.
 691 *
 692 * We use private_lock to lock against try_to_free_buffers while using the
 693 * page's buffer list.  Also use this to protect against clean buffers being
 694 * added to the page after it was set dirty.
 695 *
 696 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 697 * address_space though.
 698 */
 699int __set_page_dirty_buffers(struct page *page)
 700{
 701	int newly_dirty;
 702	struct address_space *mapping = page_mapping(page);
 703
 704	if (unlikely(!mapping))
 705		return !TestSetPageDirty(page);
 706
 707	spin_lock(&mapping->private_lock);
 708	if (page_has_buffers(page)) {
 709		struct buffer_head *head = page_buffers(page);
 710		struct buffer_head *bh = head;
 711
 712		do {
 713			set_buffer_dirty(bh);
 714			bh = bh->b_this_page;
 715		} while (bh != head);
 716	}
 717	newly_dirty = !TestSetPageDirty(page);
 718	spin_unlock(&mapping->private_lock);
 
 
 
 
 
 
 
 719
 720	if (newly_dirty)
 721		__set_page_dirty(page, mapping, 1);
 
 722	return newly_dirty;
 723}
 724EXPORT_SYMBOL(__set_page_dirty_buffers);
 725
 726/*
 727 * Write out and wait upon a list of buffers.
 728 *
 729 * We have conflicting pressures: we want to make sure that all
 730 * initially dirty buffers get waited on, but that any subsequently
 731 * dirtied buffers don't.  After all, we don't want fsync to last
 732 * forever if somebody is actively writing to the file.
 733 *
 734 * Do this in two main stages: first we copy dirty buffers to a
 735 * temporary inode list, queueing the writes as we go.  Then we clean
 736 * up, waiting for those writes to complete.
 737 * 
 738 * During this second stage, any subsequent updates to the file may end
 739 * up refiling the buffer on the original inode's dirty list again, so
 740 * there is a chance we will end up with a buffer queued for write but
 741 * not yet completed on that list.  So, as a final cleanup we go through
 742 * the osync code to catch these locked, dirty buffers without requeuing
 743 * any newly dirty buffers for write.
 744 */
 745static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 746{
 747	struct buffer_head *bh;
 748	struct list_head tmp;
 749	struct address_space *mapping;
 750	int err = 0, err2;
 751	struct blk_plug plug;
 
 752
 753	INIT_LIST_HEAD(&tmp);
 754	blk_start_plug(&plug);
 755
 756	spin_lock(lock);
 757	while (!list_empty(list)) {
 758		bh = BH_ENTRY(list->next);
 759		mapping = bh->b_assoc_map;
 760		__remove_assoc_queue(bh);
 761		/* Avoid race with mark_buffer_dirty_inode() which does
 762		 * a lockless check and we rely on seeing the dirty bit */
 763		smp_mb();
 764		if (buffer_dirty(bh) || buffer_locked(bh)) {
 765			list_add(&bh->b_assoc_buffers, &tmp);
 766			bh->b_assoc_map = mapping;
 767			if (buffer_dirty(bh)) {
 768				get_bh(bh);
 769				spin_unlock(lock);
 770				/*
 771				 * Ensure any pending I/O completes so that
 772				 * write_dirty_buffer() actually writes the
 773				 * current contents - it is a noop if I/O is
 774				 * still in flight on potentially older
 775				 * contents.
 776				 */
 777				write_dirty_buffer(bh, WRITE_SYNC);
 778
 779				/*
 780				 * Kick off IO for the previous mapping. Note
 781				 * that we will not run the very last mapping,
 782				 * wait_on_buffer() will do that for us
 783				 * through sync_buffer().
 784				 */
 785				brelse(bh);
 786				spin_lock(lock);
 787			}
 788		}
 789	}
 790
 791	spin_unlock(lock);
 792	blk_finish_plug(&plug);
 793	spin_lock(lock);
 794
 795	while (!list_empty(&tmp)) {
 796		bh = BH_ENTRY(tmp.prev);
 797		get_bh(bh);
 798		mapping = bh->b_assoc_map;
 799		__remove_assoc_queue(bh);
 800		/* Avoid race with mark_buffer_dirty_inode() which does
 801		 * a lockless check and we rely on seeing the dirty bit */
 802		smp_mb();
 803		if (buffer_dirty(bh)) {
 804			list_add(&bh->b_assoc_buffers,
 805				 &mapping->private_list);
 806			bh->b_assoc_map = mapping;
 807		}
 808		spin_unlock(lock);
 809		wait_on_buffer(bh);
 810		if (!buffer_uptodate(bh))
 811			err = -EIO;
 812		brelse(bh);
 813		spin_lock(lock);
 814	}
 815	
 816	spin_unlock(lock);
 817	err2 = osync_buffers_list(lock, list);
 818	if (err)
 819		return err;
 820	else
 821		return err2;
 822}
 823
 824/*
 825 * Invalidate any and all dirty buffers on a given inode.  We are
 826 * probably unmounting the fs, but that doesn't mean we have already
 827 * done a sync().  Just drop the buffers from the inode list.
 828 *
 829 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 830 * assumes that all the buffers are against the blockdev.  Not true
 831 * for reiserfs.
 832 */
 833void invalidate_inode_buffers(struct inode *inode)
 834{
 835	if (inode_has_buffers(inode)) {
 836		struct address_space *mapping = &inode->i_data;
 837		struct list_head *list = &mapping->private_list;
 838		struct address_space *buffer_mapping = mapping->assoc_mapping;
 839
 840		spin_lock(&buffer_mapping->private_lock);
 841		while (!list_empty(list))
 842			__remove_assoc_queue(BH_ENTRY(list->next));
 843		spin_unlock(&buffer_mapping->private_lock);
 844	}
 845}
 846EXPORT_SYMBOL(invalidate_inode_buffers);
 847
 848/*
 849 * Remove any clean buffers from the inode's buffer list.  This is called
 850 * when we're trying to free the inode itself.  Those buffers can pin it.
 851 *
 852 * Returns true if all buffers were removed.
 853 */
 854int remove_inode_buffers(struct inode *inode)
 855{
 856	int ret = 1;
 857
 858	if (inode_has_buffers(inode)) {
 859		struct address_space *mapping = &inode->i_data;
 860		struct list_head *list = &mapping->private_list;
 861		struct address_space *buffer_mapping = mapping->assoc_mapping;
 862
 863		spin_lock(&buffer_mapping->private_lock);
 864		while (!list_empty(list)) {
 865			struct buffer_head *bh = BH_ENTRY(list->next);
 866			if (buffer_dirty(bh)) {
 867				ret = 0;
 868				break;
 869			}
 870			__remove_assoc_queue(bh);
 871		}
 872		spin_unlock(&buffer_mapping->private_lock);
 873	}
 874	return ret;
 875}
 876
 877/*
 878 * Create the appropriate buffers when given a page for data area and
 879 * the size of each buffer.. Use the bh->b_this_page linked list to
 880 * follow the buffers created.  Return NULL if unable to create more
 881 * buffers.
 882 *
 883 * The retry flag is used to differentiate async IO (paging, swapping)
 884 * which may not fail from ordinary buffer allocations.
 885 */
 886struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 887		int retry)
 888{
 889	struct buffer_head *bh, *head;
 890	long offset;
 
 
 
 
 
 891
 892try_again:
 893	head = NULL;
 894	offset = PAGE_SIZE;
 895	while ((offset -= size) >= 0) {
 896		bh = alloc_buffer_head(GFP_NOFS);
 897		if (!bh)
 898			goto no_grow;
 899
 900		bh->b_bdev = NULL;
 901		bh->b_this_page = head;
 902		bh->b_blocknr = -1;
 903		head = bh;
 904
 905		bh->b_state = 0;
 906		atomic_set(&bh->b_count, 0);
 907		bh->b_size = size;
 908
 909		/* Link the buffer to its page */
 910		set_bh_page(bh, page, offset);
 911
 912		init_buffer(bh, NULL, NULL);
 913	}
 
 
 914	return head;
 915/*
 916 * In case anything failed, we just free everything we got.
 917 */
 918no_grow:
 919	if (head) {
 920		do {
 921			bh = head;
 922			head = head->b_this_page;
 923			free_buffer_head(bh);
 924		} while (head);
 925	}
 926
 927	/*
 928	 * Return failure for non-async IO requests.  Async IO requests
 929	 * are not allowed to fail, so we have to wait until buffer heads
 930	 * become available.  But we don't want tasks sleeping with 
 931	 * partially complete buffers, so all were released above.
 932	 */
 933	if (!retry)
 934		return NULL;
 935
 936	/* We're _really_ low on memory. Now we just
 937	 * wait for old buffer heads to become free due to
 938	 * finishing IO.  Since this is an async request and
 939	 * the reserve list is empty, we're sure there are 
 940	 * async buffer heads in use.
 941	 */
 942	free_more_memory();
 943	goto try_again;
 944}
 945EXPORT_SYMBOL_GPL(alloc_page_buffers);
 946
 947static inline void
 948link_dev_buffers(struct page *page, struct buffer_head *head)
 949{
 950	struct buffer_head *bh, *tail;
 951
 952	bh = head;
 953	do {
 954		tail = bh;
 955		bh = bh->b_this_page;
 956	} while (bh);
 957	tail->b_this_page = head;
 958	attach_page_buffers(page, head);
 
 
 
 
 
 
 
 
 
 
 
 
 959}
 960
 961/*
 962 * Initialise the state of a blockdev page's buffers.
 963 */ 
 964static void
 965init_page_buffers(struct page *page, struct block_device *bdev,
 966			sector_t block, int size)
 967{
 968	struct buffer_head *head = page_buffers(page);
 969	struct buffer_head *bh = head;
 970	int uptodate = PageUptodate(page);
 
 
 971
 972	do {
 973		if (!buffer_mapped(bh)) {
 974			init_buffer(bh, NULL, NULL);
 
 975			bh->b_bdev = bdev;
 976			bh->b_blocknr = block;
 977			if (uptodate)
 978				set_buffer_uptodate(bh);
 979			set_buffer_mapped(bh);
 
 980		}
 981		block++;
 982		bh = bh->b_this_page;
 983	} while (bh != head);
 
 
 
 
 
 984}
 985
 986/*
 987 * Create the page-cache page that contains the requested block.
 
 
 988 *
 989 * This is user purely for blockdev mappings.
 
 990 */
 991static struct page *
 992grow_dev_page(struct block_device *bdev, sector_t block,
 993		pgoff_t index, int size)
 994{
 995	struct inode *inode = bdev->bd_inode;
 996	struct page *page;
 997	struct buffer_head *bh;
 
 998
 999	page = find_or_create_page(inode->i_mapping, index,
1000		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1001	if (!page)
1002		return NULL;
1003
1004	BUG_ON(!PageLocked(page));
 
 
 
 
 
1005
1006	if (page_has_buffers(page)) {
1007		bh = page_buffers(page);
1008		if (bh->b_size == size) {
1009			init_page_buffers(page, bdev, block, size);
1010			return page;
 
 
 
 
 
1011		}
1012		if (!try_to_free_buffers(page))
1013			goto failed;
1014	}
1015
1016	/*
1017	 * Allocate some buffers for this page
1018	 */
1019	bh = alloc_page_buffers(page, size, 0);
1020	if (!bh)
1021		goto failed;
1022
1023	/*
1024	 * Link the page to the buffers and initialise them.  Take the
1025	 * lock to be atomic wrt __find_get_block(), which does not
1026	 * run under the page lock.
1027	 */
1028	spin_lock(&inode->i_mapping->private_lock);
1029	link_dev_buffers(page, bh);
1030	init_page_buffers(page, bdev, block, size);
1031	spin_unlock(&inode->i_mapping->private_lock);
1032	return page;
1033
1034failed:
1035	BUG();
1036	unlock_page(page);
1037	page_cache_release(page);
1038	return NULL;
1039}
1040
1041/*
1042 * Create buffers for the specified block device block's page.  If
1043 * that page was dirty, the buffers are set dirty also.
 
1044 */
1045static int
1046grow_buffers(struct block_device *bdev, sector_t block, int size)
1047{
1048	struct page *page;
1049	pgoff_t index;
1050	int sizebits;
1051
1052	sizebits = -1;
1053	do {
1054		sizebits++;
1055	} while ((size << sizebits) < PAGE_SIZE);
1056
1057	index = block >> sizebits;
1058
1059	/*
1060	 * Check for a block which wants to lie outside our maximum possible
1061	 * pagecache index.  (this comparison is done using sector_t types).
1062	 */
1063	if (unlikely(index != block >> sizebits)) {
1064		char b[BDEVNAME_SIZE];
1065
1066		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1067			"device %s\n",
1068			__func__, (unsigned long long)block,
1069			bdevname(bdev, b));
1070		return -EIO;
1071	}
1072	block = index << sizebits;
1073	/* Create a page with the proper size buffers.. */
1074	page = grow_dev_page(bdev, block, index, size);
1075	if (!page)
1076		return 0;
1077	unlock_page(page);
1078	page_cache_release(page);
1079	return 1;
1080}
1081
1082static struct buffer_head *
1083__getblk_slow(struct block_device *bdev, sector_t block, int size)
 
1084{
1085	/* Size must be multiple of hard sectorsize */
1086	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087			(size < 512 || size > PAGE_SIZE))) {
1088		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089					size);
1090		printk(KERN_ERR "logical block size: %d\n",
1091					bdev_logical_block_size(bdev));
1092
1093		dump_stack();
1094		return NULL;
1095	}
1096
1097	for (;;) {
1098		struct buffer_head * bh;
1099		int ret;
1100
1101		bh = __find_get_block(bdev, block, size);
1102		if (bh)
1103			return bh;
1104
1105		ret = grow_buffers(bdev, block, size);
1106		if (ret < 0)
1107			return NULL;
1108		if (ret == 0)
1109			free_more_memory();
1110	}
1111}
1112
1113/*
1114 * The relationship between dirty buffers and dirty pages:
1115 *
1116 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117 * the page is tagged dirty in its radix tree.
1118 *
1119 * At all times, the dirtiness of the buffers represents the dirtiness of
1120 * subsections of the page.  If the page has buffers, the page dirty bit is
1121 * merely a hint about the true dirty state.
1122 *
1123 * When a page is set dirty in its entirety, all its buffers are marked dirty
1124 * (if the page has buffers).
1125 *
1126 * When a buffer is marked dirty, its page is dirtied, but the page's other
1127 * buffers are not.
1128 *
1129 * Also.  When blockdev buffers are explicitly read with bread(), they
1130 * individually become uptodate.  But their backing page remains not
1131 * uptodate - even if all of its buffers are uptodate.  A subsequent
1132 * block_read_full_page() against that page will discover all the uptodate
1133 * buffers, will set the page uptodate and will perform no I/O.
1134 */
1135
1136/**
1137 * mark_buffer_dirty - mark a buffer_head as needing writeout
1138 * @bh: the buffer_head to mark dirty
1139 *
1140 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141 * backing page dirty, then tag the page as dirty in its address_space's radix
1142 * tree and then attach the address_space's inode to its superblock's dirty
1143 * inode list.
1144 *
1145 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1146 * mapping->tree_lock and mapping->host->i_lock.
1147 */
1148void mark_buffer_dirty(struct buffer_head *bh)
1149{
1150	WARN_ON_ONCE(!buffer_uptodate(bh));
1151
 
 
1152	/*
1153	 * Very *carefully* optimize the it-is-already-dirty case.
1154	 *
1155	 * Don't let the final "is it dirty" escape to before we
1156	 * perhaps modified the buffer.
1157	 */
1158	if (buffer_dirty(bh)) {
1159		smp_mb();
1160		if (buffer_dirty(bh))
1161			return;
1162	}
1163
1164	if (!test_set_buffer_dirty(bh)) {
1165		struct page *page = bh->b_page;
1166		if (!TestSetPageDirty(page)) {
1167			struct address_space *mapping = page_mapping(page);
 
 
1168			if (mapping)
1169				__set_page_dirty(page, mapping, 0);
1170		}
 
 
1171	}
1172}
1173EXPORT_SYMBOL(mark_buffer_dirty);
1174
1175/*
1176 * Decrement a buffer_head's reference count.  If all buffers against a page
1177 * have zero reference count, are clean and unlocked, and if the page is clean
1178 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1179 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1180 * a page but it ends up not being freed, and buffers may later be reattached).
 
 
 
 
 
 
 
 
 
 
 
 
1181 */
1182void __brelse(struct buffer_head * buf)
1183{
1184	if (atomic_read(&buf->b_count)) {
1185		put_bh(buf);
1186		return;
1187	}
1188	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1189}
1190EXPORT_SYMBOL(__brelse);
1191
1192/*
1193 * bforget() is like brelse(), except it discards any
1194 * potentially dirty data.
 
 
 
1195 */
1196void __bforget(struct buffer_head *bh)
1197{
1198	clear_buffer_dirty(bh);
1199	if (bh->b_assoc_map) {
1200		struct address_space *buffer_mapping = bh->b_page->mapping;
1201
1202		spin_lock(&buffer_mapping->private_lock);
1203		list_del_init(&bh->b_assoc_buffers);
1204		bh->b_assoc_map = NULL;
1205		spin_unlock(&buffer_mapping->private_lock);
1206	}
1207	__brelse(bh);
1208}
1209EXPORT_SYMBOL(__bforget);
1210
1211static struct buffer_head *__bread_slow(struct buffer_head *bh)
1212{
1213	lock_buffer(bh);
1214	if (buffer_uptodate(bh)) {
1215		unlock_buffer(bh);
1216		return bh;
1217	} else {
1218		get_bh(bh);
1219		bh->b_end_io = end_buffer_read_sync;
1220		submit_bh(READ, bh);
1221		wait_on_buffer(bh);
1222		if (buffer_uptodate(bh))
1223			return bh;
1224	}
1225	brelse(bh);
1226	return NULL;
1227}
1228
1229/*
1230 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1231 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1232 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1233 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1234 * CPU's LRUs at the same time.
1235 *
1236 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1237 * sb_find_get_block().
1238 *
1239 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1240 * a local interrupt disable for that.
1241 */
1242
/* Number of buffer_heads remembered in each CPU's LRU. */
#define BH_LRU_SIZE	8

/* One CPU's LRU: a fixed-size array of buffer_head pointers. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-cpu LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * Protect the current CPU's LRU while it is being read or rewritten:
 * local interrupts are disabled on SMP builds, preemption is disabled
 * otherwise.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1258
/*
 * Sanity check: BUG if interrupts are disabled.  Compiles to nothing when
 * irqs_disabled is not available as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1265
1266/*
1267 * The LRU management algorithm is dopey-but-simple.  Sorry.
 
 
1268 */
1269static void bh_lru_install(struct buffer_head *bh)
1270{
1271	struct buffer_head *evictee = NULL;
 
 
1272
1273	check_irqs_on();
1274	bh_lru_lock();
1275	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1276		struct buffer_head *bhs[BH_LRU_SIZE];
1277		int in;
1278		int out = 0;
1279
1280		get_bh(bh);
1281		bhs[out++] = bh;
1282		for (in = 0; in < BH_LRU_SIZE; in++) {
1283			struct buffer_head *bh2 =
1284				__this_cpu_read(bh_lrus.bhs[in]);
 
 
 
 
 
1285
1286			if (bh2 == bh) {
1287				__brelse(bh2);
1288			} else {
1289				if (out >= BH_LRU_SIZE) {
1290					BUG_ON(evictee != NULL);
1291					evictee = bh2;
1292				} else {
1293					bhs[out++] = bh2;
1294				}
1295			}
1296		}
1297		while (out < BH_LRU_SIZE)
1298			bhs[out++] = NULL;
1299		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1300	}
 
 
1301	bh_lru_unlock();
1302
1303	if (evictee)
1304		__brelse(evictee);
1305}
1306
1307/*
1308 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1309 */
1310static struct buffer_head *
1311lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1312{
1313	struct buffer_head *ret = NULL;
1314	unsigned int i;
1315
1316	check_irqs_on();
1317	bh_lru_lock();
 
 
 
 
1318	for (i = 0; i < BH_LRU_SIZE; i++) {
1319		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1320
1321		if (bh && bh->b_bdev == bdev &&
1322				bh->b_blocknr == block && bh->b_size == size) {
1323			if (i) {
1324				while (i) {
1325					__this_cpu_write(bh_lrus.bhs[i],
1326						__this_cpu_read(bh_lrus.bhs[i - 1]));
1327					i--;
1328				}
1329				__this_cpu_write(bh_lrus.bhs[0], bh);
1330			}
1331			get_bh(bh);
1332			ret = bh;
1333			break;
1334		}
1335	}
1336	bh_lru_unlock();
1337	return ret;
1338}
1339
1340/*
1341 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1342 * it in the LRU and mark it as accessed.  If it is not present then return
1343 * NULL
1344 */
1345struct buffer_head *
1346__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1347{
1348	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1349
1350	if (bh == NULL) {
 
1351		bh = __find_get_block_slow(bdev, block);
1352		if (bh)
1353			bh_lru_install(bh);
1354	}
1355	if (bh)
1356		touch_buffer(bh);
 
1357	return bh;
1358}
1359EXPORT_SYMBOL(__find_get_block);
1360
1361/*
1362 * __getblk will locate (and, if necessary, create) the buffer_head
1363 * which corresponds to the passed block_device, block and size. The
1364 * returned buffer has its reference count incremented.
1365 *
1366 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1367 * illegal block number, __getblk() will happily return a buffer_head
1368 * which represents the non-existent block.  Very weird.
 
 
 
1369 *
1370 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1371 * attempt is failing.  FIXME, perhaps?
1372 */
1373struct buffer_head *
1374__getblk(struct block_device *bdev, sector_t block, unsigned size)
1375{
1376	struct buffer_head *bh = __find_get_block(bdev, block, size);
1377
1378	might_sleep();
1379	if (bh == NULL)
1380		bh = __getblk_slow(bdev, block, size);
1381	return bh;
 
1382}
1383EXPORT_SYMBOL(__getblk);
1384
1385/*
1386 * Do async read-ahead on a buffer..
1387 */
1388void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1389{
1390	struct buffer_head *bh = __getblk(bdev, block, size);
 
 
1391	if (likely(bh)) {
1392		ll_rw_block(READA, 1, &bh);
1393		brelse(bh);
1394	}
1395}
1396EXPORT_SYMBOL(__breadahead);
1397
1398/**
1399 *  __bread() - reads a specified block and returns the bh
1400 *  @bdev: the block_device to read from
1401 *  @block: number of block
1402 *  @size: size (in bytes) to read
1403 * 
1404 *  Reads a specified block, and returns buffer head that contains it.
1405 *  It returns NULL if the block was unreadable.
 
 
 
 
 
 
 
 
 
 
 
 
 
1406 */
1407struct buffer_head *
1408__bread(struct block_device *bdev, sector_t block, unsigned size)
1409{
1410	struct buffer_head *bh = __getblk(bdev, block, size);
 
 
 
 
 
 
 
 
 
 
1411
1412	if (likely(bh) && !buffer_uptodate(bh))
1413		bh = __bread_slow(bh);
1414	return bh;
1415}
1416EXPORT_SYMBOL(__bread);
1417
 
 
 
 
 
 
 
 
 
1418/*
1419 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1420 * This doesn't race because it runs in each cpu either in irq
1421 * or with preempt disabled.
1422 */
1423static void invalidate_bh_lru(void *arg)
1424{
1425	struct bh_lru *b = &get_cpu_var(bh_lrus);
 
 
 
 
 
 
 
 
1426	int i;
1427
1428	for (i = 0; i < BH_LRU_SIZE; i++) {
1429		brelse(b->bhs[i]);
1430		b->bhs[i] = NULL;
1431	}
1432	put_cpu_var(bh_lrus);
 
1433}
1434	
/*
 * Run invalidate_bh_lru() on every CPU so that each per-cpu buffer LRU
 * is emptied.
 * NOTE(review): the final argument (1) is presumably the "wait for
 * completion" flag of on_each_cpu() - confirm against its prototype.
 */
void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
1439EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1440
1441void set_bh_page(struct buffer_head *bh,
1442		struct page *page, unsigned long offset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1443{
1444	bh->b_page = page;
1445	BUG_ON(offset >= PAGE_SIZE);
1446	if (PageHighMem(page))
1447		/*
1448		 * This catches illegal uses and preserves the offset:
1449		 */
1450		bh->b_data = (char *)(0 + offset);
1451	else
1452		bh->b_data = page_address(page) + offset;
1453}
1454EXPORT_SYMBOL(set_bh_page);
1455
/*
 * Called when truncating a buffer on a page completely.
 *
 * With the buffer locked, clears the dirty, mapped, req, new, delay and
 * unwritten state bits and forgets the block device, so the buffer no
 * longer describes any on-disk block.
 */
static void discard_buffer(struct buffer_head * bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	unlock_buffer(bh);
}
1471
1472/**
1473 * block_invalidatepage - invalidate part of all of a buffer-backed page
 
 
 
1474 *
1475 * @page: the page which is affected
1476 * @offset: the index of the truncation point
1477 *
1478 * block_invalidatepage() is called when all or part of the page has become
1479 * invalidatedby a truncate operation.
1480 *
1481 * block_invalidatepage() does not have to release all buffers, but it must
1482 * ensure that no dirty buffer is left outside @offset and that no I/O
1483 * is underway against any of the blocks which are outside the truncation
1484 * point.  Because the caller is about to free (and possibly reuse) those
1485 * blocks on-disk.
1486 */
1487void block_invalidatepage(struct page *page, unsigned long offset)
1488{
1489	struct buffer_head *head, *bh, *next;
1490	unsigned int curr_off = 0;
 
 
 
 
 
 
 
 
1491
1492	BUG_ON(!PageLocked(page));
1493	if (!page_has_buffers(page))
1494		goto out;
1495
1496	head = page_buffers(page);
1497	bh = head;
1498	do {
1499		unsigned int next_off = curr_off + bh->b_size;
1500		next = bh->b_this_page;
1501
1502		/*
 
 
 
 
 
 
1503		 * is this block fully invalidated?
1504		 */
1505		if (offset <= curr_off)
1506			discard_buffer(bh);
1507		curr_off = next_off;
1508		bh = next;
1509	} while (bh != head);
1510
1511	/*
1512	 * We release buffers only if the entire page is being invalidated.
1513	 * The get_block cached value has been unconditionally invalidated,
1514	 * so real IO is not possible anymore.
1515	 */
1516	if (offset == 0)
1517		try_to_release_page(page, 0);
1518out:
 
1519	return;
1520}
1521EXPORT_SYMBOL(block_invalidatepage);
1522
1523/*
1524 * We attach and possibly dirty the buffers atomically wrt
1525 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1526 * is already excluded via the page lock.
1527 */
1528void create_empty_buffers(struct page *page,
1529			unsigned long blocksize, unsigned long b_state)
1530{
1531	struct buffer_head *bh, *head, *tail;
 
1532
1533	head = alloc_page_buffers(page, blocksize, 1);
1534	bh = head;
1535	do {
1536		bh->b_state |= b_state;
1537		tail = bh;
1538		bh = bh->b_this_page;
1539	} while (bh);
1540	tail->b_this_page = head;
1541
1542	spin_lock(&page->mapping->private_lock);
1543	if (PageUptodate(page) || PageDirty(page)) {
1544		bh = head;
1545		do {
1546			if (PageDirty(page))
1547				set_buffer_dirty(bh);
1548			if (PageUptodate(page))
1549				set_buffer_uptodate(bh);
1550			bh = bh->b_this_page;
1551		} while (bh != head);
1552	}
1553	attach_page_buffers(page, head);
1554	spin_unlock(&page->mapping->private_lock);
 
 
1555}
1556EXPORT_SYMBOL(create_empty_buffers);
1557
1558/*
1559 * We are taking a block for data and we don't want any output from any
1560 * buffer-cache aliases starting from return from that function and
1561 * until the moment when something will explicitly mark the buffer
1562 * dirty (hopefully that will not happen until we will free that block ;-)
1563 * We don't even need to mark it not-uptodate - nobody can expect
1564 * anything from a newly allocated buffer anyway. We used to used
1565 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1566 * don't want to mark the alias unmapped, for example - it would confuse
1567 * anyone who might pick it with bread() afterwards...
1568 *
1569 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1570 * be writeout I/O going on against recently-freed buffers.  We don't
1571 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1572 * only if we really need to.  That happens here.
1573 */
1574void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1575{
1576	struct buffer_head *old_bh;
1577
1578	might_sleep();
1579
1580	old_bh = __find_get_block_slow(bdev, block);
1581	if (old_bh) {
1582		clear_buffer_dirty(old_bh);
1583		wait_on_buffer(old_bh);
1584		clear_buffer_req(old_bh);
1585		__brelse(old_bh);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1586	}
1587}
1588EXPORT_SYMBOL(unmap_underlying_metadata);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1589
1590/*
1591 * NOTE! All mapped/uptodate combinations are valid:
1592 *
1593 *	Mapped	Uptodate	Meaning
1594 *
1595 *	No	No		"unknown" - must do get_block()
1596 *	No	Yes		"hole" - zero-filled
1597 *	Yes	No		"allocated" - allocated on disk, not read in
1598 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1599 *
1600 * "Dirty" is valid only with the last case (mapped+uptodate).
1601 */
1602
/*
 * Write out the dirty buffers of a locked page.
 *
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 *
 * @get_block is called to map dirty buffers that are unmapped or delayed;
 * @handler is installed as the I/O completion handler of every buffer
 * that is submitted.  The page is unlocked before returning.
 *
 * Returns 0, or the error returned by @get_block.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	const unsigned blocksize = 1 << inode->i_blkbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC : WRITE);

	BUG_ON(!PageLocked(page));

	/* Index of the block that holds the last byte of the file. */
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

	/* A page without buffers gets them all, dirty and uptodate. */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	/* Block number of the page's first buffer. */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * Second pass (bh is back at head): lock each mapped buffer and tag
	 * the dirty ones for async write.  "continue" jumps to the loop
	 * condition, which advances bh.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	/*
	 * Third pass: submit everything tagged above.  The next pointer is
	 * read before submit_bh() is called on the buffer.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	/* Record the failure on both the page and its mapping. */
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	/* err still holds the get_block() failure and is returned at done. */
	goto done;
}
 
1784
1785/*
1786 * If a page has any new buffers, zero them out here, and mark them uptodate
1787 * and dirty so they'll be written out (in order to prevent uninitialised
1788 * block data from leaking). And clear the new bit.
1789 */
1790void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1791{
1792	unsigned int block_start, block_end;
1793	struct buffer_head *head, *bh;
1794
1795	BUG_ON(!PageLocked(page));
1796	if (!page_has_buffers(page))
 
1797		return;
1798
1799	bh = head = page_buffers(page);
1800	block_start = 0;
1801	do {
1802		block_end = block_start + bh->b_size;
1803
1804		if (buffer_new(bh)) {
1805			if (block_end > from && block_start < to) {
1806				if (!PageUptodate(page)) {
1807					unsigned start, size;
1808
1809					start = max(from, block_start);
1810					size = min(to, block_end) - start;
1811
1812					zero_user(page, start, size);
1813					set_buffer_uptodate(bh);
1814				}
1815
1816				clear_buffer_new(bh);
1817				mark_buffer_dirty(bh);
1818			}
1819		}
1820
1821		block_start = block_end;
1822		bh = bh->b_this_page;
1823	} while (bh != head);
1824}
1825EXPORT_SYMBOL(page_zero_new_buffers);
1826
/*
 * Prepare the buffers of a locked page for a write of @len bytes starting
 * at file position @pos.
 *
 * Buffers overlapping the write range are mapped with @get_block if
 * needed.  Newly allocated blocks have the parts outside the write range
 * zeroed; existing blocks that are only partially overwritten and not
 * uptodate are read in, and the reads are waited for before returning.
 *
 * Returns 0 on success, the error from @get_block, or -EIO if a read
 * failed.  On error, page_zero_new_buffers() is run over the write range.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	/* Byte range [from, to) of the write within the page. */
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	/*
	 * wait[] collects the buffers a read was issued for.
	 * NOTE(review): there is room for two entries and no bounds check
	 * below; presumably only the first and last block of the range can
	 * be partially covered - confirm.
	 */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_CACHE_SIZE);
	BUG_ON(to > PAGE_CACHE_SIZE);
	BUG_ON(from > to);

	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	bbits = inode->i_blkbits;
	/* Block number of the page's first buffer. */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

	/*
	 * Walk the buffer ring once.  "!block_start" lets the first
	 * iteration run even though bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		/* Buffer entirely outside the write range. */
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/*
				 * Zero the parts of the new block that lie
				 * outside the write range.
				 */
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue; 
		}
		/*
		 * Partially overwritten block whose contents are not in
		 * memory yet: start a read for it.
		 */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}
 
 
 
 
 
 
1909EXPORT_SYMBOL(__block_write_begin);
1910
1911static int __block_commit_write(struct inode *inode, struct page *page,
1912		unsigned from, unsigned to)
1913{
1914	unsigned block_start, block_end;
1915	int partial = 0;
1916	unsigned blocksize;
1917	struct buffer_head *bh, *head;
1918
1919	blocksize = 1 << inode->i_blkbits;
 
 
 
1920
1921	for(bh = head = page_buffers(page), block_start = 0;
1922	    bh != head || !block_start;
1923	    block_start=block_end, bh = bh->b_this_page) {
1924		block_end = block_start + blocksize;
1925		if (block_end <= from || block_start >= to) {
1926			if (!buffer_uptodate(bh))
1927				partial = 1;
1928		} else {
1929			set_buffer_uptodate(bh);
1930			mark_buffer_dirty(bh);
1931		}
1932		clear_buffer_new(bh);
1933	}
 
 
 
 
1934
1935	/*
1936	 * If this is a partial write which happened to make all buffers
1937	 * uptodate then we can optimize away a bogus readpage() for
1938	 * the next read(). Here we 'discover' whether the page went
1939	 * uptodate as a result of this (potentially partial) write.
1940	 */
1941	if (!partial)
1942		SetPageUptodate(page);
1943	return 0;
1944}
1945
1946/*
1947 * block_write_begin takes care of the basic task of block allocation and
1948 * bringing partial write blocks uptodate first.
1949 *
1950 * The filesystem needs to handle block truncation upon failure.
1951 */
1952int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1953		unsigned flags, struct page **pagep, get_block_t *get_block)
1954{
1955	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1956	struct page *page;
1957	int status;
1958
1959	page = grab_cache_page_write_begin(mapping, index, flags);
1960	if (!page)
1961		return -ENOMEM;
 
1962
1963	status = __block_write_begin(page, pos, len, get_block);
1964	if (unlikely(status)) {
1965		unlock_page(page);
1966		page_cache_release(page);
1967		page = NULL;
1968	}
1969
1970	*pagep = page;
1971	return status;
1972}
1973EXPORT_SYMBOL(block_write_begin);
1974
1975int block_write_end(struct file *file, struct address_space *mapping,
1976			loff_t pos, unsigned len, unsigned copied,
1977			struct page *page, void *fsdata)
1978{
1979	struct inode *inode = mapping->host;
1980	unsigned start;
1981
1982	start = pos & (PAGE_CACHE_SIZE - 1);
1983
1984	if (unlikely(copied < len)) {
1985		/*
1986		 * The buffers that were written will now be uptodate, so we
1987		 * don't have to worry about a readpage reading them and
1988		 * overwriting a partial write. However if we have encountered
1989		 * a short write and only partially written into a buffer, it
1990		 * will not be marked uptodate, so a readpage might come in and
1991		 * destroy our partial write.
1992		 *
1993		 * Do the simplest thing, and just treat any short write to a
1994		 * non uptodate page as a zero-length write, and force the
1995		 * caller to redo the whole thing.
1996		 */
1997		if (!PageUptodate(page))
1998			copied = 0;
1999
2000		page_zero_new_buffers(page, start+copied, start+len);
2001	}
2002	flush_dcache_page(page);
2003
2004	/* This could be a short (even 0-length) commit */
2005	__block_commit_write(inode, page, start, start+copied);
2006
2007	return copied;
2008}
2009EXPORT_SYMBOL(block_write_end);
2010
2011int generic_write_end(struct file *file, struct address_space *mapping,
2012			loff_t pos, unsigned len, unsigned copied,
2013			struct page *page, void *fsdata)
2014{
2015	struct inode *inode = mapping->host;
2016	int i_size_changed = 0;
 
2017
2018	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2019
2020	/*
2021	 * No need to use i_size_read() here, the i_size
2022	 * cannot change under us because we hold i_mutex.
2023	 *
2024	 * But it's important to update i_size while still holding page lock:
2025	 * page writeout could otherwise come in and zero beyond i_size.
2026	 */
2027	if (pos+copied > inode->i_size) {
2028		i_size_write(inode, pos+copied);
2029		i_size_changed = 1;
2030	}
2031
2032	unlock_page(page);
2033	page_cache_release(page);
2034
 
 
2035	/*
2036	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2037	 * makes the holding time of page lock longer. Second, it forces lock
2038	 * ordering of page lock and transaction start for journaling
2039	 * filesystems.
2040	 */
2041	if (i_size_changed)
2042		mark_inode_dirty(inode);
2043
2044	return copied;
2045}
2046EXPORT_SYMBOL(generic_write_end);
2047
2048/*
2049 * block_is_partially_uptodate checks whether buffers within a page are
2050 * uptodate or not.
2051 *
2052 * Returns true if all buffers which correspond to a file portion
2053 * we want to read are uptodate.
2054 */
2055int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2056					unsigned long from)
2057{
2058	struct inode *inode = page->mapping->host;
2059	unsigned block_start, block_end, blocksize;
2060	unsigned to;
2061	struct buffer_head *bh, *head;
2062	int ret = 1;
2063
2064	if (!page_has_buffers(page))
2065		return 0;
2066
2067	blocksize = 1 << inode->i_blkbits;
2068	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2069	to = from + to;
2070	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2071		return 0;
2072
2073	head = page_buffers(page);
2074	bh = head;
2075	block_start = 0;
2076	do {
2077		block_end = block_start + blocksize;
2078		if (block_end > from && block_start < to) {
2079			if (!buffer_uptodate(bh)) {
2080				ret = 0;
2081				break;
2082			}
2083			if (block_end >= to)
2084				break;
2085		}
2086		block_start = block_end;
2087		bh = bh->b_this_page;
2088	} while (bh != head);
2089
2090	return ret;
2091}
2092EXPORT_SYMBOL(block_is_partially_uptodate);
2093
2094/*
2095 * Generic "read page" function for block devices that have the normal
2096 * get_block functionality. This is most of the block device filesystems.
2097 * Reads the page asynchronously --- the unlock_buffer() and
2098 * set/clear_buffer_uptodate() functions propagate buffer state into the
2099 * page struct once IO has completed.
2100 */
2101int block_read_full_page(struct page *page, get_block_t *get_block)
2102{
2103	struct inode *inode = page->mapping->host;
2104	sector_t iblock, lblock;
2105	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2106	unsigned int blocksize;
2107	int nr, i;
2108	int fully_mapped = 1;
 
 
 
 
 
 
 
 
2109
2110	BUG_ON(!PageLocked(page));
2111	blocksize = 1 << inode->i_blkbits;
2112	if (!page_has_buffers(page))
2113		create_empty_buffers(page, blocksize, 0);
2114	head = page_buffers(page);
2115
2116	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2117	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2118	bh = head;
2119	nr = 0;
2120	i = 0;
2121
2122	do {
2123		if (buffer_uptodate(bh))
2124			continue;
2125
2126		if (!buffer_mapped(bh)) {
2127			int err = 0;
2128
2129			fully_mapped = 0;
2130			if (iblock < lblock) {
2131				WARN_ON(bh->b_size != blocksize);
2132				err = get_block(inode, iblock, bh, 0);
2133				if (err)
2134					SetPageError(page);
2135			}
2136			if (!buffer_mapped(bh)) {
2137				zero_user(page, i * blocksize, blocksize);
 
2138				if (!err)
2139					set_buffer_uptodate(bh);
2140				continue;
2141			}
2142			/*
2143			 * get_block() might have updated the buffer
2144			 * synchronously
2145			 */
2146			if (buffer_uptodate(bh))
2147				continue;
2148		}
2149		arr[nr++] = bh;
2150	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2151
2152	if (fully_mapped)
2153		SetPageMappedToDisk(page);
2154
2155	if (!nr) {
2156		/*
2157		 * All buffers are uptodate - we can set the page uptodate
2158		 * as well. But not if get_block() returned an error.
2159		 */
2160		if (!PageError(page))
2161			SetPageUptodate(page);
2162		unlock_page(page);
2163		return 0;
2164	}
2165
2166	/* Stage two: lock the buffers */
2167	for (i = 0; i < nr; i++) {
2168		bh = arr[i];
2169		lock_buffer(bh);
2170		mark_buffer_async_read(bh);
2171	}
2172
2173	/*
2174	 * Stage 3: start the IO.  Check for uptodateness
2175	 * inside the buffer lock in case another process reading
2176	 * the underlying blockdev brought it uptodate (the sct fix).
2177	 */
2178	for (i = 0; i < nr; i++) {
2179		bh = arr[i];
2180		if (buffer_uptodate(bh))
2181			end_buffer_async_read(bh, 1);
2182		else
2183			submit_bh(READ, bh);
2184	}
2185	return 0;
2186}
2187EXPORT_SYMBOL(block_read_full_page);
2188
2189/* utility function for filesystems that need to do work on expanding
2190 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2191 * deal with the hole.  
2192 */
2193int generic_cont_expand_simple(struct inode *inode, loff_t size)
2194{
2195	struct address_space *mapping = inode->i_mapping;
2196	struct page *page;
2197	void *fsdata;
 
2198	int err;
2199
2200	err = inode_newsize_ok(inode, size);
2201	if (err)
2202		goto out;
2203
2204	err = pagecache_write_begin(NULL, mapping, size, 0,
2205				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2206				&page, &fsdata);
2207	if (err)
2208		goto out;
2209
2210	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2211	BUG_ON(err > 0);
2212
2213out:
2214	return err;
2215}
2216EXPORT_SYMBOL(generic_cont_expand_simple);
2217
2218static int cont_expand_zero(struct file *file, struct address_space *mapping,
2219			    loff_t pos, loff_t *bytes)
2220{
2221	struct inode *inode = mapping->host;
2222	unsigned blocksize = 1 << inode->i_blkbits;
2223	struct page *page;
2224	void *fsdata;
 
2225	pgoff_t index, curidx;
2226	loff_t curpos;
2227	unsigned zerofrom, offset, len;
2228	int err = 0;
2229
2230	index = pos >> PAGE_CACHE_SHIFT;
2231	offset = pos & ~PAGE_CACHE_MASK;
2232
2233	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2234		zerofrom = curpos & ~PAGE_CACHE_MASK;
2235		if (zerofrom & (blocksize-1)) {
2236			*bytes |= (blocksize-1);
2237			(*bytes)++;
2238		}
2239		len = PAGE_CACHE_SIZE - zerofrom;
2240
2241		err = pagecache_write_begin(file, mapping, curpos, len,
2242						AOP_FLAG_UNINTERRUPTIBLE,
2243						&page, &fsdata);
2244		if (err)
2245			goto out;
2246		zero_user(page, zerofrom, len);
2247		err = pagecache_write_end(file, mapping, curpos, len, len,
2248						page, fsdata);
2249		if (err < 0)
2250			goto out;
2251		BUG_ON(err != len);
2252		err = 0;
2253
2254		balance_dirty_pages_ratelimited(mapping);
 
 
 
 
 
2255	}
2256
2257	/* page covers the boundary, find the boundary offset */
2258	if (index == curidx) {
2259		zerofrom = curpos & ~PAGE_CACHE_MASK;
2260		/* if we will expand the thing last block will be filled */
2261		if (offset <= zerofrom) {
2262			goto out;
2263		}
2264		if (zerofrom & (blocksize-1)) {
2265			*bytes |= (blocksize-1);
2266			(*bytes)++;
2267		}
2268		len = offset - zerofrom;
2269
2270		err = pagecache_write_begin(file, mapping, curpos, len,
2271						AOP_FLAG_UNINTERRUPTIBLE,
2272						&page, &fsdata);
2273		if (err)
2274			goto out;
2275		zero_user(page, zerofrom, len);
2276		err = pagecache_write_end(file, mapping, curpos, len, len,
2277						page, fsdata);
2278		if (err < 0)
2279			goto out;
2280		BUG_ON(err != len);
2281		err = 0;
2282	}
2283out:
2284	return err;
2285}
2286
2287/*
2288 * For moronic filesystems that do not allow holes in file.
2289 * We may have to extend the file.
2290 */
2291int cont_write_begin(struct file *file, struct address_space *mapping,
2292			loff_t pos, unsigned len, unsigned flags,
2293			struct page **pagep, void **fsdata,
2294			get_block_t *get_block, loff_t *bytes)
2295{
2296	struct inode *inode = mapping->host;
2297	unsigned blocksize = 1 << inode->i_blkbits;
2298	unsigned zerofrom;
2299	int err;
2300
2301	err = cont_expand_zero(file, mapping, pos, bytes);
2302	if (err)
2303		return err;
2304
2305	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2306	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2307		*bytes |= (blocksize-1);
2308		(*bytes)++;
2309	}
2310
2311	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2312}
2313EXPORT_SYMBOL(cont_write_begin);
2314
2315int block_commit_write(struct page *page, unsigned from, unsigned to)
2316{
2317	struct inode *inode = page->mapping->host;
2318	__block_commit_write(inode,page,from,to);
2319	return 0;
2320}
2321EXPORT_SYMBOL(block_commit_write);
2322
2323/*
2324 * block_page_mkwrite() is not allowed to change the file size as it gets
2325 * called from a page fault handler when a page is first dirtied. Hence we must
2326 * be careful to check for EOF conditions here. We set the page up correctly
2327 * for a written page which means we get ENOSPC checking when writing into
2328 * holes and correct delalloc and unwritten extent mapping on filesystems that
2329 * support these features.
2330 *
2331 * We are not allowed to take the i_mutex here so we have to play games to
2332 * protect against truncate races as the page could now be beyond EOF.  Because
2333 * truncate writes the inode size before removing pages, once we have the
2334 * page lock we can determine safely if the page is beyond EOF. If it is not
2335 * beyond EOF, then the page is guaranteed safe against truncation until we
2336 * unlock the page.
2337 *
2338 * Direct callers of this function should call vfs_check_frozen() so that page
2339 * fault does not busyloop until the fs is thawed.
2340 */
2341int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2342			 get_block_t get_block)
2343{
2344	struct page *page = vmf->page;
2345	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2346	unsigned long end;
2347	loff_t size;
2348	int ret;
2349
2350	lock_page(page);
2351	size = i_size_read(inode);
2352	if ((page->mapping != inode->i_mapping) ||
2353	    (page_offset(page) > size)) {
2354		/* We overload EFAULT to mean page got truncated */
2355		ret = -EFAULT;
2356		goto out_unlock;
2357	}
2358
2359	/* page is wholly or partially inside EOF */
2360	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2361		end = size & ~PAGE_CACHE_MASK;
2362	else
2363		end = PAGE_CACHE_SIZE;
 
 
 
2364
2365	ret = __block_write_begin(page, 0, end, get_block);
2366	if (!ret)
2367		ret = block_commit_write(page, 0, end);
2368
2369	if (unlikely(ret < 0))
2370		goto out_unlock;
2371	/*
2372	 * Freezing in progress? We check after the page is marked dirty and
2373	 * with page lock held so if the test here fails, we are sure freezing
2374	 * code will wait during syncing until the page fault is done - at that
2375	 * point page will be dirty and unlocked so freezing code will write it
2376	 * and writeprotect it again.
2377	 */
2378	set_page_dirty(page);
2379	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2380		ret = -EAGAIN;
2381		goto out_unlock;
2382	}
2383	wait_on_page_writeback(page);
2384	return 0;
2385out_unlock:
2386	unlock_page(page);
2387	return ret;
2388}
2389EXPORT_SYMBOL(__block_page_mkwrite);
2390
2391int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2392		   get_block_t get_block)
2393{
2394	int ret;
2395	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2396
2397	/*
2398	 * This check is racy but catches the common case. The check in
2399	 * __block_page_mkwrite() is reliable.
2400	 */
2401	vfs_check_frozen(sb, SB_FREEZE_WRITE);
2402	ret = __block_page_mkwrite(vma, vmf, get_block);
2403	return block_page_mkwrite_return(ret);
2404}
2405EXPORT_SYMBOL(block_page_mkwrite);
2406
/*
 * Read completion handler for the prereads issued by nobh_write_begin().
 *
 * Those buffer_heads are freed immediately, while still under the page
 * lock, so the handler must not touch the bh once it has been unlocked.
 * All the work is delegated to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2416
2417/*
2418 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2419 * the page (converting it to circular linked list and taking care of page
2420 * dirty races).
2421 */
2422static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2423{
2424	struct buffer_head *bh;
2425
2426	BUG_ON(!PageLocked(page));
2427
2428	spin_lock(&page->mapping->private_lock);
2429	bh = head;
2430	do {
2431		if (PageDirty(page))
2432			set_buffer_dirty(bh);
2433		if (!bh->b_this_page)
2434			bh->b_this_page = head;
2435		bh = bh->b_this_page;
2436	} while (bh != head);
2437	attach_page_buffers(page, head);
2438	spin_unlock(&page->mapping->private_lock);
2439}
2440
2441/*
2442 * On entry, the page is fully not uptodate.
2443 * On exit the page is fully uptodate in the areas outside (from,to)
2444 * The filesystem needs to handle block truncation upon failure.
2445 */
2446int nobh_write_begin(struct address_space *mapping,
2447			loff_t pos, unsigned len, unsigned flags,
2448			struct page **pagep, void **fsdata,
2449			get_block_t *get_block)
2450{
2451	struct inode *inode = mapping->host;
2452	const unsigned blkbits = inode->i_blkbits;
2453	const unsigned blocksize = 1 << blkbits;
2454	struct buffer_head *head, *bh;
2455	struct page *page;
2456	pgoff_t index;
2457	unsigned from, to;
2458	unsigned block_in_page;
2459	unsigned block_start, block_end;
2460	sector_t block_in_file;
2461	int nr_reads = 0;
2462	int ret = 0;
2463	int is_mapped_to_disk = 1;
2464
2465	index = pos >> PAGE_CACHE_SHIFT;
2466	from = pos & (PAGE_CACHE_SIZE - 1);
2467	to = from + len;
2468
2469	page = grab_cache_page_write_begin(mapping, index, flags);
2470	if (!page)
2471		return -ENOMEM;
2472	*pagep = page;
2473	*fsdata = NULL;
2474
2475	if (page_has_buffers(page)) {
2476		ret = __block_write_begin(page, pos, len, get_block);
2477		if (unlikely(ret))
2478			goto out_release;
2479		return ret;
2480	}
2481
2482	if (PageMappedToDisk(page))
2483		return 0;
2484
2485	/*
2486	 * Allocate buffers so that we can keep track of state, and potentially
2487	 * attach them to the page if an error occurs. In the common case of
2488	 * no error, they will just be freed again without ever being attached
2489	 * to the page (which is all OK, because we're under the page lock).
2490	 *
2491	 * Be careful: the buffer linked list is a NULL terminated one, rather
2492	 * than the circular one we're used to.
2493	 */
2494	head = alloc_page_buffers(page, blocksize, 0);
2495	if (!head) {
2496		ret = -ENOMEM;
2497		goto out_release;
2498	}
2499
2500	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2501
2502	/*
2503	 * We loop across all blocks in the page, whether or not they are
2504	 * part of the affected region.  This is so we can discover if the
2505	 * page is fully mapped-to-disk.
2506	 */
2507	for (block_start = 0, block_in_page = 0, bh = head;
2508		  block_start < PAGE_CACHE_SIZE;
2509		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2510		int create;
2511
2512		block_end = block_start + blocksize;
2513		bh->b_state = 0;
2514		create = 1;
2515		if (block_start >= to)
2516			create = 0;
2517		ret = get_block(inode, block_in_file + block_in_page,
2518					bh, create);
2519		if (ret)
2520			goto failed;
2521		if (!buffer_mapped(bh))
2522			is_mapped_to_disk = 0;
2523		if (buffer_new(bh))
2524			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2525		if (PageUptodate(page)) {
2526			set_buffer_uptodate(bh);
2527			continue;
2528		}
2529		if (buffer_new(bh) || !buffer_mapped(bh)) {
2530			zero_user_segments(page, block_start, from,
2531							to, block_end);
2532			continue;
2533		}
2534		if (buffer_uptodate(bh))
2535			continue;	/* reiserfs does this */
2536		if (block_start < from || block_end > to) {
2537			lock_buffer(bh);
2538			bh->b_end_io = end_buffer_read_nobh;
2539			submit_bh(READ, bh);
2540			nr_reads++;
2541		}
2542	}
2543
2544	if (nr_reads) {
2545		/*
2546		 * The page is locked, so these buffers are protected from
2547		 * any VM or truncate activity.  Hence we don't need to care
2548		 * for the buffer_head refcounts.
2549		 */
2550		for (bh = head; bh; bh = bh->b_this_page) {
2551			wait_on_buffer(bh);
2552			if (!buffer_uptodate(bh))
2553				ret = -EIO;
2554		}
2555		if (ret)
2556			goto failed;
2557	}
2558
2559	if (is_mapped_to_disk)
2560		SetPageMappedToDisk(page);
2561
2562	*fsdata = head; /* to be released by nobh_write_end */
2563
2564	return 0;
2565
2566failed:
2567	BUG_ON(!ret);
2568	/*
2569	 * Error recovery is a bit difficult. We need to zero out blocks that
2570	 * were newly allocated, and dirty them to ensure they get written out.
2571	 * Buffers need to be attached to the page at this point, otherwise
2572	 * the handling of potential IO errors during writeout would be hard
2573	 * (could try doing synchronous writeout, but what if that fails too?)
2574	 */
2575	attach_nobh_buffers(page, head);
2576	page_zero_new_buffers(page, from, to);
2577
2578out_release:
2579	unlock_page(page);
2580	page_cache_release(page);
2581	*pagep = NULL;
2582
2583	return ret;
2584}
2585EXPORT_SYMBOL(nobh_write_begin);
2586
2587int nobh_write_end(struct file *file, struct address_space *mapping,
2588			loff_t pos, unsigned len, unsigned copied,
2589			struct page *page, void *fsdata)
2590{
2591	struct inode *inode = page->mapping->host;
2592	struct buffer_head *head = fsdata;
2593	struct buffer_head *bh;
2594	BUG_ON(fsdata != NULL && page_has_buffers(page));
2595
2596	if (unlikely(copied < len) && head)
2597		attach_nobh_buffers(page, head);
2598	if (page_has_buffers(page))
2599		return generic_write_end(file, mapping, pos, len,
2600					copied, page, fsdata);
2601
2602	SetPageUptodate(page);
2603	set_page_dirty(page);
2604	if (pos+copied > inode->i_size) {
2605		i_size_write(inode, pos+copied);
2606		mark_inode_dirty(inode);
2607	}
2608
2609	unlock_page(page);
2610	page_cache_release(page);
2611
2612	while (head) {
2613		bh = head;
2614		head = head->b_this_page;
2615		free_buffer_head(bh);
2616	}
2617
2618	return copied;
2619}
2620EXPORT_SYMBOL(nobh_write_end);
2621
2622/*
2623 * nobh_writepage() - based on block_full_write_page() except
2624 * that it tries to operate without attaching bufferheads to
2625 * the page.
2626 */
2627int nobh_writepage(struct page *page, get_block_t *get_block,
2628			struct writeback_control *wbc)
2629{
2630	struct inode * const inode = page->mapping->host;
2631	loff_t i_size = i_size_read(inode);
2632	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2633	unsigned offset;
2634	int ret;
2635
2636	/* Is the page fully inside i_size? */
2637	if (page->index < end_index)
2638		goto out;
2639
2640	/* Is the page fully outside i_size? (truncate in progress) */
2641	offset = i_size & (PAGE_CACHE_SIZE-1);
2642	if (page->index >= end_index+1 || !offset) {
2643		/*
2644		 * The page may have dirty, unmapped buffers.  For example,
2645		 * they may have been added in ext3_writepage().  Make them
2646		 * freeable here, so the page does not leak.
2647		 */
2648#if 0
2649		/* Not really sure about this  - do we need this ? */
2650		if (page->mapping->a_ops->invalidatepage)
2651			page->mapping->a_ops->invalidatepage(page, offset);
2652#endif
2653		unlock_page(page);
2654		return 0; /* don't care */
2655	}
2656
2657	/*
2658	 * The page straddles i_size.  It must be zeroed out on each and every
2659	 * writepage invocation because it may be mmapped.  "A file is mapped
2660	 * in multiples of the page size.  For a file that is not a multiple of
2661	 * the  page size, the remaining memory is zeroed when mapped, and
2662	 * writes to that region are not written out to the file."
2663	 */
2664	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2665out:
2666	ret = mpage_writepage(page, get_block, wbc);
2667	if (ret == -EAGAIN)
2668		ret = __block_write_full_page(inode, page, get_block, wbc,
2669					      end_buffer_async_write);
2670	return ret;
2671}
2672EXPORT_SYMBOL(nobh_writepage);
2673
2674int nobh_truncate_page(struct address_space *mapping,
2675			loff_t from, get_block_t *get_block)
2676{
2677	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2678	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2679	unsigned blocksize;
2680	sector_t iblock;
2681	unsigned length, pos;
2682	struct inode *inode = mapping->host;
2683	struct page *page;
2684	struct buffer_head map_bh;
2685	int err;
2686
2687	blocksize = 1 << inode->i_blkbits;
2688	length = offset & (blocksize - 1);
2689
2690	/* Block boundary? Nothing to do */
2691	if (!length)
2692		return 0;
2693
2694	length = blocksize - length;
2695	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2696
2697	page = grab_cache_page(mapping, index);
2698	err = -ENOMEM;
2699	if (!page)
2700		goto out;
2701
2702	if (page_has_buffers(page)) {
2703has_buffers:
2704		unlock_page(page);
2705		page_cache_release(page);
2706		return block_truncate_page(mapping, from, get_block);
2707	}
2708
2709	/* Find the buffer that contains "offset" */
2710	pos = blocksize;
2711	while (offset >= pos) {
2712		iblock++;
2713		pos += blocksize;
2714	}
2715
2716	map_bh.b_size = blocksize;
2717	map_bh.b_state = 0;
2718	err = get_block(inode, iblock, &map_bh, 0);
2719	if (err)
2720		goto unlock;
2721	/* unmapped? It's a hole - nothing to do */
2722	if (!buffer_mapped(&map_bh))
2723		goto unlock;
2724
2725	/* Ok, it's mapped. Make sure it's up-to-date */
2726	if (!PageUptodate(page)) {
2727		err = mapping->a_ops->readpage(NULL, page);
2728		if (err) {
2729			page_cache_release(page);
2730			goto out;
2731		}
2732		lock_page(page);
2733		if (!PageUptodate(page)) {
2734			err = -EIO;
2735			goto unlock;
2736		}
2737		if (page_has_buffers(page))
2738			goto has_buffers;
2739	}
2740	zero_user(page, offset, length);
2741	set_page_dirty(page);
2742	err = 0;
2743
2744unlock:
2745	unlock_page(page);
2746	page_cache_release(page);
2747out:
2748	return err;
2749}
2750EXPORT_SYMBOL(nobh_truncate_page);
2751
2752int block_truncate_page(struct address_space *mapping,
2753			loff_t from, get_block_t *get_block)
2754{
2755	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2756	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2757	unsigned blocksize;
2758	sector_t iblock;
2759	unsigned length, pos;
2760	struct inode *inode = mapping->host;
2761	struct page *page;
2762	struct buffer_head *bh;
2763	int err;
2764
2765	blocksize = 1 << inode->i_blkbits;
2766	length = offset & (blocksize - 1);
2767
2768	/* Block boundary? Nothing to do */
2769	if (!length)
2770		return 0;
2771
2772	length = blocksize - length;
2773	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2774	
2775	page = grab_cache_page(mapping, index);
2776	err = -ENOMEM;
2777	if (!page)
2778		goto out;
2779
2780	if (!page_has_buffers(page))
2781		create_empty_buffers(page, blocksize, 0);
 
 
 
 
 
2782
2783	/* Find the buffer that contains "offset" */
2784	bh = page_buffers(page);
2785	pos = blocksize;
2786	while (offset >= pos) {
2787		bh = bh->b_this_page;
2788		iblock++;
2789		pos += blocksize;
2790	}
2791
2792	err = 0;
2793	if (!buffer_mapped(bh)) {
2794		WARN_ON(bh->b_size != blocksize);
2795		err = get_block(inode, iblock, bh, 0);
2796		if (err)
2797			goto unlock;
2798		/* unmapped? It's a hole - nothing to do */
2799		if (!buffer_mapped(bh))
2800			goto unlock;
2801	}
2802
2803	/* Ok, it's mapped. Make sure it's up-to-date */
2804	if (PageUptodate(page))
2805		set_buffer_uptodate(bh);
2806
2807	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2808		err = -EIO;
2809		ll_rw_block(READ, 1, &bh);
2810		wait_on_buffer(bh);
2811		/* Uhhuh. Read error. Complain and punt. */
2812		if (!buffer_uptodate(bh))
2813			goto unlock;
2814	}
2815
2816	zero_user(page, offset, length);
2817	mark_buffer_dirty(bh);
2818	err = 0;
2819
2820unlock:
2821	unlock_page(page);
2822	page_cache_release(page);
2823out:
2824	return err;
2825}
2826EXPORT_SYMBOL(block_truncate_page);
2827
2828/*
2829 * The generic ->writepage function for buffer-backed address_spaces
2830 * this form passes in the end_io handler used to finish the IO.
2831 */
2832int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2833			struct writeback_control *wbc, bh_end_io_t *handler)
2834{
2835	struct inode * const inode = page->mapping->host;
2836	loff_t i_size = i_size_read(inode);
2837	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2838	unsigned offset;
2839
2840	/* Is the page fully inside i_size? */
2841	if (page->index < end_index)
2842		return __block_write_full_page(inode, page, get_block, wbc,
2843					       handler);
2844
2845	/* Is the page fully outside i_size? (truncate in progress) */
2846	offset = i_size & (PAGE_CACHE_SIZE-1);
2847	if (page->index >= end_index+1 || !offset) {
2848		/*
2849		 * The page may have dirty, unmapped buffers.  For example,
2850		 * they may have been added in ext3_writepage().  Make them
2851		 * freeable here, so the page does not leak.
2852		 */
2853		do_invalidatepage(page, 0);
2854		unlock_page(page);
2855		return 0; /* don't care */
2856	}
2857
2858	/*
2859	 * The page straddles i_size.  It must be zeroed out on each and every
2860	 * writepage invocation because it may be mmapped.  "A file is mapped
2861	 * in multiples of the page size.  For a file that is not a multiple of
2862	 * the  page size, the remaining memory is zeroed when mapped, and
2863	 * writes to that region are not written out to the file."
2864	 */
2865	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2866	return __block_write_full_page(inode, page, get_block, wbc, handler);
2867}
2868EXPORT_SYMBOL(block_write_full_page_endio);
2869
2870/*
2871 * The generic ->writepage function for buffer-backed address_spaces
2872 */
2873int block_write_full_page(struct page *page, get_block_t *get_block,
2874			struct writeback_control *wbc)
2875{
2876	return block_write_full_page_endio(page, get_block, wbc,
2877					   end_buffer_async_write);
2878}
2879EXPORT_SYMBOL(block_write_full_page);
2880
2881sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2882			    get_block_t *get_block)
2883{
2884	struct buffer_head tmp;
2885	struct inode *inode = mapping->host;
2886	tmp.b_state = 0;
2887	tmp.b_blocknr = 0;
2888	tmp.b_size = 1 << inode->i_blkbits;
 
2889	get_block(inode, block, &tmp, 0);
2890	return tmp.b_blocknr;
2891}
2892EXPORT_SYMBOL(generic_block_bmap);
2893
2894static void end_bio_bh_io_sync(struct bio *bio, int err)
2895{
2896	struct buffer_head *bh = bio->bi_private;
2897
2898	if (err == -EOPNOTSUPP) {
2899		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2900	}
2901
2902	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2903		set_bit(BH_Quiet, &bh->b_state);
2904
2905	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2906	bio_put(bio);
2907}
2908
2909int submit_bh(int rw, struct buffer_head * bh)
 
 
2910{
 
2911	struct bio *bio;
2912	int ret = 0;
2913
2914	BUG_ON(!buffer_locked(bh));
2915	BUG_ON(!buffer_mapped(bh));
2916	BUG_ON(!bh->b_end_io);
2917	BUG_ON(buffer_delay(bh));
2918	BUG_ON(buffer_unwritten(bh));
2919
2920	/*
2921	 * Only clear out a write error when rewriting
2922	 */
2923	if (test_set_buffer_req(bh) && (rw & WRITE))
2924		clear_buffer_write_io_error(bh);
2925
2926	/*
2927	 * from here on down, it's all bio -- do the initial mapping,
2928	 * submit_bio -> generic_make_request may further map this bio around
2929	 */
2930	bio = bio_alloc(GFP_NOIO, 1);
 
 
 
 
 
 
2931
2932	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2933	bio->bi_bdev = bh->b_bdev;
2934	bio->bi_io_vec[0].bv_page = bh->b_page;
2935	bio->bi_io_vec[0].bv_len = bh->b_size;
2936	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2937
2938	bio->bi_vcnt = 1;
2939	bio->bi_idx = 0;
2940	bio->bi_size = bh->b_size;
2941
2942	bio->bi_end_io = end_bio_bh_io_sync;
2943	bio->bi_private = bh;
2944
2945	bio_get(bio);
2946	submit_bio(rw, bio);
2947
2948	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2949		ret = -EOPNOTSUPP;
 
 
2950
2951	bio_put(bio);
2952	return ret;
2953}
2954EXPORT_SYMBOL(submit_bh);
2955
2956/**
2957 * ll_rw_block: low-level access to block devices (DEPRECATED)
2958 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2959 * @nr: number of &struct buffer_heads in the array
2960 * @bhs: array of pointers to &struct buffer_head
2961 *
2962 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2963 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2964 * %READA option is described in the documentation for generic_make_request()
2965 * which ll_rw_block() calls.
2966 *
2967 * This function drops any buffer that it cannot get a lock on (with the
2968 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2969 * request, and any buffer that appears to be up-to-date when doing read
2970 * request.  Further it marks as clean buffers that are processed for
2971 * writing (the buffer cache won't assume that they are actually clean
2972 * until the buffer gets unlocked).
2973 *
2974 * ll_rw_block sets b_end_io to simple completion handler that marks
2975 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2976 * any waiters. 
2977 *
2978 * All of the buffers must be for the same device, and must also be a
2979 * multiple of the current approved size for the device.
2980 */
2981void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2982{
2983	int i;
2984
2985	for (i = 0; i < nr; i++) {
2986		struct buffer_head *bh = bhs[i];
2987
2988		if (!trylock_buffer(bh))
2989			continue;
2990		if (rw == WRITE) {
2991			if (test_clear_buffer_dirty(bh)) {
2992				bh->b_end_io = end_buffer_write_sync;
2993				get_bh(bh);
2994				submit_bh(WRITE, bh);
2995				continue;
2996			}
2997		} else {
2998			if (!buffer_uptodate(bh)) {
2999				bh->b_end_io = end_buffer_read_sync;
3000				get_bh(bh);
3001				submit_bh(rw, bh);
3002				continue;
3003			}
3004		}
3005		unlock_buffer(bh);
3006	}
3007}
3008EXPORT_SYMBOL(ll_rw_block);
3009
3010void write_dirty_buffer(struct buffer_head *bh, int rw)
3011{
3012	lock_buffer(bh);
3013	if (!test_clear_buffer_dirty(bh)) {
3014		unlock_buffer(bh);
3015		return;
3016	}
3017	bh->b_end_io = end_buffer_write_sync;
3018	get_bh(bh);
3019	submit_bh(rw, bh);
3020}
3021EXPORT_SYMBOL(write_dirty_buffer);
3022
3023/*
3024 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3025 * and then start new I/O and then wait upon it.  The caller must have a ref on
3026 * the buffer_head.
3027 */
3028int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3029{
3030	int ret = 0;
3031
3032	WARN_ON(atomic_read(&bh->b_count) < 1);
3033	lock_buffer(bh);
3034	if (test_clear_buffer_dirty(bh)) {
 
 
 
 
 
 
 
 
 
3035		get_bh(bh);
3036		bh->b_end_io = end_buffer_write_sync;
3037		ret = submit_bh(rw, bh);
3038		wait_on_buffer(bh);
3039		if (!ret && !buffer_uptodate(bh))
3040			ret = -EIO;
3041	} else {
3042		unlock_buffer(bh);
3043	}
3044	return ret;
3045}
3046EXPORT_SYMBOL(__sync_dirty_buffer);
3047
3048int sync_dirty_buffer(struct buffer_head *bh)
3049{
3050	return __sync_dirty_buffer(bh, WRITE_SYNC);
3051}
3052EXPORT_SYMBOL(sync_dirty_buffer);
3053
3054/*
3055 * try_to_free_buffers() checks if all the buffers on this particular page
3056 * are unused, and releases them if so.
3057 *
3058 * Exclusion against try_to_free_buffers may be obtained by either
3059 * locking the page or by holding its mapping's private_lock.
3060 *
3061 * If the page is dirty but all the buffers are clean then we need to
3062 * be sure to mark the page clean as well.  This is because the page
3063 * may be against a block device, and a later reattachment of buffers
3064 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3065 * filesystem data on the same device.
3066 *
3067 * The same applies to regular filesystem pages: if all the buffers are
3068 * clean then we set the page clean and proceed.  To do that, we require
3069 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3070 * private_lock.
3071 *
3072 * try_to_free_buffers() is non-blocking.
3073 */
3074static inline int buffer_busy(struct buffer_head *bh)
3075{
3076	return atomic_read(&bh->b_count) |
3077		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3078}
3079
3080static int
3081drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3082{
3083	struct buffer_head *head = page_buffers(page);
3084	struct buffer_head *bh;
3085
3086	bh = head;
3087	do {
3088		if (buffer_write_io_error(bh) && page->mapping)
3089			set_bit(AS_EIO, &page->mapping->flags);
3090		if (buffer_busy(bh))
3091			goto failed;
3092		bh = bh->b_this_page;
3093	} while (bh != head);
3094
3095	do {
3096		struct buffer_head *next = bh->b_this_page;
3097
3098		if (bh->b_assoc_map)
3099			__remove_assoc_queue(bh);
3100		bh = next;
3101	} while (bh != head);
3102	*buffers_to_free = head;
3103	__clear_page_buffers(page);
3104	return 1;
3105failed:
3106	return 0;
3107}
3108
3109int try_to_free_buffers(struct page *page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3110{
3111	struct address_space * const mapping = page->mapping;
3112	struct buffer_head *buffers_to_free = NULL;
3113	int ret = 0;
3114
3115	BUG_ON(!PageLocked(page));
3116	if (PageWriteback(page))
3117		return 0;
3118
3119	if (mapping == NULL) {		/* can this still happen? */
3120		ret = drop_buffers(page, &buffers_to_free);
3121		goto out;
3122	}
3123
3124	spin_lock(&mapping->private_lock);
3125	ret = drop_buffers(page, &buffers_to_free);
3126
3127	/*
3128	 * If the filesystem writes its buffers by hand (eg ext3)
3129	 * then we can have clean buffers against a dirty page.  We
3130	 * clean the page here; otherwise the VM will never notice
3131	 * that the filesystem did any IO at all.
3132	 *
3133	 * Also, during truncate, discard_buffer will have marked all
3134	 * the page's buffers clean.  We discover that here and clean
3135	 * the page also.
3136	 *
3137	 * private_lock must be held over this entire operation in order
3138	 * to synchronise against __set_page_dirty_buffers and prevent the
3139	 * dirty bit from being lost.
3140	 */
3141	if (ret)
3142		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3143	spin_unlock(&mapping->private_lock);
3144out:
3145	if (buffers_to_free) {
3146		struct buffer_head *bh = buffers_to_free;
3147
3148		do {
3149			struct buffer_head *next = bh->b_this_page;
3150			free_buffer_head(bh);
3151			bh = next;
3152		} while (bh != buffers_to_free);
3153	}
3154	return ret;
3155}
3156EXPORT_SYMBOL(try_to_free_buffers);
3157
3158/*
3159 * There are no bdflush tunables left.  But distributions are
3160 * still running obsolete flush daemons, so we terminate them here.
3161 *
3162 * Use of bdflush() is deprecated and will be removed in a future kernel.
3163 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3164 */
3165SYSCALL_DEFINE2(bdflush, int, func, long, data)
3166{
3167	static int msg_count;
3168
3169	if (!capable(CAP_SYS_ADMIN))
3170		return -EPERM;
3171
3172	if (msg_count < 5) {
3173		msg_count++;
3174		printk(KERN_INFO
3175			"warning: process `%s' used the obsolete bdflush"
3176			" system call\n", current->comm);
3177		printk(KERN_INFO "Fix your initscripts?\n");
3178	}
3179
3180	if (func == 1)
3181		do_exit(0);
3182	return 0;
3183}
3184
3185/*
3186 * Buffer-head allocation
3187 */
3188static struct kmem_cache *bh_cachep;
3189
3190/*
3191 * Once the number of bh's in the machine exceeds this level, we start
3192 * stripping them in writeback.
3193 */
3194static int max_buffer_heads;
3195
3196int buffer_heads_over_limit;
3197
3198struct bh_accounting {
3199	int nr;			/* Number of live bh's */
3200	int ratelimit;		/* Limit cacheline bouncing */
3201};
3202
3203static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3204
3205static void recalc_bh_state(void)
3206{
3207	int i;
3208	int tot = 0;
3209
3210	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3211		return;
3212	__this_cpu_write(bh_accounting.ratelimit, 0);
3213	for_each_online_cpu(i)
3214		tot += per_cpu(bh_accounting, i).nr;
3215	buffer_heads_over_limit = (tot > max_buffer_heads);
3216}
3217
3218struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3219{
3220	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3221	if (ret) {
3222		INIT_LIST_HEAD(&ret->b_assoc_buffers);
 
3223		preempt_disable();
3224		__this_cpu_inc(bh_accounting.nr);
3225		recalc_bh_state();
3226		preempt_enable();
3227	}
3228	return ret;
3229}
3230EXPORT_SYMBOL(alloc_buffer_head);
3231
3232void free_buffer_head(struct buffer_head *bh)
3233{
3234	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3235	kmem_cache_free(bh_cachep, bh);
3236	preempt_disable();
3237	__this_cpu_dec(bh_accounting.nr);
3238	recalc_bh_state();
3239	preempt_enable();
3240}
3241EXPORT_SYMBOL(free_buffer_head);
3242
3243static void buffer_exit_cpu(int cpu)
3244{
3245	int i;
3246	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3247
3248	for (i = 0; i < BH_LRU_SIZE; i++) {
3249		brelse(b->bhs[i]);
3250		b->bhs[i] = NULL;
3251	}
3252	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3253	per_cpu(bh_accounting, cpu).nr = 0;
3254}
3255
3256static int buffer_cpu_notify(struct notifier_block *self,
3257			      unsigned long action, void *hcpu)
3258{
3259	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3260		buffer_exit_cpu((unsigned long)hcpu);
3261	return NOTIFY_OK;
3262}
3263
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date (it is left unlocked), and
 * false, with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: already up-to-date, no lock needed. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Re-check under the lock; the caller keeps the lock on failure. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3282
3283/**
3284 * bh_submit_read - Submit a locked buffer for reading
3285 * @bh: struct buffer_head
 
 
3286 *
3287 * Returns zero on success and -EIO on error.
3288 */
3289int bh_submit_read(struct buffer_head *bh)
3290{
 
 
3291	BUG_ON(!buffer_locked(bh));
3292
3293	if (buffer_uptodate(bh)) {
3294		unlock_buffer(bh);
3295		return 0;
 
 
 
 
3296	}
 
 
 
3297
3298	get_bh(bh);
3299	bh->b_end_io = end_buffer_read_sync;
3300	submit_bh(READ, bh);
3301	wait_on_buffer(bh);
3302	if (buffer_uptodate(bh))
3303		return 0;
3304	return -EIO;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3305}
3306EXPORT_SYMBOL(bh_submit_read);
3307
3308void __init buffer_init(void)
3309{
3310	int nrpages;
3311
3312	bh_cachep = kmem_cache_create("buffer_head",
3313			sizeof(struct buffer_head), 0,
3314				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3315				SLAB_MEM_SPREAD),
3316				NULL);
3317
 
 
3318	/*
3319	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3320	 */
3321	nrpages = (nr_free_buffer_pages() * 10) / 100;
3322	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3323	hotcpu_notifier(buffer_cpu_notify, 0);
 
 
3324}