   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_log.h"
  20#include "xfs_sb.h"
  21#include "xfs_ag.h"
  22#include "xfs_trans.h"
  23#include "xfs_mount.h"
  24#include "xfs_bmap_btree.h"
  25#include "xfs_dinode.h"
  26#include "xfs_inode.h"
  27#include "xfs_inode_item.h"
  28#include "xfs_alloc.h"
  29#include "xfs_error.h"
  30#include "xfs_iomap.h"
  31#include "xfs_vnodeops.h"
  32#include "xfs_trace.h"
  33#include "xfs_bmap.h"
  34#include <linux/gfp.h>
  35#include <linux/mpage.h>
  36#include <linux/pagevec.h>
  37#include <linux/writeback.h>
  38
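    /*
     * Walk the buffer_heads attached to a page and report whether any of
     * them are in delalloc or unwritten state.
     */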
  39void
  40xfs_count_page_state(
  41	struct page		*page,
  42	int			*delalloc,
  43	int			*unwritten)
  44{
  45	struct buffer_head	*bh, *head;
  46
  47	*delalloc = *unwritten = 0;
  48
  49	bh = head = page_buffers(page);
  50	do {
  51		if (buffer_unwritten(bh))
  52			(*unwritten) = 1;
  53		else if (buffer_delay(bh))
  54			(*delalloc) = 1;
  55	} while ((bh = bh->b_this_page) != head);
  56}
  57
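    /*
     * Return the block device backing this inode's data: the realtime
     * device for realtime inodes, the data device otherwise.
     */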
  58STATIC struct block_device *
  59xfs_find_bdev_for_inode(
  60	struct inode		*inode)
  61{
  62	struct xfs_inode	*ip = XFS_I(inode);
  63	struct xfs_mount	*mp = ip->i_mount;
  64
  65	if (XFS_IS_REALTIME_INODE(ip))
  66		return mp->m_rtdev_targp->bt_bdev;
  67	else
  68		return mp->m_ddev_targp->bt_bdev;
  69}
  70
  71/*
  72 * We're now finished for good with this ioend structure.
  73 * Update the page state via the associated buffer_heads,
  74 * release holds on the inode and bio, and finally free
  75 * up memory.  Do not use the ioend after this.
  76 */
  77STATIC void
  78xfs_destroy_ioend(
  79	xfs_ioend_t		*ioend)
  80{
  81	struct buffer_head	*bh, *next;
  82
  83	for (bh = ioend->io_buffer_head; bh; bh = next) {
  84		next = bh->b_private;
  85		bh->b_end_io(bh, !ioend->io_error);
  86	}
  87
  88	if (ioend->io_iocb) {
  89		if (ioend->io_isasync) {
  90			aio_complete(ioend->io_iocb, ioend->io_error ?
  91					ioend->io_error : ioend->io_result, 0);
  92		}
  93		inode_dio_done(ioend->io_inode);
  94	}
  95
  96	mempool_free(ioend, xfs_ioend_pool);
  97}
  98
  99/*
 100 * Fast and loose check if this write could update the on-disk inode size.
 101 */
 102static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 103{
 104	return ioend->io_offset + ioend->io_size >
 105		XFS_I(ioend->io_inode)->i_d.di_size;
 106}
 107
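    /*
     * Reserve log space up front for a file size update that may be needed
     * at I/O completion time.  The transaction is carried in
     * ioend->io_append_trans and committed (or cancelled) by the completion
     * handler.
     */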
 108STATIC int
 109xfs_setfilesize_trans_alloc(
 110	struct xfs_ioend	*ioend)
 111{
 112	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 113	struct xfs_trans	*tp;
 114	int			error;
 115
 116	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 117
 118	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
 119	if (error) {
 120		xfs_trans_cancel(tp, 0);
 121		return error;
 122	}
 123
 124	ioend->io_append_trans = tp;
 125
 126	/*
 127	 * We hand off the transaction to the completion thread now, so
 128	 * clear the flag here.
 129	 */
 130	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 131	return 0;
 132}
 133
 134/*
 135 * Update on-disk file size now that data has been written to disk.
 136 */
 137STATIC int
 138xfs_setfilesize(
 139	struct xfs_ioend	*ioend)
 140{
 141	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 142	struct xfs_trans	*tp = ioend->io_append_trans;
 143	xfs_fsize_t		isize;
 144
 145	/*
 146	 * The transaction was allocated in the I/O submission thread,
 147	 * thus we need to mark ourselves as being in a transaction
 148	 * manually.
 149	 */
 150	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 151
 152	xfs_ilock(ip, XFS_ILOCK_EXCL);
 153	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
 154	if (!isize) {
 155		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 156		xfs_trans_cancel(tp, 0);
 157		return 0;
 158	}
 159
 160	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
 161
 162	ip->i_d.di_size = isize;
 163	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 164	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 165
 166	return xfs_trans_commit(tp, 0);
 167}
 168
 169/*
 170 * Schedule IO completion handling on the final put of an ioend.
 171 *
 172 * If there is no work to do we might as well call it a day and free the
 173 * ioend right now.
 174 */
 175STATIC void
 176xfs_finish_ioend(
 177	struct xfs_ioend	*ioend)
 178{
 179	if (atomic_dec_and_test(&ioend->io_remaining)) {
 180		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 181
 182		if (ioend->io_type == IO_UNWRITTEN)
 183			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 184		else if (ioend->io_append_trans)
 185			queue_work(mp->m_data_workqueue, &ioend->io_work);
 186		else
 187			xfs_destroy_ioend(ioend);
 188	}
 189}
 190
 191/*
 192 * IO write completion.
 193 */
 194STATIC void
 195xfs_end_io(
 196	struct work_struct *work)
 197{
 198	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
 199	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 200	int		error = 0;
 201
 202	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 203		ioend->io_error = -EIO;
 204		goto done;
 205	}
 206	if (ioend->io_error)
 207		goto done;
 208
 209	/*
 210	 * For unwritten extents we need to issue transactions to convert a
 211	 * range to normal written extents after the data I/O has finished.
 212	 */
 213	if (ioend->io_type == IO_UNWRITTEN) {
 214		/*
 215		 * For buffered I/O we never preallocate a transaction when
 216		 * doing the unwritten extent conversion, but for direct I/O
 217		 * we do not know if we are converting an unwritten extent
 218		 * or not at the point where we preallocate the transaction.
 219		 */
 220		if (ioend->io_append_trans) {
 221			ASSERT(ioend->io_isdirect);
 222
 223			current_set_flags_nested(
 224				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
 225			xfs_trans_cancel(ioend->io_append_trans, 0);
 226		}
 227
 228		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 229						 ioend->io_size);
 230		if (error) {
 231			ioend->io_error = -error;
 232			goto done;
 233		}
 234	} else if (ioend->io_append_trans) {
 235		error = xfs_setfilesize(ioend);
 236		if (error)
 237			ioend->io_error = -error;
 238	} else {
 239		ASSERT(!xfs_ioend_is_append(ioend));
 240	}
 241
 242done:
 243	xfs_destroy_ioend(ioend);
 244}
 245
 246/*
 247 * Call IO completion handling in caller context on the final put of an ioend.
 248 */
 249STATIC void
 250xfs_finish_ioend_sync(
 251	struct xfs_ioend	*ioend)
 252{
 253	if (atomic_dec_and_test(&ioend->io_remaining))
 254		xfs_end_io(&ioend->io_work);
 255}
 256
 257/*
 258 * Allocate and initialise an IO completion structure.
 259 * We need to track unwritten extent write completion here initially.
 260 * We'll need to extend this for updating the ondisk inode size later
 261 * (vs. incore size).
 262 */
 263STATIC xfs_ioend_t *
 264xfs_alloc_ioend(
 265	struct inode		*inode,
 266	unsigned int		type)
 267{
 268	xfs_ioend_t		*ioend;
 269
 270	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 271
 272	/*
 273	 * Set the count to 1 initially, which will prevent an I/O
 274	 * completion callback from happening before we have started
 275	 * all the I/O, i.e. from calling the completion routine too early.
 276	 */
 277	atomic_set(&ioend->io_remaining, 1);
 278	ioend->io_isasync = 0;
 279	ioend->io_isdirect = 0;
 280	ioend->io_error = 0;
 281	ioend->io_list = NULL;
 282	ioend->io_type = type;
 283	ioend->io_inode = inode;
 284	ioend->io_buffer_head = NULL;
 285	ioend->io_buffer_tail = NULL;
 286	ioend->io_offset = 0;
 287	ioend->io_size = 0;
 288	ioend->io_iocb = NULL;
 289	ioend->io_result = 0;
 290	ioend->io_append_trans = NULL;
 291
 292	INIT_WORK(&ioend->io_work, xfs_end_io);
 293	return ioend;
 294}
 295
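    /*
     * Find the extent backing the given file offset for writeback.  The
     * inode lock is taken shared; with nonblocking set we back off with
     * EAGAIN instead of sleeping on it.  Delalloc ranges that are still
     * unallocated are converted into real blocks here.
     */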
 296STATIC int
 297xfs_map_blocks(
 298	struct inode		*inode,
 299	loff_t			offset,
 300	struct xfs_bmbt_irec	*imap,
 301	int			type,
 302	int			nonblocking)
 303{
 304	struct xfs_inode	*ip = XFS_I(inode);
 305	struct xfs_mount	*mp = ip->i_mount;
 306	ssize_t			count = 1 << inode->i_blkbits;
 307	xfs_fileoff_t		offset_fsb, end_fsb;
 308	int			error = 0;
 309	int			bmapi_flags = XFS_BMAPI_ENTIRE;
 310	int			nimaps = 1;
 311
 312	if (XFS_FORCED_SHUTDOWN(mp))
 313		return -XFS_ERROR(EIO);
 314
 315	if (type == IO_UNWRITTEN)
 316		bmapi_flags |= XFS_BMAPI_IGSTATE;
 317
 318	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
 319		if (nonblocking)
 320			return -XFS_ERROR(EAGAIN);
 321		xfs_ilock(ip, XFS_ILOCK_SHARED);
 322	}
 323
 324	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 325	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 326	ASSERT(offset <= mp->m_maxioffset);
 327
 328	if (offset + count > mp->m_maxioffset)
 329		count = mp->m_maxioffset - offset;
 330	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 331	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 332	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 333				imap, &nimaps, bmapi_flags);
 334	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 335
 336	if (error)
 337		return -XFS_ERROR(error);
 338
 339	if (type == IO_DELALLOC &&
 340	    (!nimaps || isnullstartblock(imap->br_startblock))) {
 341		error = xfs_iomap_write_allocate(ip, offset, count, imap);
 342		if (!error)
 343			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 344		return -XFS_ERROR(error);
 345	}
 346
 347#ifdef DEBUG
 348	if (type == IO_UNWRITTEN) {
 349		ASSERT(nimaps);
 350		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 351		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 352	}
 353#endif
 354	if (nimaps)
 355		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
 356	return 0;
 357}
 358
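    /*
     * Return true if the cached mapping covers the block containing offset.
     */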
 359STATIC int
 360xfs_imap_valid(
 361	struct inode		*inode,
 362	struct xfs_bmbt_irec	*imap,
 363	xfs_off_t		offset)
 364{
 365	offset >>= inode->i_blkbits;
 366
 367	return offset >= imap->br_startoff &&
 368		offset < imap->br_startoff + imap->br_blockcount;
 369}
 370
 371/*
 372 * BIO completion handler for buffered IO.
 373 */
 374STATIC void
 375xfs_end_bio(
 376	struct bio		*bio,
 377	int			error)
 378{
 379	xfs_ioend_t		*ioend = bio->bi_private;
 380
 381	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 382	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 383
 384	/* Toss bio and pass work off to an xfsdatad thread */
 385	bio->bi_private = NULL;
 386	bio->bi_end_io = NULL;
 387	bio_put(bio);
 388
 389	xfs_finish_ioend(ioend);
 390}
 391
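    /*
     * Take a reference on the ioend for the bio about to be submitted and
     * send it to the block layer, using WRITE_SYNC for data integrity
     * writeback.
     */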
 392STATIC void
 393xfs_submit_ioend_bio(
 394	struct writeback_control *wbc,
 395	xfs_ioend_t		*ioend,
 396	struct bio		*bio)
 397{
 398	atomic_inc(&ioend->io_remaining);
 399	bio->bi_private = ioend;
 400	bio->bi_end_io = xfs_end_bio;
 401	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 402}
 403
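    /*
     * Allocate a bio for the given buffer, pointing it at the right sector
     * on the buffer's block device.
     */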
 404STATIC struct bio *
 405xfs_alloc_ioend_bio(
 406	struct buffer_head	*bh)
 407{
 408	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
 409	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
 410
 411	ASSERT(bio->bi_private == NULL);
 412	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 413	bio->bi_bdev = bh->b_bdev;
 414	return bio;
 415}
 416
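    /*
     * Mark a buffer for async write.  By this point it must be mapped and
     * must no longer be in delalloc or unwritten state.
     */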
 417STATIC void
 418xfs_start_buffer_writeback(
 419	struct buffer_head	*bh)
 420{
 421	ASSERT(buffer_mapped(bh));
 422	ASSERT(buffer_locked(bh));
 423	ASSERT(!buffer_delay(bh));
 424	ASSERT(!buffer_unwritten(bh));
 425
 426	mark_buffer_async_write(bh);
 427	set_buffer_uptodate(bh);
 428	clear_buffer_dirty(bh);
 429}
 430
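    /*
     * Move the page into writeback state and unlock it.  If none of its
     * buffers are going to be written, end the writeback immediately.
     */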
 431STATIC void
 432xfs_start_page_writeback(
 433	struct page		*page,
 434	int			clear_dirty,
 435	int			buffers)
 436{
 437	ASSERT(PageLocked(page));
 438	ASSERT(!PageWriteback(page));
 439	if (clear_dirty)
 440		clear_page_dirty_for_io(page);
 441	set_page_writeback(page);
 442	unlock_page(page);
 443	/* If no buffers on the page are to be written, finish it here */
 444	if (!buffers)
 445		end_page_writeback(page);
 446}
 447
 448static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 449{
 450	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 451}
 452
 453/*
 454 * Submit all of the bios for all of the ioends we have saved up, covering the
 455 * initial writepage page and also any probed pages.
 456 *
 457 * Because we may have multiple ioends spanning a page, we need to start
 458 * writeback on all the buffers before we submit them for I/O. If we mark the
 459 * buffers as we go, then we can end up with a page that only has some buffers
 460 * marked async write, and I/O completion on it can occur before we mark the
 461 * other buffers async write.
 462 *
 463 * The end result of this is that we trip a bug in end_page_writeback() because
 464 * we call it twice for the one page as the code in end_buffer_async_write()
 465 * assumes that all buffers on the page are started at the same time.
 466 *
 467 * The fix is two passes across the ioend list - one to start writeback on the
 468 * buffer_heads, and then submit them for I/O on the second pass.
 469 */
 470STATIC void
 471xfs_submit_ioend(
 472	struct writeback_control *wbc,
 473	xfs_ioend_t		*ioend)
 474{
 475	xfs_ioend_t		*head = ioend;
 476	xfs_ioend_t		*next;
 477	struct buffer_head	*bh;
 478	struct bio		*bio;
 479	sector_t		lastblock = 0;
 480
 481	/* Pass 1 - start writeback */
 482	do {
 483		next = ioend->io_list;
 484		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
 485			xfs_start_buffer_writeback(bh);
 486	} while ((ioend = next) != NULL);
 487
 488	/* Pass 2 - submit I/O */
 489	ioend = head;
 490	do {
 491		next = ioend->io_list;
 492		bio = NULL;
 493
 494		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 495
 496			if (!bio) {
 497 retry:
 498				bio = xfs_alloc_ioend_bio(bh);
 499			} else if (bh->b_blocknr != lastblock + 1) {
 500				xfs_submit_ioend_bio(wbc, ioend, bio);
 501				goto retry;
 502			}
 503
 504			if (bio_add_buffer(bio, bh) != bh->b_size) {
 505				xfs_submit_ioend_bio(wbc, ioend, bio);
 506				goto retry;
 507			}
 508
 509			lastblock = bh->b_blocknr;
 510		}
 511		if (bio)
 512			xfs_submit_ioend_bio(wbc, ioend, bio);
 513		xfs_finish_ioend(ioend);
 514	} while ((ioend = next) != NULL);
 515}
 516
 517/*
 518 * Cancel submission of all buffer_heads so far in this ioend.
 519 * Toss the ioend too.  Only ever called for the initial page
 520 * in a writepage request, so only ever one page.
 521 */
 522STATIC void
 523xfs_cancel_ioend(
 524	xfs_ioend_t		*ioend)
 525{
 526	xfs_ioend_t		*next;
 527	struct buffer_head	*bh, *next_bh;
 528
 529	do {
 530		next = ioend->io_list;
 531		bh = ioend->io_buffer_head;
 532		do {
 533			next_bh = bh->b_private;
 534			clear_buffer_async_write(bh);
 535			unlock_buffer(bh);
 536		} while ((bh = next_bh) != NULL);
 537
 538		mempool_free(ioend, xfs_ioend_pool);
 539	} while ((ioend = next) != NULL);
 540}
 541
 542/*
 543 * Test to see if we've been building up a completion structure for
 544 * earlier buffers -- if so, we try to append to this ioend if we
 545 * can, otherwise we start a new ioend and chain it onto the current one
 546 * via io_list, returning the new ioend in *result.
 547 */
 548STATIC void
 549xfs_add_to_ioend(
 550	struct inode		*inode,
 551	struct buffer_head	*bh,
 552	xfs_off_t		offset,
 553	unsigned int		type,
 554	xfs_ioend_t		**result,
 555	int			need_ioend)
 556{
 557	xfs_ioend_t		*ioend = *result;
 558
 559	if (!ioend || need_ioend || type != ioend->io_type) {
 560		xfs_ioend_t	*previous = *result;
 561
 562		ioend = xfs_alloc_ioend(inode, type);
 563		ioend->io_offset = offset;
 564		ioend->io_buffer_head = bh;
 565		ioend->io_buffer_tail = bh;
 566		if (previous)
 567			previous->io_list = ioend;
 568		*result = ioend;
 569	} else {
 570		ioend->io_buffer_tail->b_private = bh;
 571		ioend->io_buffer_tail = bh;
 572	}
 573
 574	bh->b_private = NULL;
 575	ioend->io_size += bh->b_size;
 576}
 577
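    /*
     * Fill in the buffer's disk block number from the extent mapping for
     * the given file offset and mark it mapped.
     */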
 578STATIC void
 579xfs_map_buffer(
 580	struct inode		*inode,
 581	struct buffer_head	*bh,
 582	struct xfs_bmbt_irec	*imap,
 583	xfs_off_t		offset)
 584{
 585	sector_t		bn;
 586	struct xfs_mount	*m = XFS_I(inode)->i_mount;
 587	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 588	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 589
 590	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 591	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 592
 593	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 594	      ((offset - iomap_offset) >> inode->i_blkbits);
 595
 596	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 597
 598	bh->b_blocknr = bn;
 599	set_buffer_mapped(bh);
 600}
 601
 602STATIC void
 603xfs_map_at_offset(
 604	struct inode		*inode,
 605	struct buffer_head	*bh,
 606	struct xfs_bmbt_irec	*imap,
 607	xfs_off_t		offset)
 608{
 609	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 610	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 611
 612	xfs_map_buffer(inode, bh, imap, offset);
 613	set_buffer_mapped(bh);
 614	clear_buffer_delay(bh);
 615	clear_buffer_unwritten(bh);
 616}
 617
 618/*
 619 * Test if a given page is suitable for writing as part of an unwritten
 620 * or delayed allocate extent.
 621 */
 622STATIC int
 623xfs_check_page_type(
 624	struct page		*page,
 625	unsigned int		type)
 626{
 627	if (PageWriteback(page))
 628		return 0;
 629
 630	if (page->mapping && page_has_buffers(page)) {
 631		struct buffer_head	*bh, *head;
 632		int			acceptable = 0;
 633
 634		bh = head = page_buffers(page);
 635		do {
 636			if (buffer_unwritten(bh))
 637				acceptable += (type == IO_UNWRITTEN);
 638			else if (buffer_delay(bh))
 639				acceptable += (type == IO_DELALLOC);
 640			else if (buffer_dirty(bh) && buffer_mapped(bh))
 641				acceptable += (type == IO_OVERWRITE);
 642			else
 643				break;
 644		} while ((bh = bh->b_this_page) != head);
 645
 646		if (acceptable)
 647			return 1;
 648	}
 649
 650	return 0;
 651}
 652
 653/*
 654 * Allocate & map buffers for page given the extent map, then write it out.
 655 * Except for the original page of a writepage, this is called on
 656 * delalloc/unwritten pages only; for the original page it is possible
 657 * that the page has no mapping at all.
 658 */
 659STATIC int
 660xfs_convert_page(
 661	struct inode		*inode,
 662	struct page		*page,
 663	loff_t			tindex,
 664	struct xfs_bmbt_irec	*imap,
 665	xfs_ioend_t		**ioendp,
 666	struct writeback_control *wbc)
 667{
 668	struct buffer_head	*bh, *head;
 669	xfs_off_t		end_offset;
 670	unsigned long		p_offset;
 671	unsigned int		type;
 672	int			len, page_dirty;
 673	int			count = 0, done = 0, uptodate = 1;
 674 	xfs_off_t		offset = page_offset(page);
 675
 676	if (page->index != tindex)
 677		goto fail;
 678	if (!trylock_page(page))
 679		goto fail;
 680	if (PageWriteback(page))
 681		goto fail_unlock_page;
 682	if (page->mapping != inode->i_mapping)
 683		goto fail_unlock_page;
 684	if (!xfs_check_page_type(page, (*ioendp)->io_type))
 685		goto fail_unlock_page;
 686
 687	/*
 688	 * page_dirty is initially a count of buffers on the page before
 689	 * EOF and is decremented as we move each into a cleanable state.
 690	 *
 691	 * Derivation:
 692	 *
 693	 * End offset is the highest offset that this page should represent.
 694	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
 695	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
 696	 * hence give us the correct page_dirty count. On any other page,
 697	 * it will be zero and in that case we need page_dirty to be the
 698	 * count of buffers on the page.
 699	 */
 700	end_offset = min_t(unsigned long long,
 701			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 702			i_size_read(inode));
 703
 704	len = 1 << inode->i_blkbits;
 705	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 706					PAGE_CACHE_SIZE);
 707	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 708	page_dirty = p_offset / len;
 709
 710	bh = head = page_buffers(page);
 711	do {
 712		if (offset >= end_offset)
 713			break;
 714		if (!buffer_uptodate(bh))
 715			uptodate = 0;
 716		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 717			done = 1;
 718			continue;
 719		}
 720
 721		if (buffer_unwritten(bh) || buffer_delay(bh) ||
 722		    buffer_mapped(bh)) {
 723			if (buffer_unwritten(bh))
 724				type = IO_UNWRITTEN;
 725			else if (buffer_delay(bh))
 726				type = IO_DELALLOC;
 727			else
 728				type = IO_OVERWRITE;
 729
 730			if (!xfs_imap_valid(inode, imap, offset)) {
 731				done = 1;
 732				continue;
 733			}
 734
 735			lock_buffer(bh);
 736			if (type != IO_OVERWRITE)
 737				xfs_map_at_offset(inode, bh, imap, offset);
 738			xfs_add_to_ioend(inode, bh, offset, type,
 739					 ioendp, done);
 740
 741			page_dirty--;
 742			count++;
 743		} else {
 744			done = 1;
 745		}
 746	} while (offset += len, (bh = bh->b_this_page) != head);
 747
 748	if (uptodate && bh == head)
 749		SetPageUptodate(page);
 750
 751	if (count) {
 752		if (--wbc->nr_to_write <= 0 &&
 753		    wbc->sync_mode == WB_SYNC_NONE)
 754			done = 1;
 755	}
 756	xfs_start_page_writeback(page, !page_dirty, count);
 757
 758	return done;
 759 fail_unlock_page:
 760	unlock_page(page);
 761 fail:
 762	return 1;
 763}
 764
 765/*
 766 * Convert & write out a cluster of pages in the same extent as defined
 767 * by mp and following the start page.
 768 */
 769STATIC void
 770xfs_cluster_write(
 771	struct inode		*inode,
 772	pgoff_t			tindex,
 773	struct xfs_bmbt_irec	*imap,
 774	xfs_ioend_t		**ioendp,
 775	struct writeback_control *wbc,
 776	pgoff_t			tlast)
 777{
 778	struct pagevec		pvec;
 779	int			done = 0, i;
 780
 781	pagevec_init(&pvec, 0);
 782	while (!done && tindex <= tlast) {
 783		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 784
 785		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 786			break;
 787
 788		for (i = 0; i < pagevec_count(&pvec); i++) {
 789			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
 790					imap, ioendp, wbc);
 791			if (done)
 792				break;
 793		}
 794
 795		pagevec_release(&pvec);
 796		cond_resched();
 797	}
 798}
 799
 800STATIC void
 801xfs_vm_invalidatepage(
 802	struct page		*page,
 803	unsigned long		offset)
 804{
 805	trace_xfs_invalidatepage(page->mapping->host, page, offset);
 806	block_invalidatepage(page, offset);
 807}
 808
 809/*
 810 * If the page has delalloc buffers on it, we need to punch them out before we
 811 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 812 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 813 * is done on that same region - the delalloc extent is returned when none is
 814 * supposed to be there.
 815 *
 816 * We prevent this by truncating away the delalloc regions on the page before
 817 * invalidating it. Because they are delalloc, we can do this without needing a
 818 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 819 * truncation without a transaction as there is no space left for block
 820 * reservation (typically why we see an ENOSPC in writeback).
 821 *
 822 * This is not a performance critical path, so for now just do the punching a
 823 * buffer head at a time.
 824 */
 825STATIC void
 826xfs_aops_discard_page(
 827	struct page		*page)
 828{
 829	struct inode		*inode = page->mapping->host;
 830	struct xfs_inode	*ip = XFS_I(inode);
 831	struct buffer_head	*bh, *head;
 832	loff_t			offset = page_offset(page);
 833
 834	if (!xfs_check_page_type(page, IO_DELALLOC))
 835		goto out_invalidate;
 836
 837	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 838		goto out_invalidate;
 839
 840	xfs_alert(ip->i_mount,
 841		"page discard on page %p, inode 0x%llx, offset %llu.",
 842			page, ip->i_ino, offset);
 843
 844	xfs_ilock(ip, XFS_ILOCK_EXCL);
 845	bh = head = page_buffers(page);
 846	do {
 847		int		error;
 848		xfs_fileoff_t	start_fsb;
 849
 850		if (!buffer_delay(bh))
 851			goto next_buffer;
 852
 853		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 854		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 855		if (error) {
 856			/* something screwed, just bail */
 857			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 858				xfs_alert(ip->i_mount,
 859			"page discard unable to remove delalloc mapping.");
 860			}
 861			break;
 862		}
 863next_buffer:
 864		offset += 1 << inode->i_blkbits;
 865
 866	} while ((bh = bh->b_this_page) != head);
 867
 868	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 869out_invalidate:
 870	xfs_vm_invalidatepage(page, 0);
 871	return;
 872}
 873
 874/*
 875 * Write out a dirty page.
 876 *
 877 * For delalloc space on the page we need to allocate space and flush it.
 878 * For unwritten space on the page we need to start the conversion to
 879 * regular allocated space.
 880 * For any other dirty buffer heads on the page we should flush them.
 881 */
 882STATIC int
 883xfs_vm_writepage(
 884	struct page		*page,
 885	struct writeback_control *wbc)
 886{
 887	struct inode		*inode = page->mapping->host;
 888	struct buffer_head	*bh, *head;
 889	struct xfs_bmbt_irec	imap;
 890	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 891	loff_t			offset;
 892	unsigned int		type;
 893	__uint64_t              end_offset;
 894	pgoff_t                 end_index, last_index;
 895	ssize_t			len;
 896	int			err, imap_valid = 0, uptodate = 1;
 897	int			count = 0;
 898	int			nonblocking = 0;
 899
 900	trace_xfs_writepage(inode, page, 0);
 901
 902	ASSERT(page_has_buffers(page));
 903
 904	/*
 905	 * Refuse to write the page out if we are called from reclaim context.
 906	 *
 907	 * This avoids stack overflows when called from deeply used stacks in
 908	 * random callers for direct reclaim or memcg reclaim.  We explicitly
 909	 * allow reclaim from kswapd as the stack usage there is relatively low.
 910	 *
 911	 * This should never happen except in the case of a VM regression so
 912	 * warn about it.
 913	 */
 914	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 915			PF_MEMALLOC))
 916		goto redirty;
 917
 918	/*
 919	 * Given that we do not allow direct reclaim to call us, we should
 920	 * never be called while in a filesystem transaction.
 921	 */
 922	if (WARN_ON(current->flags & PF_FSTRANS))
 923		goto redirty;
 924
 925	/* Is this page beyond the end of the file? */
 926	offset = i_size_read(inode);
 927	end_index = offset >> PAGE_CACHE_SHIFT;
 928	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
 929	if (page->index >= end_index) {
 930		if ((page->index >= end_index + 1) ||
 931		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
 932			unlock_page(page);
 933			return 0;
 934		}
 935	}
 936
 937	end_offset = min_t(unsigned long long,
 938			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 939			offset);
 940	len = 1 << inode->i_blkbits;
 941
 942	bh = head = page_buffers(page);
 943	offset = page_offset(page);
 944	type = IO_OVERWRITE;
 945
 946	if (wbc->sync_mode == WB_SYNC_NONE)
 947		nonblocking = 1;
 948
 949	do {
 950		int new_ioend = 0;
 951
 952		if (offset >= end_offset)
 953			break;
 954		if (!buffer_uptodate(bh))
 955			uptodate = 0;
 956
 957		/*
 958		 * set_page_dirty dirties all buffers in a page, independent
 959		 * of their state.  The dirty state however is entirely
 960		 * meaningless for holes (!mapped && uptodate), so skip
 961		 * buffers covering holes here.
 962		 */
 963		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 964			imap_valid = 0;
 965			continue;
 966		}
 967
 968		if (buffer_unwritten(bh)) {
 969			if (type != IO_UNWRITTEN) {
 970				type = IO_UNWRITTEN;
 971				imap_valid = 0;
 972			}
 973		} else if (buffer_delay(bh)) {
 974			if (type != IO_DELALLOC) {
 975				type = IO_DELALLOC;
 976				imap_valid = 0;
 977			}
 978		} else if (buffer_uptodate(bh)) {
 979			if (type != IO_OVERWRITE) {
 980				type = IO_OVERWRITE;
 981				imap_valid = 0;
 982			}
 983		} else {
 984			if (PageUptodate(page))
 985				ASSERT(buffer_mapped(bh));
 986			/*
 987			 * This buffer is not uptodate and will not be
 988			 * written to disk.  Ensure that we will put any
 989			 * subsequent writeable buffers into a new
 990			 * ioend.
 991			 */
 992			imap_valid = 0;
 993			continue;
 994		}
 995
 996		if (imap_valid)
 997			imap_valid = xfs_imap_valid(inode, &imap, offset);
 998		if (!imap_valid) {
 999			/*
1000			 * If we didn't have a valid mapping then we need to
1001			 * put the new mapping into a separate ioend structure.
1002			 * This ensures non-contiguous extents always have
1003			 * separate ioends, which is particularly important
1004			 * for unwritten extent conversion at I/O completion
1005			 * time.
1006			 */
1007			new_ioend = 1;
1008			err = xfs_map_blocks(inode, offset, &imap, type,
1009					     nonblocking);
1010			if (err)
1011				goto error;
1012			imap_valid = xfs_imap_valid(inode, &imap, offset);
1013		}
1014		if (imap_valid) {
1015			lock_buffer(bh);
1016			if (type != IO_OVERWRITE)
1017				xfs_map_at_offset(inode, bh, &imap, offset);
1018			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1019					 new_ioend);
1020			count++;
1021		}
1022
1023		if (!iohead)
1024			iohead = ioend;
1025
1026	} while (offset += len, ((bh = bh->b_this_page) != head));
1027
1028	if (uptodate && bh == head)
1029		SetPageUptodate(page);
1030
1031	xfs_start_page_writeback(page, 1, count);
1032
1033	if (ioend && imap_valid) {
1034		xfs_off_t		end_index;
1035
1036		end_index = imap.br_startoff + imap.br_blockcount;
1037
1038		/* to bytes */
1039		end_index <<= inode->i_blkbits;
1040
1041		/* to pages */
1042		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1043
1044		/* check against file size */
1045		if (end_index > last_index)
1046			end_index = last_index;
1047
1048		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1049				  wbc, end_index);
1050	}
1051
1052	if (iohead) {
1053		/*
1054		 * Reserve log space if we might write beyond the on-disk
1055		 * inode size.
1056		 */
1057		if (ioend->io_type != IO_UNWRITTEN &&
1058		    xfs_ioend_is_append(ioend)) {
1059			err = xfs_setfilesize_trans_alloc(ioend);
1060			if (err)
1061				goto error;
1062		}
1063
1064		xfs_submit_ioend(wbc, iohead);
1065	}
1066
1067	return 0;
1068
1069error:
1070	if (iohead)
1071		xfs_cancel_ioend(iohead);
1072
1073	if (err == -EAGAIN)
1074		goto redirty;
1075
1076	xfs_aops_discard_page(page);
1077	ClearPageUptodate(page);
1078	unlock_page(page);
1079	return err;
1080
1081redirty:
1082	redirty_page_for_writepage(wbc, page);
1083	unlock_page(page);
1084	return 0;
1085}
1086
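    /*
     * Clear the truncated flag and let the generic writeback code call back
     * into ->writepage for each dirty page.
     */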
1087STATIC int
1088xfs_vm_writepages(
1089	struct address_space	*mapping,
1090	struct writeback_control *wbc)
1091{
1092	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1093	return generic_writepages(mapping, wbc);
1094}
1095
1096/*
1097 * Called to move a page into cleanable state - and from there
1098 * to be released. The page should already be clean. We always
1099 * have buffer heads in this call.
1100 *
1101 * Returns 1 if the page is ok to release, 0 otherwise.
1102 */
1103STATIC int
1104xfs_vm_releasepage(
1105	struct page		*page,
1106	gfp_t			gfp_mask)
1107{
1108	int			delalloc, unwritten;
1109
1110	trace_xfs_releasepage(page->mapping->host, page, 0);
1111
1112	xfs_count_page_state(page, &delalloc, &unwritten);
1113
1114	if (WARN_ON(delalloc))
1115		return 0;
1116	if (WARN_ON(unwritten))
1117		return 0;
1118
1119	return try_to_free_buffers(page);
1120}
1121
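    /*
     * Shared implementation of get_blocks for the buffered and direct I/O
     * paths: map iblock to a disk block, allocating real or delalloc blocks
     * for writes into holes as needed, and fill in the buffer_head state.
     */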
1122STATIC int
1123__xfs_get_blocks(
1124	struct inode		*inode,
1125	sector_t		iblock,
1126	struct buffer_head	*bh_result,
1127	int			create,
1128	int			direct)
1129{
1130	struct xfs_inode	*ip = XFS_I(inode);
1131	struct xfs_mount	*mp = ip->i_mount;
1132	xfs_fileoff_t		offset_fsb, end_fsb;
1133	int			error = 0;
1134	int			lockmode = 0;
1135	struct xfs_bmbt_irec	imap;
1136	int			nimaps = 1;
1137	xfs_off_t		offset;
1138	ssize_t			size;
1139	int			new = 0;
1140
1141	if (XFS_FORCED_SHUTDOWN(mp))
1142		return -XFS_ERROR(EIO);
1143
1144	offset = (xfs_off_t)iblock << inode->i_blkbits;
1145	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1146	size = bh_result->b_size;
1147
1148	if (!create && direct && offset >= i_size_read(inode))
1149		return 0;
1150
1151	/*
1152	 * Direct I/O is usually done on preallocated files, so try getting
1153	 * a block mapping without an exclusive lock first.  For buffered
1154	 * writes we already have the exclusive iolock anyway, so avoiding
1155	 * a lock roundtrip here by taking the ilock exclusive from the
1156	 * beginning is a useful micro optimization.
1157	 */
1158	if (create && !direct) {
1159		lockmode = XFS_ILOCK_EXCL;
1160		xfs_ilock(ip, lockmode);
1161	} else {
1162		lockmode = xfs_ilock_map_shared(ip);
1163	}
1164
1165	ASSERT(offset <= mp->m_maxioffset);
1166	if (offset + size > mp->m_maxioffset)
1167		size = mp->m_maxioffset - offset;
1168	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1169	offset_fsb = XFS_B_TO_FSBT(mp, offset);
1170
1171	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1172				&imap, &nimaps, XFS_BMAPI_ENTIRE);
1173	if (error)
1174		goto out_unlock;
1175
1176	if (create &&
1177	    (!nimaps ||
1178	     (imap.br_startblock == HOLESTARTBLOCK ||
1179	      imap.br_startblock == DELAYSTARTBLOCK))) {
1180		if (direct || xfs_get_extsz_hint(ip)) {
1181			/*
1182			 * Drop the ilock in preparation for starting the block
1183			 * allocation transaction.  It will be retaken
1184			 * exclusively inside xfs_iomap_write_direct for the
1185			 * actual allocation.
1186			 */
1187			xfs_iunlock(ip, lockmode);
1188			error = xfs_iomap_write_direct(ip, offset, size,
1189						       &imap, nimaps);
1190			if (error)
1191				return -error;
1192			new = 1;
1193		} else {
1194			/*
1195			 * Delalloc reservations do not require a transaction,
1196			 * so we can go on without dropping the lock here.  If
1197			 * we are allocating a new delalloc block, make sure
1198			 * that we set the new flag so the buffer is marked new
1199			 * and we know that it is newly allocated if the write
1200			 * fails.
1201			 */
1202			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1203				new = 1;
1204			error = xfs_iomap_write_delay(ip, offset, size, &imap);
1205			if (error)
1206				goto out_unlock;
1207
1208			xfs_iunlock(ip, lockmode);
1209		}
1210
1211		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1212	} else if (nimaps) {
1213		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1214		xfs_iunlock(ip, lockmode);
1215	} else {
1216		trace_xfs_get_blocks_notfound(ip, offset, size);
1217		goto out_unlock;
1218	}
1219
1220	if (imap.br_startblock != HOLESTARTBLOCK &&
1221	    imap.br_startblock != DELAYSTARTBLOCK) {
1222		/*
1223		 * For unwritten extents do not report a disk address on
1224		 * the read case (treat as if we're reading into a hole).
1225		 */
1226		if (create || !ISUNWRITTEN(&imap))
1227			xfs_map_buffer(inode, bh_result, &imap, offset);
1228		if (create && ISUNWRITTEN(&imap)) {
1229			if (direct)
1230				bh_result->b_private = inode;
1231			set_buffer_unwritten(bh_result);
1232		}
1233	}
1234
1235	/*
1236	 * If this is a realtime file, data may be on a different device
1237	 * to that pointed to from the buffer_head b_bdev currently.
1238	 */
1239	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1240
1241	/*
1242	 * If we previously allocated a block out beyond eof and we are now
1243	 * coming back to use it then we will need to flag it as new even if it
1244	 * has a disk address.
1245	 *
1246	 * With sub-block writes into unwritten extents we also need to mark
1247	 * the buffer as new so that the unwritten parts of the buffer gets
1248	 * correctly zeroed.
1249	 */
1250	if (create &&
1251	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1252	     (offset >= i_size_read(inode)) ||
1253	     (new || ISUNWRITTEN(&imap))))
1254		set_buffer_new(bh_result);
1255
1256	if (imap.br_startblock == DELAYSTARTBLOCK) {
1257		BUG_ON(direct);
1258		if (create) {
1259			set_buffer_uptodate(bh_result);
1260			set_buffer_mapped(bh_result);
1261			set_buffer_delay(bh_result);
1262		}
1263	}
1264
1265	/*
1266	 * If this is O_DIRECT or the mpage code calling, tell them how large
1267	 * the mapping is, so that we can avoid repeated get_blocks calls.
1268	 */
1269	if (direct || size > (1 << inode->i_blkbits)) {
1270		xfs_off_t		mapping_size;
1271
1272		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1273		mapping_size <<= inode->i_blkbits;
1274
1275		ASSERT(mapping_size > 0);
1276		if (mapping_size > size)
1277			mapping_size = size;
1278		if (mapping_size > LONG_MAX)
1279			mapping_size = LONG_MAX;
1280
1281		bh_result->b_size = mapping_size;
1282	}
1283
1284	return 0;
1285
1286out_unlock:
1287	xfs_iunlock(ip, lockmode);
1288	return -error;
1289}
1290
1291int
1292xfs_get_blocks(
1293	struct inode		*inode,
1294	sector_t		iblock,
1295	struct buffer_head	*bh_result,
1296	int			create)
1297{
1298	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1299}
1300
1301STATIC int
1302xfs_get_blocks_direct(
1303	struct inode		*inode,
1304	sector_t		iblock,
1305	struct buffer_head	*bh_result,
1306	int			create)
1307{
1308	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1309}
1310
1311/*
1312 * Complete a direct I/O write request.
1313 *
1314 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1315 * need to issue a transaction to convert the range from unwritten to written
1316 * extents.  In case this is regular synchronous I/O we just call xfs_end_io
1317 * to do this and we are done.  But in case this was a successful AIO
1318 * request this handler is called from interrupt context, from which we
1319 * can't start transactions.  In that case offload the I/O completion to
1320 * the workqueues we also use for buffered I/O completion.
1321 */
1322STATIC void
1323xfs_end_io_direct_write(
1324	struct kiocb		*iocb,
1325	loff_t			offset,
1326	ssize_t			size,
1327	void			*private,
1328	int			ret,
1329	bool			is_async)
1330{
1331	struct xfs_ioend	*ioend = iocb->private;
1332
1333	/*
1334	 * While the generic direct I/O code updates the inode size, it does
1335	 * so only after the end_io handler is called, which means our
1336	 * end_io handler thinks the on-disk size is outside the in-core
1337	 * size.  To prevent this just update it a little bit earlier here.
1338	 */
1339	if (offset + size > i_size_read(ioend->io_inode))
1340		i_size_write(ioend->io_inode, offset + size);
1341
1342	/*
1343	 * blockdev_direct_IO can return an error even after the I/O
1344	 * completion handler was called.  Thus we need to protect
1345	 * against double-freeing.
1346	 */
1347	iocb->private = NULL;
1348
1349	ioend->io_offset = offset;
1350	ioend->io_size = size;
1351	ioend->io_iocb = iocb;
1352	ioend->io_result = ret;
1353	if (private && size > 0)
1354		ioend->io_type = IO_UNWRITTEN;
1355
1356	if (is_async) {
1357		ioend->io_isasync = 1;
1358		xfs_finish_ioend(ioend);
1359	} else {
1360		xfs_finish_ioend_sync(ioend);
1361	}
1362}
1363
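    /*
     * For direct writes we allocate an ioend up front and, if the write may
     * extend the on-disk file size, a size update transaction as well; the
     * rest is handled by the generic direct I/O code with our completion
     * callback.  Direct reads go straight to the generic code.
     */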
1364STATIC ssize_t
1365xfs_vm_direct_IO(
1366	int			rw,
1367	struct kiocb		*iocb,
1368	const struct iovec	*iov,
1369	loff_t			offset,
1370	unsigned long		nr_segs)
1371{
1372	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1373	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
1374	struct xfs_ioend	*ioend = NULL;
1375	ssize_t			ret;
1376
1377	if (rw & WRITE) {
1378		size_t size = iov_length(iov, nr_segs);
1379
1380		/*
1381		 * We need to preallocate a transaction for a size update
1382		 * here.  In the case that this write both updates the size
1383		 * and converts at least one unwritten extent we will cancel
1384		 * the still clean transaction after the I/O has finished.
1385		 */
1386		iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
1387		if (offset + size > XFS_I(inode)->i_d.di_size) {
1388			ret = xfs_setfilesize_trans_alloc(ioend);
1389			if (ret)
1390				goto out_destroy_ioend;
1391			ioend->io_isdirect = 1;
1392		}
1393
1394		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1395					    offset, nr_segs,
1396					    xfs_get_blocks_direct,
1397					    xfs_end_io_direct_write, NULL, 0);
1398		if (ret != -EIOCBQUEUED && iocb->private)
1399			goto out_trans_cancel;
1400	} else {
1401		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1402					    offset, nr_segs,
1403					    xfs_get_blocks_direct,
1404					    NULL, NULL, 0);
1405	}
1406
1407	return ret;
1408
1409out_trans_cancel:
1410	if (ioend->io_append_trans) {
1411		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1412					 PF_FSTRANS);
1413		xfs_trans_cancel(ioend->io_append_trans, 0);
1414	}
1415out_destroy_ioend:
1416	xfs_destroy_ioend(ioend);
1417	return ret;
1418}
1419
1420/*
1421 * Punch out the delalloc blocks we have already allocated.
1422 *
1423 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1424 * as the page is still locked at this point.
1425 */
1426STATIC void
1427xfs_vm_kill_delalloc_range(
1428	struct inode		*inode,
1429	loff_t			start,
1430	loff_t			end)
1431{
1432	struct xfs_inode	*ip = XFS_I(inode);
1433	xfs_fileoff_t		start_fsb;
1434	xfs_fileoff_t		end_fsb;
1435	int			error;
1436
1437	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1438	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1439	if (end_fsb <= start_fsb)
1440		return;
1441
1442	xfs_ilock(ip, XFS_ILOCK_EXCL);
1443	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1444						end_fsb - start_fsb);
1445	if (error) {
1446		/* something screwed, just bail */
1447		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1448			xfs_alert(ip->i_mount,
1449		"xfs_vm_write_failed: unable to clean up ino %lld",
1450					ip->i_ino);
1451		}
1452	}
1453	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1454}
1455
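    /*
     * Walk the buffers over the range of a failed write and punch out the
     * delalloc blocks backing them.  Only delalloc buffers that are new or
     * sit beyond the in-core inode size are affected.
     */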
1456STATIC void
1457xfs_vm_write_failed(
1458	struct inode		*inode,
1459	struct page		*page,
1460	loff_t			pos,
1461	unsigned		len)
1462{
1463	loff_t			block_offset = pos & PAGE_MASK;
1464	loff_t			block_start;
1465	loff_t			block_end;
1466	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
1467	loff_t			to = from + len;
1468	struct buffer_head	*bh, *head;
1469
1470	ASSERT(block_offset + from == pos);
1471
1472	head = page_buffers(page);
1473	block_start = 0;
1474	for (bh = head; bh != head || !block_start;
1475	     bh = bh->b_this_page, block_start = block_end,
1476				   block_offset += bh->b_size) {
1477		block_end = block_start + bh->b_size;
1478
1479		/* skip buffers before the write */
1480		if (block_end <= from)
1481			continue;
1482
1483		/* if the buffer is after the write, we're done */
1484		if (block_start >= to)
1485			break;
1486
1487		if (!buffer_delay(bh))
1488			continue;
1489
1490		if (!buffer_new(bh) && block_offset < i_size_read(inode))
1491			continue;
1492
1493		xfs_vm_kill_delalloc_range(inode, block_offset,
1494					   block_offset + bh->b_size);
1495	}
1496
1497}
1498
1499/*
1500 * This used to call block_write_begin(), but it unlocks and releases the page
1501 * on error, and we need that page to be able to punch stale delalloc blocks out
1502 * on failure. Hence we copy-n-waste it here and call xfs_vm_write_failed() at
1503 * the appropriate point.
1504 */
1505STATIC int
1506xfs_vm_write_begin(
1507	struct file		*file,
1508	struct address_space	*mapping,
1509	loff_t			pos,
1510	unsigned		len,
1511	unsigned		flags,
1512	struct page		**pagep,
1513	void			**fsdata)
1514{
1515	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
1516	struct page		*page;
1517	int			status;
1518
1519	ASSERT(len <= PAGE_CACHE_SIZE);
1520
1521	page = grab_cache_page_write_begin(mapping, index,
1522					   flags | AOP_FLAG_NOFS);
1523	if (!page)
1524		return -ENOMEM;
1525
1526	status = __block_write_begin(page, pos, len, xfs_get_blocks);
1527	if (unlikely(status)) {
1528		struct inode	*inode = mapping->host;
1529
1530		xfs_vm_write_failed(inode, page, pos, len);
1531		unlock_page(page);
1532
1533		if (pos + len > i_size_read(inode))
1534			truncate_pagecache(inode, pos + len, i_size_read(inode));
1535
1536		page_cache_release(page);
1537		page = NULL;
1538	}
1539
1540	*pagep = page;
1541	return status;
1542}
1543
1544/*
1545 * On failure, we only need to kill delalloc blocks beyond EOF because they
1546 * will never be written. For blocks within EOF, generic_write_end() zeros them
1547 * so they are safe to leave alone and be written with all the other valid data.
1548 */
1549STATIC int
1550xfs_vm_write_end(
1551	struct file		*file,
1552	struct address_space	*mapping,
1553	loff_t			pos,
1554	unsigned		len,
1555	unsigned		copied,
1556	struct page		*page,
1557	void			*fsdata)
1558{
1559	int			ret;
1560
1561	ASSERT(len <= PAGE_CACHE_SIZE);
1562
1563	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1564	if (unlikely(ret < len)) {
1565		struct inode	*inode = mapping->host;
1566		size_t		isize = i_size_read(inode);
1567		loff_t		to = pos + len;
1568
1569		if (to > isize) {
1570			truncate_pagecache(inode, to, isize);
1571			xfs_vm_kill_delalloc_range(inode, isize, to);
1572		}
1573	}
1574	return ret;
1575}
1576
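    /*
     * Flush dirty pagecache over the inode before handing the block mapping
     * request off to the generic bmap code.
     */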
1577STATIC sector_t
1578xfs_vm_bmap(
1579	struct address_space	*mapping,
1580	sector_t		block)
1581{
1582	struct inode		*inode = (struct inode *)mapping->host;
1583	struct xfs_inode	*ip = XFS_I(inode);
1584
1585	trace_xfs_vm_bmap(XFS_I(inode));
1586	xfs_ilock(ip, XFS_IOLOCK_SHARED);
1587	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1588	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1589	return generic_block_bmap(mapping, block, xfs_get_blocks);
1590}
1591
1592STATIC int
1593xfs_vm_readpage(
1594	struct file		*unused,
1595	struct page		*page)
1596{
1597	return mpage_readpage(page, xfs_get_blocks);
1598}
1599
1600STATIC int
1601xfs_vm_readpages(
1602	struct file		*unused,
1603	struct address_space	*mapping,
1604	struct list_head	*pages,
1605	unsigned		nr_pages)
1606{
1607	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1608}
1609
1610const struct address_space_operations xfs_address_space_operations = {
1611	.readpage		= xfs_vm_readpage,
1612	.readpages		= xfs_vm_readpages,
1613	.writepage		= xfs_vm_writepage,
1614	.writepages		= xfs_vm_writepages,
1615	.releasepage		= xfs_vm_releasepage,
1616	.invalidatepage		= xfs_vm_invalidatepage,
1617	.write_begin		= xfs_vm_write_begin,
1618	.write_end		= xfs_vm_write_end,
1619	.bmap			= xfs_vm_bmap,
1620	.direct_IO		= xfs_vm_direct_IO,
1621	.migratepage		= buffer_migrate_page,
1622	.is_partially_uptodate  = block_is_partially_uptodate,
1623	.error_remove_page	= generic_error_remove_page,
1624};