/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc. and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
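/*
 * A minimal userspace sketch of the model described above: the pipe acts
 * as the in-kernel buffer, and splice() moves data into and out of it.
 * Illustrative only, not part of this file; error handling is abbreviated
 * and the 64k chunk size is an arbitrary choice.
 */
#if 0 /* userspace example, not kernel code */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int copy_through_pipe(int in_fd, int out_fd)
{
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return -1;

	/* file -> pipe, then pipe -> destination, with no userspace copy */
	while ((n = splice(in_fd, NULL, pfd[1], NULL, 65536, 0)) > 0) {
		while (n > 0) {
			ssize_t m = splice(pfd[0], NULL, out_fd, NULL, n, 0);
			if (m <= 0)
				goto out;
			n -= m;
		}
	}
out:
	close(pfd[0]);
	close(pfd[1]);
	return n < 0 ? -1 : 0;
}
#endif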
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is OK after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	pipe_lock(pipe);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < pipe->buffers)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);

	if (do_wakeup)
		wakeup_pipe_readers(pipe);

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
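
/*
 * A hedged sketch of a typical splice_to_pipe() caller: build a
 * splice_pipe_desc over pages the caller already holds references on,
 * then let splice_to_pipe() link them in. The helper name is hypothetical
 * and the single-page case is chosen for brevity.
 */
#if 0 /* illustrative sketch, not part of this file */
static ssize_t example_splice_one_page(struct pipe_inode_info *pipe,
				       struct page *page, unsigned int offset,
				       unsigned int len, unsigned int flags)
{
	struct page *pages[1] = { page };
	struct partial_page partial[1] = {
		{ .offset = offset, .len = len },
	};
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages = 1,
		.nr_pages_max = 1,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	/* on success the pipe now owns the page reference */
	return splice_to_pipe(pipe, &spd);
}
#endif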

void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}

/*
 * Check if we need to grow the arrays holding pages and partial page
 * descriptions.
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
	unsigned int buffers = ACCESS_ONCE(pipe->buffers);

	spd->nr_pages_max = buffers;
	if (buffers <= PIPE_DEF_BUFFERS)
		return 0;

	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);

	if (spd->pages && spd->partial)
		return 0;

	kfree(spd->pages);
	kfree(spd->partial);
	return -ENOMEM;
}

void splice_shrink_spd(struct splice_pipe_desc *spd)
{
	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
		return;

	kfree(spd->pages);
	kfree(spd->partial);
}
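
/*
 * The grow/shrink pair above exists because userspace can resize a pipe
 * past the PIPE_DEF_BUFFERS default. A hedged userspace illustration,
 * assuming pfd is an already created pipe:
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>

/* ask for room for 64 pages of buffers; the kernel may round this up */
fcntl(pfd[1], F_SETPIPE_SZ, 64 * 4096);
#endif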

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, spd.nr_pages_max);

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		spd.pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			lock_page(page);

			/*
			 * Page was truncated, or invalidated by the
			 * filesystem.  Redo the find/create, but this time the
			 * page is kept locked, so there's no chance of another
			 * race with truncate/invalidate.
			 */
			if (!page->mapping) {
				unlock_page(page);
				page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));

				if (!page) {
					error = -ENOMEM;
					break;
				}
				page_cache_release(spd.pages[page_nr]);
				spd.pages[page_nr] = page;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
	if (ret > 0) {
		*ppos += ret;
		file_accessed(in);
	}

	return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);
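
/*
 * A hedged sketch of how a filesystem typically wires this helper up; the
 * struct name is hypothetical, and the other methods shown are the common
 * generic choices of this kernel era:
 */
#if 0 /* illustrative sketch, not part of this file */
static const struct file_operations example_fops = {
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};
#endif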

static const struct pipe_buf_operations default_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
			    unsigned long vlen, loff_t offset)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
	set_fs(old_fs);

	return res;
}

static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
			    loff_t pos)
{
	mm_segment_t old_fs;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_write(file, (const char __user *)buf, count, &pos);
	set_fs(old_fs);

	return res;
}

ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	unsigned int nr_pages;
	unsigned int nr_freed;
	size_t offset;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
	ssize_t res;
	size_t this_len;
	int error;
	int i;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &default_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	res = -ENOMEM;
	vec = __vec;
	if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
		vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
		if (!vec)
			goto shrink_ret;
	}

	offset = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
		struct page *page;

		page = alloc_page(GFP_USER);
		error = -ENOMEM;
		if (!page)
			goto err;

		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
		vec[i].iov_base = (void __user *) page_address(page);
		vec[i].iov_len = this_len;
		spd.pages[i] = page;
		spd.nr_pages++;
		len -= this_len;
		offset = 0;
	}

	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
	if (res < 0) {
		error = res;
		goto err;
	}

	error = 0;
	if (!res)
		goto err;

	nr_freed = 0;
	for (i = 0; i < spd.nr_pages; i++) {
		this_len = min_t(size_t, vec[i].iov_len, res);
		spd.partial[i].offset = 0;
		spd.partial[i].len = this_len;
		if (!this_len) {
			__free_page(spd.pages[i]);
			spd.pages[i] = NULL;
			nr_freed++;
		}
		res -= this_len;
	}
	spd.nr_pages -= nr_freed;

	res = splice_to_pipe(pipe, &spd);
	if (res > 0)
		*ppos += res;

shrink_ret:
	if (vec != __vec)
		kfree(vec);
	splice_shrink_spd(&spd);
	return res;

err:
	for (i = 0; i < spd.nr_pages; i++)
		__free_page(spd.pages[i]);

	res = error;
	goto shrink_ret;
}
EXPORT_SYMBOL(default_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int more;

	if (!likely(file->f_op && file->f_op->sendpage))
		return -EINVAL;

	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
	if (sd->len < sd->total_len)
		more |= MSG_SENDPAGE_NOTLAST;
	return file->f_op->sendpage(file, buf->page, buf->offset,
				    sd->len, &pos, more);
}
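
/*
 * The MSG_MORE mapping above is driven by SPLICE_F_MORE. A hedged
 * userspace illustration, assuming pfd[0] is a pipe already holding data
 * and sock_fd is a connected TCP socket:
 */
#if 0 /* userspace example, not kernel code */
/* first chunk: tell the stack more data follows so it can batch */
splice(pfd[0], NULL, sock_fd, NULL, chunk_len, SPLICE_F_MORE);
/* last chunk: no flag, let the stack push the data out */
splice(pfd[0], NULL, sock_fd, NULL, last_len, 0);
#endif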

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		 struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	if (buf->page != page) {
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
EXPORT_SYMBOL(pipe_to_file);

static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there are no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
		const struct pipe_buf_operations *ops = buf->ops;

		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret)) {
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		if (!buf->len) {
			buf->ops = NULL;
			ops->release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
			pipe->nrbufs--;
			if (pipe->inode)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_feed);

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
 */
int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	while (!pipe->nrbufs) {
		if (!pipe->writers)
			return 0;

		if (!pipe->waiting_writers && sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait(pipe);
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_next);

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
void splice_from_pipe_begin(struct splice_desc *sd)
{
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}
EXPORT_SYMBOL(splice_from_pipe_begin);

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary.  It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}
EXPORT_SYMBOL(splice_from_pipe_end);

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret;

	splice_from_pipe_begin(sd);
	do {
		ret = splice_from_pipe_next(pipe, sd);
		if (ret > 0)
			ret = splice_from_pipe_feed(pipe, sd, actor);
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);

	return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, actor);
	pipe_unlock(pipe);

	return ret;
}
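
/*
 * splice_from_pipe() is the usual building block for a driver's
 * ->splice_write(): supply an actor that consumes one pipe_buffer at a
 * time, much like write_pipe_buf() below. A hedged sketch;
 * example_device_write() is hypothetical.
 */
#if 0 /* illustrative sketch, not part of this file */
static int example_pipe_to_device(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf,
				  struct splice_desc *sd)
{
	void *data = buf->ops->map(pipe, buf, 0);
	int ret = example_device_write(sd->u.file, data + buf->offset,
				       sd->len, sd->pos);
	buf->ops->unmap(pipe, buf, data);
	return ret;
}

static ssize_t example_splice_write(struct pipe_inode_info *pipe,
				    struct file *out, loff_t *ppos,
				    size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags,
				example_pipe_to_device);
}
#endif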

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	do {
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
		ret = file_remove_suid(out);
		if (!ret) {
			ret = file_update_time(out);
			if (!ret)
				ret = splice_from_pipe_feed(pipe, &sd,
							    pipe_to_file);
		}
		mutex_unlock(&inode->i_mutex);
	} while (ret > 0);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	if (sd.num_spliced)
		ret = sd.num_spliced;

	if (ret > 0) {
		unsigned long nr_pages;
		int err;

		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		err = generic_write_sync(out, *ppos, ret);
		if (err)
			ret = err;
		else
			*ppos += ret;
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			  struct splice_desc *sd)
{
	int ret;
	void *data;

	data = buf->ops->map(pipe, buf, 0);
	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
	buf->ops->unmap(pipe, buf, data);

	return ret;
}

static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
					 struct file *out, loff_t *ppos,
					 size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
	if (ret > 0)
		*ppos += ret;

	return ret;
}

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
				loff_t *, size_t, unsigned int);
	int ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (out->f_op && out->f_op->splice_write)
		splice_write = out->f_op->splice_write;
	else
		splice_write = default_file_splice_write;

	return splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	ssize_t (*splice_read)(struct file *, loff_t *,
			       struct pipe_inode_info *, size_t, unsigned int);
	int ret;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (in->f_op && in->f_op->splice_read)
		splice_read = in->f_op->splice_read;
	else
		splice_read = default_file_splice_read;

	return splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for e.g. socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
			      sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
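
/*
 * do_splice_direct() is what lets sendfile(2) stay a single system call.
 * Without it, userspace would need the two-splice dance sketched below
 * (hedged illustration, error handling abbreviated):
 */
#if 0 /* userspace example, not kernel code */
/* roughly what sendfile(out_fd, in_fd, NULL, count) would cost by hand */
int pfd[2];
pipe(pfd);
while (count) {
	ssize_t n = splice(in_fd, NULL, pfd[1], NULL, count, 0);
	if (n <= 0)
		break;
	/* a real loop must drain the pipe completely before continuing */
	splice(pfd[0], NULL, out_fd, NULL, n, 0);
	count -= n;
}
#endif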

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *off;
	long ret;

	ipipe = get_pipe_info(in);
	opipe = get_pipe_info(out);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		if (!(in->f_mode & FMODE_READ))
			return -EBADF;

		if (!(out->f_mode & FMODE_WRITE))
			return -EBADF;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(ipipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, opipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our single pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, bool aligned,
				unsigned int pipe_buffers)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > pipe_buffers - buffers)
			npages = pipe_buffers - buffers;

		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == pipe_buffers)
			break;

		nr_vecs--;
		iov++;
	}

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	pipe_lock(pipe);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	pipe_unlock(pipe);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
					    spd.partial, false,
					    spd.nr_pages_max);
	if (spd.nr_pages <= 0)
		ret = spd.nr_pages;
	else
		ret = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return ret;
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user pages and fill them into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}
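
/*
 * Userspace view of the above: vmsplice() maps an iovec into a pipe. A
 * hedged sketch; buf is assumed page-aligned, and SPLICE_F_GIFT promises
 * the pages won't be touched again, which is what enables the steal path
 * in user_page_pipe_buf_steal() above.
 */
#if 0 /* userspace example, not kernel code */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>

struct iovec iov = {
	.iov_base = buf,	/* assumed page-aligned */
	.iov_len  = 4096,
};
vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
#endif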

SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < pipe->buffers)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe->nrbufs >= pipe->buffers) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Splice contents of ipipe to opipe.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, nbuf;
	bool input_wakeup = false;

retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (!ipipe->nrbufs && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers.  A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = ipipe->bufs + ipipe->curbuf;
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
		obuf = opipe->bufs + nbuf;

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			opipe->nrbufs++;
			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
			ipipe->nrbufs--;
			input_wakeup = true;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			ibuf->ops->get(ipipe, ibuf);
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift flag, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

			obuf->len = len;
			opipe->nrbufs++;
			ibuf->offset += obuf->len;
			ibuf->len -= obuf->len;
		}
		ret += obuf->len;
		len -= obuf->len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * Return -EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0.
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in);
	struct pipe_inode_info *opipe = get_pipe_info(out);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	return ret;
}

SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
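
/*
 * A hedged userspace sketch of tee(2) in action: duplicate whatever
 * arrives on stdin (which must be a pipe) into a second pipe pfd without
 * consuming it, then drain stdin into file_fd. pfd and file_fd are
 * assumed to be set up already; error handling is omitted.
 */
#if 0 /* userspace example, not kernel code */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

for (;;) {
	ssize_t n = tee(STDIN_FILENO, pfd[1], 65536, 0);
	if (n <= 0)
		break;
	/* the teed bytes are still readable on stdin; drain them too */
	splice(STDIN_FILENO, NULL, file_fd, NULL, n, SPLICE_F_MOVE);
}
#endif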