v6.9.4
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29	.llseek		= generic_file_llseek,
  30	.read_iter	= generic_file_read_iter,
  31	.mmap		= generic_file_readonly_mmap,
  32	.splice_read	= filemap_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39	return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:	file structure in question
  45 * @offset:	file offset to seek to
  46 * @maxsize:	maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56	if (offset < 0 && !unsigned_offsets(file))
  57		return -EINVAL;
  58	if (offset > maxsize)
  59		return -EINVAL;
  60
  61	if (offset != file->f_pos) {
  62		file->f_pos = offset;
  63		file->f_version = 0;
  64	}
  65	return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:	file structure to seek on
  72 * @offset:	file offset to seek to
  73 * @whence:	type of seek
  74 * @maxsize:	max size of this file in file system
  75 * @eof:	offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, e.g. for hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87		loff_t maxsize, loff_t eof)
  88{
  89	switch (whence) {
  90	case SEEK_END:
  91		offset += eof;
  92		break;
  93	case SEEK_CUR:
  94		/*
  95		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96		 * position-querying operation.  Avoid rewriting the "same"
  97		 * f_pos value back to the file because a concurrent read(),
  98		 * write() or lseek() might have altered it
  99		 */
 100		if (offset == 0)
 101			return file->f_pos;
 102		/*
 103		 * f_lock protects against read/modify/write race with other
 104		 * SEEK_CURs. Note that parallel writes and reads behave
 105		 * like SEEK_SET.
 106		 */
 107		spin_lock(&file->f_lock);
 108		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109		spin_unlock(&file->f_lock);
 110		return offset;
 111	case SEEK_DATA:
 112		/*
 113		 * In the generic case the entire file is data, so as long as
 114		 * offset isn't at the end of the file then the offset is data.
 115		 */
 116		if ((unsigned long long)offset >= eof)
 117			return -ENXIO;
 118		break;
 119	case SEEK_HOLE:
 120		/*
 121		 * There is a virtual hole at the end of the file, so as long as
 122		 * offset isn't i_size or larger, return i_size.
 123		 */
 124		if ((unsigned long long)offset >= eof)
 125			return -ENXIO;
 126		offset = eof;
 127		break;
 128	}
 129
 130	return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:	file structure to seek on
 137 * @offset:	file offset to seek to
 138 * @whence:	type of seek
 139 *
 140 * This is a generic implementation of ->llseek usable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146	struct inode *inode = file->f_mapping->host;
 147
 148	return generic_file_llseek_size(file, offset, whence,
 149					inode->i_sb->s_maxbytes,
 150					i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:	file structure to seek on
 157 * @offset:	file offset to seek to
 158 * @whence:	type of seek
 159 * @size:	size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164	switch (whence) {
 165	case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166		return generic_file_llseek_size(file, offset, whence,
 167						size, size);
 168	default:
 169		return -EINVAL;
 170	}
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
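
/*
 * Illustrative sketch of how a driver might use the helper above: a device
 * exposing a fixed-size region can implement ->llseek by delegating to
 * fixed_size_llseek() with that size.  example_dev_llseek and
 * EXAMPLE_DEV_SIZE are hypothetical names.
 */
static loff_t example_dev_llseek(struct file *file, loff_t offset, int whence)
{
	return fixed_size_llseek(file, offset, whence, EXAMPLE_DEV_SIZE);
}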
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:	file structure to seek on
 177 * @offset:	file offset to seek to
 178 * @whence:	type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183	switch (whence) {
 184	case SEEK_SET: case SEEK_CUR:
 185		return generic_file_llseek_size(file, offset, whence,
 186						OFFSET_MAX, 0);
 187	default:
 188		return -EINVAL;
 189	}
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:	file structure to seek on
 196 * @offset:	file offset to seek to
 197 * @whence:	type of seek
 198 * @size:	maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203	switch (whence) {
 204	case SEEK_SET: case SEEK_CUR:
 205		return generic_file_llseek_size(file, offset, whence,
 206						size, 0);
 207	default:
 208		return -EINVAL;
 209	}
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:	file structure to seek on
 216 * @offset:	file offset to seek to
 217 * @whence:	type of seek
 218 *
 219 * This is an implementation of ->llseek usable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226	return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
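
/*
 * Illustrative sketch of the case described in the noop_llseek() comment
 * above: a driver whose users expect lseek() to succeed even though the
 * device cannot seek simply wires noop_llseek() into its file_operations.
 * example_noseek_fops is a hypothetical name.
 */
static const struct file_operations example_noseek_fops = {
	.llseek	= noop_llseek,
	/* the driver's own ->read, ->write, etc. go here */
};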
 229
 230loff_t default_llseek(struct file *file, loff_t offset, int whence)
 231{
 232	struct inode *inode = file_inode(file);
 233	loff_t retval;
 234
 235	inode_lock(inode);
 236	switch (whence) {
 237		case SEEK_END:
 238			offset += i_size_read(inode);
 239			break;
 240		case SEEK_CUR:
 241			if (offset == 0) {
 242				retval = file->f_pos;
 243				goto out;
 244			}
 245			offset += file->f_pos;
 246			break;
 247		case SEEK_DATA:
 248			/*
 249			 * In the generic case the entire file is data, so as
 250			 * long as offset isn't at the end of the file then the
 251			 * offset is data.
 252			 */
 253			if (offset >= inode->i_size) {
 254				retval = -ENXIO;
 255				goto out;
 256			}
 257			break;
 258		case SEEK_HOLE:
 259			/*
 260			 * There is a virtual hole at the end of the file, so
 261			 * as long as offset isn't i_size or larger, return
 262			 * i_size.
 263			 */
 264			if (offset >= inode->i_size) {
 265				retval = -ENXIO;
 266				goto out;
 267			}
 268			offset = inode->i_size;
 269			break;
 270	}
 271	retval = -EINVAL;
 272	if (offset >= 0 || unsigned_offsets(file)) {
 273		if (offset != file->f_pos) {
 274			file->f_pos = offset;
 275			file->f_version = 0;
 276		}
 277		retval = offset;
 278	}
 279out:
 280	inode_unlock(inode);
 281	return retval;
 282}
 283EXPORT_SYMBOL(default_llseek);
 284
 285loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 286{
 287	if (!(file->f_mode & FMODE_LSEEK))
 288		return -ESPIPE;
 289	return file->f_op->llseek(file, offset, whence);
 290}
 291EXPORT_SYMBOL(vfs_llseek);
 292
 293static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 294{
 295	off_t retval;
 296	struct fd f = fdget_pos(fd);
 297	if (!f.file)
 298		return -EBADF;
 299
 300	retval = -EINVAL;
 301	if (whence <= SEEK_MAX) {
 302		loff_t res = vfs_llseek(f.file, offset, whence);
 303		retval = res;
 304		if (res != (loff_t)retval)
 305			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 306	}
 307	fdput_pos(f);
 308	return retval;
 309}
 310
 311SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 312{
 313	return ksys_lseek(fd, offset, whence);
 314}
 315
 316#ifdef CONFIG_COMPAT
 317COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 318{
 319	return ksys_lseek(fd, offset, whence);
 320}
 321#endif
 322
 323#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
 324	defined(__ARCH_WANT_SYS_LLSEEK)
 325SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 326		unsigned long, offset_low, loff_t __user *, result,
 327		unsigned int, whence)
 328{
 329	int retval;
 330	struct fd f = fdget_pos(fd);
 331	loff_t offset;
 332
 333	if (!f.file)
 334		return -EBADF;
 335
 336	retval = -EINVAL;
 337	if (whence > SEEK_MAX)
 338		goto out_putf;
 339
 340	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 341			whence);
 342
 343	retval = (int)offset;
 344	if (offset >= 0) {
 345		retval = -EFAULT;
 346		if (!copy_to_user(result, &offset, sizeof(offset)))
 347			retval = 0;
 348	}
 349out_putf:
 350	fdput_pos(f);
 351	return retval;
 352}
 353#endif
 354
 355int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 356{
 357	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
 358	int ret;
 359
 360	if (unlikely((ssize_t) count < 0))
 361		return -EINVAL;
 362
 363	if (ppos) {
 364		loff_t pos = *ppos;
 365
 366		if (unlikely(pos < 0)) {
 367			if (!unsigned_offsets(file))
 368				return -EINVAL;
 369			if (count >= -pos) /* both values are in 0..LLONG_MAX */
 370				return -EOVERFLOW;
 371		} else if (unlikely((loff_t) (pos + count) < 0)) {
 372			if (!unsigned_offsets(file))
 373				return -EINVAL;
 374		}
 375	}
 376
 377	ret = security_file_permission(file, mask);
 378	if (ret)
 379		return ret;
 380
 381	return fsnotify_file_area_perm(file, mask, ppos, count);
 382}
 383EXPORT_SYMBOL(rw_verify_area);
 384
 385static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 386{
 387	struct kiocb kiocb;
 388	struct iov_iter iter;
 389	ssize_t ret;
 390
 391	init_sync_kiocb(&kiocb, filp);
 392	kiocb.ki_pos = (ppos ? *ppos : 0);
 393	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
 394
 395	ret = call_read_iter(filp, &kiocb, &iter);
 396	BUG_ON(ret == -EIOCBQUEUED);
 397	if (ppos)
 398		*ppos = kiocb.ki_pos;
 399	return ret;
 400}
 401
 402static int warn_unsupported(struct file *file, const char *op)
 403{
 404	pr_warn_ratelimited(
 405		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
 406		op, file, current->pid, current->comm);
 407	return -EINVAL;
 408}
 409
 410ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 411{
 412	struct kvec iov = {
 413		.iov_base	= buf,
 414		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 415	};
 416	struct kiocb kiocb;
 417	struct iov_iter iter;
 418	ssize_t ret;
 419
 420	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 421		return -EINVAL;
 422	if (!(file->f_mode & FMODE_CAN_READ))
 423		return -EINVAL;
 424	/*
 425	 * Also fail if ->read_iter and ->read are both wired up as that
 426	 * implies very convoluted semantics.
 427	 */
 428	if (unlikely(!file->f_op->read_iter || file->f_op->read))
 429		return warn_unsupported(file, "read");
 430
 431	init_sync_kiocb(&kiocb, file);
 432	kiocb.ki_pos = pos ? *pos : 0;
 433	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
 434	ret = file->f_op->read_iter(&kiocb, &iter);
 435	if (ret > 0) {
 436		if (pos)
 437			*pos = kiocb.ki_pos;
 438		fsnotify_access(file);
 439		add_rchar(current, ret);
 440	}
 441	inc_syscr(current);
 442	return ret;
 443}
 444
 445ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 446{
 447	ssize_t ret;
 448
 449	ret = rw_verify_area(READ, file, pos, count);
 450	if (ret)
 451		return ret;
 452	return __kernel_read(file, buf, count, pos);
 453}
 454EXPORT_SYMBOL(kernel_read);
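
/*
 * Illustrative sketch of an in-kernel caller of kernel_read(): pairing it
 * with filp_open()/filp_close() and a caller-supplied kernel buffer.
 * example_load_blob and its arguments are hypothetical.
 */
static ssize_t example_load_blob(const char *path, void *buf, size_t len)
{
	struct file *filp;
	loff_t pos = 0;
	ssize_t ret;

	filp = filp_open(path, O_RDONLY, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	ret = kernel_read(filp, buf, len, &pos);

	filp_close(filp, NULL);
	return ret;
}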
 455
 456ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 457{
 458	ssize_t ret;
 459
 460	if (!(file->f_mode & FMODE_READ))
 461		return -EBADF;
 462	if (!(file->f_mode & FMODE_CAN_READ))
 463		return -EINVAL;
 464	if (unlikely(!access_ok(buf, count)))
 465		return -EFAULT;
 466
 467	ret = rw_verify_area(READ, file, pos, count);
 468	if (ret)
 469		return ret;
 470	if (count > MAX_RW_COUNT)
 471		count =  MAX_RW_COUNT;
 472
 473	if (file->f_op->read)
 474		ret = file->f_op->read(file, buf, count, pos);
 475	else if (file->f_op->read_iter)
 476		ret = new_sync_read(file, buf, count, pos);
 477	else
 478		ret = -EINVAL;
 479	if (ret > 0) {
 480		fsnotify_access(file);
 481		add_rchar(current, ret);
 482	}
 483	inc_syscr(current);
 484	return ret;
 485}
 486
 487static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 488{
 489	struct kiocb kiocb;
 490	struct iov_iter iter;
 491	ssize_t ret;
 492
 493	init_sync_kiocb(&kiocb, filp);
 494	kiocb.ki_pos = (ppos ? *ppos : 0);
 495	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
 496
 497	ret = call_write_iter(filp, &kiocb, &iter);
 498	BUG_ON(ret == -EIOCBQUEUED);
 499	if (ret > 0 && ppos)
 500		*ppos = kiocb.ki_pos;
 501	return ret;
 502}
 503
 504/* caller is responsible for file_start_write/file_end_write */
 505ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
 506{
 507	struct kiocb kiocb;
 508	ssize_t ret;
 509
 510	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 511		return -EBADF;
 512	if (!(file->f_mode & FMODE_CAN_WRITE))
 513		return -EINVAL;
 514	/*
 515	 * Also fail if ->write_iter and ->write are both wired up as that
 516	 * implies very convoluted semantics.
 517	 */
 518	if (unlikely(!file->f_op->write_iter || file->f_op->write))
 519		return warn_unsupported(file, "write");
 520
 521	init_sync_kiocb(&kiocb, file);
 522	kiocb.ki_pos = pos ? *pos : 0;
 523	ret = file->f_op->write_iter(&kiocb, from);
 524	if (ret > 0) {
 525		if (pos)
 526			*pos = kiocb.ki_pos;
 527		fsnotify_modify(file);
 528		add_wchar(current, ret);
 529	}
 530	inc_syscw(current);
 531	return ret;
 532}
 533
 534/* caller is responsible for file_start_write/file_end_write */
 535ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 536{
 537	struct kvec iov = {
 538		.iov_base	= (void *)buf,
 539		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 540	};
 541	struct iov_iter iter;
 542	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
 543	return __kernel_write_iter(file, &iter, pos);
 544}
 545/*
 546 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 547 * but autofs is one of the few internal kernel users that actually
 548 * wants this _and_ can be built as a module. So we need to export
 549 * this symbol for autofs, even though it really isn't appropriate
 550 * for any other kernel modules.
 551 */
 552EXPORT_SYMBOL_GPL(__kernel_write);
 553
 554ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 555			    loff_t *pos)
 556{
 557	ssize_t ret;
 558
 559	ret = rw_verify_area(WRITE, file, pos, count);
 560	if (ret)
 561		return ret;
 562
 563	file_start_write(file);
 564	ret =  __kernel_write(file, buf, count, pos);
 565	file_end_write(file);
 566	return ret;
 567}
 568EXPORT_SYMBOL(kernel_write);
 569
 570ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 571{
 572	ssize_t ret;
 573
 574	if (!(file->f_mode & FMODE_WRITE))
 575		return -EBADF;
 576	if (!(file->f_mode & FMODE_CAN_WRITE))
 577		return -EINVAL;
 578	if (unlikely(!access_ok(buf, count)))
 579		return -EFAULT;
 580
 581	ret = rw_verify_area(WRITE, file, pos, count);
 582	if (ret)
 583		return ret;
 584	if (count > MAX_RW_COUNT)
 585		count =  MAX_RW_COUNT;
 586	file_start_write(file);
 587	if (file->f_op->write)
 588		ret = file->f_op->write(file, buf, count, pos);
 589	else if (file->f_op->write_iter)
 590		ret = new_sync_write(file, buf, count, pos);
 591	else
 592		ret = -EINVAL;
 593	if (ret > 0) {
 594		fsnotify_modify(file);
 595		add_wchar(current, ret);
 596	}
 597	inc_syscw(current);
 598	file_end_write(file);
 599	return ret;
 600}
 601
 602/* file_ppos returns &file->f_pos or NULL if file is stream */
 603static inline loff_t *file_ppos(struct file *file)
 604{
 605	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 606}
 607
 608ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 609{
 610	struct fd f = fdget_pos(fd);
 611	ssize_t ret = -EBADF;
 612
 613	if (f.file) {
 614		loff_t pos, *ppos = file_ppos(f.file);
 615		if (ppos) {
 616			pos = *ppos;
 617			ppos = &pos;
 618		}
 619		ret = vfs_read(f.file, buf, count, ppos);
 620		if (ret >= 0 && ppos)
 621			f.file->f_pos = pos;
 622		fdput_pos(f);
 623	}
 624	return ret;
 625}
 626
 627SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 628{
 629	return ksys_read(fd, buf, count);
 630}
 631
 632ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 633{
 634	struct fd f = fdget_pos(fd);
 635	ssize_t ret = -EBADF;
 636
 637	if (f.file) {
 638		loff_t pos, *ppos = file_ppos(f.file);
 639		if (ppos) {
 640			pos = *ppos;
 641			ppos = &pos;
 642		}
 643		ret = vfs_write(f.file, buf, count, ppos);
 644		if (ret >= 0 && ppos)
 645			f.file->f_pos = pos;
 646		fdput_pos(f);
 647	}
 648
 649	return ret;
 650}
 651
 652SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 653		size_t, count)
 654{
 655	return ksys_write(fd, buf, count);
 656}
 657
 658ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 659		     loff_t pos)
 660{
 661	struct fd f;
 662	ssize_t ret = -EBADF;
 663
 664	if (pos < 0)
 665		return -EINVAL;
 666
 667	f = fdget(fd);
 668	if (f.file) {
 669		ret = -ESPIPE;
 670		if (f.file->f_mode & FMODE_PREAD)
 671			ret = vfs_read(f.file, buf, count, &pos);
 672		fdput(f);
 673	}
 674
 675	return ret;
 676}
 677
 678SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 679			size_t, count, loff_t, pos)
 680{
 681	return ksys_pread64(fd, buf, count, pos);
 682}
 683
 684#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
 685COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
 686		       size_t, count, compat_arg_u64_dual(pos))
 687{
 688	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
 689}
 690#endif
 691
 692ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 693		      size_t count, loff_t pos)
 694{
 695	struct fd f;
 696	ssize_t ret = -EBADF;
 697
 698	if (pos < 0)
 699		return -EINVAL;
 700
 701	f = fdget(fd);
 702	if (f.file) {
 703		ret = -ESPIPE;
 704		if (f.file->f_mode & FMODE_PWRITE)
 705			ret = vfs_write(f.file, buf, count, &pos);
 706		fdput(f);
 707	}
 708
 709	return ret;
 710}
 711
 712SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 713			 size_t, count, loff_t, pos)
 714{
 715	return ksys_pwrite64(fd, buf, count, pos);
 716}
 717
 718#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
 719COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
 720		       size_t, count, compat_arg_u64_dual(pos))
 721{
 722	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
 723}
 724#endif
 725
 726static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 727		loff_t *ppos, int type, rwf_t flags)
 728{
 729	struct kiocb kiocb;
 730	ssize_t ret;
 731
 732	init_sync_kiocb(&kiocb, filp);
 733	ret = kiocb_set_rw_flags(&kiocb, flags);
 734	if (ret)
 735		return ret;
 736	kiocb.ki_pos = (ppos ? *ppos : 0);
 737
 738	if (type == READ)
 739		ret = call_read_iter(filp, &kiocb, iter);
 740	else
 741		ret = call_write_iter(filp, &kiocb, iter);
 742	BUG_ON(ret == -EIOCBQUEUED);
 743	if (ppos)
 744		*ppos = kiocb.ki_pos;
 745	return ret;
 746}
 747
 748/* Do it by hand, with file-ops */
 749static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 750		loff_t *ppos, int type, rwf_t flags)
 751{
 752	ssize_t ret = 0;
 753
 754	if (flags & ~RWF_HIPRI)
 755		return -EOPNOTSUPP;
 756
 757	while (iov_iter_count(iter)) {
 758		ssize_t nr;
 759
 760		if (type == READ) {
 761			nr = filp->f_op->read(filp, iter_iov_addr(iter),
 762						iter_iov_len(iter), ppos);
 763		} else {
 764			nr = filp->f_op->write(filp, iter_iov_addr(iter),
 765						iter_iov_len(iter), ppos);
 766		}
 767
 768		if (nr < 0) {
 769			if (!ret)
 770				ret = nr;
 771			break;
 772		}
 773		ret += nr;
 774		if (nr != iter_iov_len(iter))
 775			break;
 776		iov_iter_advance(iter, nr);
 777	}
 778
 779	return ret;
 780}
 781
 782ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 783			   struct iov_iter *iter)
 784{
 785	size_t tot_len;
 786	ssize_t ret = 0;
 787
 788	if (!file->f_op->read_iter)
 789		return -EINVAL;
 790	if (!(file->f_mode & FMODE_READ))
 791		return -EBADF;
 792	if (!(file->f_mode & FMODE_CAN_READ))
 793		return -EINVAL;
 794
 795	tot_len = iov_iter_count(iter);
 796	if (!tot_len)
 797		goto out;
 798	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
 799	if (ret < 0)
 800		return ret;
 801
 802	ret = call_read_iter(file, iocb, iter);
 803out:
 804	if (ret >= 0)
 805		fsnotify_access(file);
 806	return ret;
 807}
 808EXPORT_SYMBOL(vfs_iocb_iter_read);
 809
 810ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 811		      rwf_t flags)
 812{
 813	size_t tot_len;
 814	ssize_t ret = 0;
 815
 816	if (!file->f_op->read_iter)
 817		return -EINVAL;
 818	if (!(file->f_mode & FMODE_READ))
 819		return -EBADF;
 820	if (!(file->f_mode & FMODE_CAN_READ))
 821		return -EINVAL;
 822
 823	tot_len = iov_iter_count(iter);
 824	if (!tot_len)
 825		goto out;
 826	ret = rw_verify_area(READ, file, ppos, tot_len);
 827	if (ret < 0)
 828		return ret;
 829
 830	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
 831out:
 832	if (ret >= 0)
 833		fsnotify_access(file);
 834	return ret;
 835}
 836EXPORT_SYMBOL(vfs_iter_read);
 837
 838/*
 839 * Caller is responsible for calling kiocb_end_write() on completion
 840 * if async iocb was queued.
 841 */
 842ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 843			    struct iov_iter *iter)
 844{
 845	size_t tot_len;
 846	ssize_t ret = 0;
 847
 848	if (!file->f_op->write_iter)
 849		return -EINVAL;
 850	if (!(file->f_mode & FMODE_WRITE))
 851		return -EBADF;
 852	if (!(file->f_mode & FMODE_CAN_WRITE))
 853		return -EINVAL;
 854
 855	tot_len = iov_iter_count(iter);
 856	if (!tot_len)
 857		return 0;
 858	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
 859	if (ret < 0)
 860		return ret;
 861
 862	kiocb_start_write(iocb);
 863	ret = call_write_iter(file, iocb, iter);
 864	if (ret != -EIOCBQUEUED)
 865		kiocb_end_write(iocb);
 866	if (ret > 0)
 867		fsnotify_modify(file);
 868
 869	return ret;
 870}
 871EXPORT_SYMBOL(vfs_iocb_iter_write);
 872
 873ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 874		       rwf_t flags)
 875{
 876	size_t tot_len;
 877	ssize_t ret;
 878
 879	if (!(file->f_mode & FMODE_WRITE))
 880		return -EBADF;
 881	if (!(file->f_mode & FMODE_CAN_WRITE))
 882		return -EINVAL;
 883	if (!file->f_op->write_iter)
 884		return -EINVAL;
 885
 886	tot_len = iov_iter_count(iter);
 887	if (!tot_len)
 888		return 0;
 889
 890	ret = rw_verify_area(WRITE, file, ppos, tot_len);
 891	if (ret < 0)
 892		return ret;
 893
 894	file_start_write(file);
 895	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
 896	if (ret > 0)
 897		fsnotify_modify(file);
 898	file_end_write(file);
 899
 900	return ret;
 901}
 902EXPORT_SYMBOL(vfs_iter_write);
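
/*
 * Illustrative sketch of a vfs_iter_write() caller: kernel users such as the
 * loop driver build an ITER_SOURCE iterator over kernel memory and pass it
 * in, instead of going through vfs_write() with a user pointer.
 * example_kvec_write is a hypothetical name.
 */
static ssize_t example_kvec_write(struct file *file, void *buf, size_t len,
				  loff_t *pos)
{
	struct kvec vec = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_SOURCE, &vec, 1, len);
	return vfs_iter_write(file, &iter, pos, 0);
}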
 903
 904static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 905			 unsigned long vlen, loff_t *pos, rwf_t flags)
 906{
 907	struct iovec iovstack[UIO_FASTIOV];
 908	struct iovec *iov = iovstack;
 909	struct iov_iter iter;
 910	size_t tot_len;
 911	ssize_t ret = 0;
 912
 913	if (!(file->f_mode & FMODE_READ))
 914		return -EBADF;
 915	if (!(file->f_mode & FMODE_CAN_READ))
 916		return -EINVAL;
 917
 918	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
 919			   &iter);
 920	if (ret < 0)
 921		return ret;
 922
 923	tot_len = iov_iter_count(&iter);
 924	if (!tot_len)
 925		goto out;
 926
 927	ret = rw_verify_area(READ, file, pos, tot_len);
 928	if (ret < 0)
 929		goto out;
 930
 931	if (file->f_op->read_iter)
 932		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
 933	else
 934		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
 935out:
 936	if (ret >= 0)
 937		fsnotify_access(file);
 938	kfree(iov);
 939	return ret;
 940}
 941
 942static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 943			  unsigned long vlen, loff_t *pos, rwf_t flags)
 944{
 945	struct iovec iovstack[UIO_FASTIOV];
 946	struct iovec *iov = iovstack;
 947	struct iov_iter iter;
 948	size_t tot_len;
 949	ssize_t ret = 0;
 950
 951	if (!(file->f_mode & FMODE_WRITE))
 952		return -EBADF;
 953	if (!(file->f_mode & FMODE_CAN_WRITE))
 954		return -EINVAL;
 955
 956	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
 957			   &iter);
 958	if (ret < 0)
 959		return ret;
 960
 961	tot_len = iov_iter_count(&iter);
 962	if (!tot_len)
 963		goto out;
 964
 965	ret = rw_verify_area(WRITE, file, pos, tot_len);
 966	if (ret < 0)
 967		goto out;
 968
 969	file_start_write(file);
 970	if (file->f_op->write_iter)
 971		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
 972	else
 973		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
 974	if (ret > 0)
 975		fsnotify_modify(file);
 976	file_end_write(file);
 977out:
 978	kfree(iov);
 979	return ret;
 980}
 981
 982static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 983			unsigned long vlen, rwf_t flags)
 984{
 985	struct fd f = fdget_pos(fd);
 986	ssize_t ret = -EBADF;
 987
 988	if (f.file) {
 989		loff_t pos, *ppos = file_ppos(f.file);
 990		if (ppos) {
 991			pos = *ppos;
 992			ppos = &pos;
 993		}
 994		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
 995		if (ret >= 0 && ppos)
 996			f.file->f_pos = pos;
 997		fdput_pos(f);
 998	}
 999
1000	if (ret > 0)
1001		add_rchar(current, ret);
1002	inc_syscr(current);
1003	return ret;
1004}
1005
1006static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1007			 unsigned long vlen, rwf_t flags)
1008{
1009	struct fd f = fdget_pos(fd);
1010	ssize_t ret = -EBADF;
1011
1012	if (f.file) {
1013		loff_t pos, *ppos = file_ppos(f.file);
1014		if (ppos) {
1015			pos = *ppos;
1016			ppos = &pos;
1017		}
1018		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1019		if (ret >= 0 && ppos)
1020			f.file->f_pos = pos;
1021		fdput_pos(f);
1022	}
1023
1024	if (ret > 0)
1025		add_wchar(current, ret);
1026	inc_syscw(current);
1027	return ret;
1028}
1029
1030static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1031{
1032#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1033	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1034}
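
/*
 * Worked example for pos_from_hilo(): on a 32-bit kernel (BITS_PER_LONG ==
 * 32, HALF_LONG_BITS == 16) a file position of 0x123456789a is passed to
 * preadv/pwritev as pos_h == 0x12 and pos_l == 0x3456789a, and reassembled
 * as (((loff_t)0x12 << 16) << 16) | 0x3456789a == 0x123456789a.  On 64-bit
 * kernels the two shifts move pos_h out of the 64-bit value entirely, so
 * only pos_l contributes.
 */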
1035
1036static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1037			 unsigned long vlen, loff_t pos, rwf_t flags)
1038{
1039	struct fd f;
1040	ssize_t ret = -EBADF;
1041
1042	if (pos < 0)
1043		return -EINVAL;
1044
1045	f = fdget(fd);
1046	if (f.file) {
1047		ret = -ESPIPE;
1048		if (f.file->f_mode & FMODE_PREAD)
1049			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1050		fdput(f);
1051	}
1052
1053	if (ret > 0)
1054		add_rchar(current, ret);
1055	inc_syscr(current);
1056	return ret;
1057}
1058
1059static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1060			  unsigned long vlen, loff_t pos, rwf_t flags)
1061{
1062	struct fd f;
1063	ssize_t ret = -EBADF;
1064
1065	if (pos < 0)
1066		return -EINVAL;
1067
1068	f = fdget(fd);
1069	if (f.file) {
1070		ret = -ESPIPE;
1071		if (f.file->f_mode & FMODE_PWRITE)
1072			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1073		fdput(f);
1074	}
1075
1076	if (ret > 0)
1077		add_wchar(current, ret);
1078	inc_syscw(current);
1079	return ret;
1080}
1081
1082SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1083		unsigned long, vlen)
1084{
1085	return do_readv(fd, vec, vlen, 0);
1086}
1087
1088SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1089		unsigned long, vlen)
1090{
1091	return do_writev(fd, vec, vlen, 0);
1092}
1093
1094SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1095		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1096{
1097	loff_t pos = pos_from_hilo(pos_h, pos_l);
1098
1099	return do_preadv(fd, vec, vlen, pos, 0);
1100}
1101
1102SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1103		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1104		rwf_t, flags)
1105{
1106	loff_t pos = pos_from_hilo(pos_h, pos_l);
1107
1108	if (pos == -1)
1109		return do_readv(fd, vec, vlen, flags);
1110
1111	return do_preadv(fd, vec, vlen, pos, flags);
1112}
1113
1114SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1115		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1116{
1117	loff_t pos = pos_from_hilo(pos_h, pos_l);
1118
1119	return do_pwritev(fd, vec, vlen, pos, 0);
1120}
1121
1122SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1123		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1124		rwf_t, flags)
1125{
1126	loff_t pos = pos_from_hilo(pos_h, pos_l);
1127
1128	if (pos == -1)
1129		return do_writev(fd, vec, vlen, flags);
1130
1131	return do_pwritev(fd, vec, vlen, pos, flags);
1132}
1133
1134/*
1135 * Various compat syscalls.  Note that they all pretend to take a native
1136 * iovec - import_iovec will properly treat those as compat_iovecs based on
1137 * in_compat_syscall().
1138 */
1139#ifdef CONFIG_COMPAT
1140#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1141COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1142		const struct iovec __user *, vec,
1143		unsigned long, vlen, loff_t, pos)
1144{
1145	return do_preadv(fd, vec, vlen, pos, 0);
1146}
1147#endif
1148
1149COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1150		const struct iovec __user *, vec,
1151		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1152{
1153	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1154
1155	return do_preadv(fd, vec, vlen, pos, 0);
1156}
1157
1158#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1159COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1160		const struct iovec __user *, vec,
1161		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1162{
1163	if (pos == -1)
1164		return do_readv(fd, vec, vlen, flags);
1165	return do_preadv(fd, vec, vlen, pos, flags);
1166}
1167#endif
1168
1169COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1170		const struct iovec __user *, vec,
1171		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1172		rwf_t, flags)
1173{
1174	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1175
1176	if (pos == -1)
1177		return do_readv(fd, vec, vlen, flags);
1178	return do_preadv(fd, vec, vlen, pos, flags);
1179}
1180
1181#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1182COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1183		const struct iovec __user *, vec,
1184		unsigned long, vlen, loff_t, pos)
1185{
1186	return do_pwritev(fd, vec, vlen, pos, 0);
1187}
1188#endif
1189
1190COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1191		const struct iovec __user *,vec,
1192		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1193{
1194	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1195
1196	return do_pwritev(fd, vec, vlen, pos, 0);
1197}
1198
1199#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1200COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1201		const struct iovec __user *, vec,
1202		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1203{
1204	if (pos == -1)
1205		return do_writev(fd, vec, vlen, flags);
1206	return do_pwritev(fd, vec, vlen, pos, flags);
1207}
1208#endif
1209
1210COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1211		const struct iovec __user *,vec,
1212		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1213{
1214	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1215
1216	if (pos == -1)
1217		return do_writev(fd, vec, vlen, flags);
1218	return do_pwritev(fd, vec, vlen, pos, flags);
1219}
1220#endif /* CONFIG_COMPAT */
1221
1222static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1223			   size_t count, loff_t max)
1224{
1225	struct fd in, out;
1226	struct inode *in_inode, *out_inode;
1227	struct pipe_inode_info *opipe;
1228	loff_t pos;
1229	loff_t out_pos;
1230	ssize_t retval;
1231	int fl;
1232
1233	/*
1234	 * Get input file, and verify that it is ok..
1235	 */
1236	retval = -EBADF;
1237	in = fdget(in_fd);
1238	if (!in.file)
1239		goto out;
1240	if (!(in.file->f_mode & FMODE_READ))
1241		goto fput_in;
1242	retval = -ESPIPE;
1243	if (!ppos) {
1244		pos = in.file->f_pos;
1245	} else {
1246		pos = *ppos;
1247		if (!(in.file->f_mode & FMODE_PREAD))
1248			goto fput_in;
1249	}
1250	retval = rw_verify_area(READ, in.file, &pos, count);
1251	if (retval < 0)
1252		goto fput_in;
1253	if (count > MAX_RW_COUNT)
1254		count =  MAX_RW_COUNT;
1255
1256	/*
1257	 * Get output file, and verify that it is ok..
1258	 */
1259	retval = -EBADF;
1260	out = fdget(out_fd);
1261	if (!out.file)
1262		goto fput_in;
1263	if (!(out.file->f_mode & FMODE_WRITE))
1264		goto fput_out;
1265	in_inode = file_inode(in.file);
1266	out_inode = file_inode(out.file);
1267	out_pos = out.file->f_pos;
1268
1269	if (!max)
1270		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1271
1272	if (unlikely(pos + count > max)) {
1273		retval = -EOVERFLOW;
1274		if (pos >= max)
1275			goto fput_out;
1276		count = max - pos;
1277	}
1278
1279	fl = 0;
1280#if 0
1281	/*
1282	 * We need to debate whether we can enable this or not. The
1283	 * man page documents EAGAIN return for the output at least,
1284	 * and the application is arguably buggy if it doesn't expect
1285	 * EAGAIN on a non-blocking file descriptor.
1286	 */
1287	if (in.file->f_flags & O_NONBLOCK)
1288		fl = SPLICE_F_NONBLOCK;
1289#endif
1290	opipe = get_pipe_info(out.file, true);
1291	if (!opipe) {
1292		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1293		if (retval < 0)
1294			goto fput_out;
1295		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
1296					  count, fl);
1297	} else {
1298		if (out.file->f_flags & O_NONBLOCK)
1299			fl |= SPLICE_F_NONBLOCK;
1300
1301		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
1302	}
1303
1304	if (retval > 0) {
1305		add_rchar(current, retval);
1306		add_wchar(current, retval);
1307		fsnotify_access(in.file);
1308		fsnotify_modify(out.file);
1309		out.file->f_pos = out_pos;
1310		if (ppos)
1311			*ppos = pos;
1312		else
1313			in.file->f_pos = pos;
1314	}
1315
1316	inc_syscr(current);
1317	inc_syscw(current);
1318	if (pos > max)
1319		retval = -EOVERFLOW;
1320
1321fput_out:
1322	fdput(out);
1323fput_in:
1324	fdput(in);
1325out:
1326	return retval;
1327}
1328
1329SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1330{
1331	loff_t pos;
1332	off_t off;
1333	ssize_t ret;
1334
1335	if (offset) {
1336		if (unlikely(get_user(off, offset)))
1337			return -EFAULT;
1338		pos = off;
1339		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1340		if (unlikely(put_user(pos, offset)))
1341			return -EFAULT;
1342		return ret;
1343	}
1344
1345	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1346}
1347
1348SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1349{
1350	loff_t pos;
1351	ssize_t ret;
1352
1353	if (offset) {
1354		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1355			return -EFAULT;
1356		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1357		if (unlikely(put_user(pos, offset)))
1358			return -EFAULT;
1359		return ret;
1360	}
1361
1362	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1363}
1364
1365#ifdef CONFIG_COMPAT
1366COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1367		compat_off_t __user *, offset, compat_size_t, count)
1368{
1369	loff_t pos;
1370	off_t off;
1371	ssize_t ret;
1372
1373	if (offset) {
1374		if (unlikely(get_user(off, offset)))
1375			return -EFAULT;
1376		pos = off;
1377		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1378		if (unlikely(put_user(pos, offset)))
1379			return -EFAULT;
1380		return ret;
1381	}
1382
1383	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1384}
1385
1386COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1387		compat_loff_t __user *, offset, compat_size_t, count)
1388{
1389	loff_t pos;
1390	ssize_t ret;
1391
1392	if (offset) {
1393		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1394			return -EFAULT;
1395		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1396		if (unlikely(put_user(pos, offset)))
1397			return -EFAULT;
1398		return ret;
1399	}
1400
1401	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1402}
1403#endif
1404
1405/*
1406 * Performs necessary checks before doing a file copy
1407 *
1408 * Can adjust the number of bytes to copy via the @req_count argument.
1409 * Returns appropriate error code that caller should return or
1410 * zero in case the copy should be allowed.
1411 */
1412static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1413				    struct file *file_out, loff_t pos_out,
1414				    size_t *req_count, unsigned int flags)
1415{
1416	struct inode *inode_in = file_inode(file_in);
1417	struct inode *inode_out = file_inode(file_out);
1418	uint64_t count = *req_count;
1419	loff_t size_in;
1420	int ret;
1421
1422	ret = generic_file_rw_checks(file_in, file_out);
1423	if (ret)
1424		return ret;
1425
1426	/*
1427	 * We allow some filesystems to handle cross sb copy, but passing
1428	 * a file of the wrong filesystem type to filesystem driver can result
1429	 * in an attempt to dereference the wrong type of ->private_data, so
1430	 * avoid doing that until we really have a good reason.
1431	 *
1432	 * nfs and cifs define several different file_system_type structures
1433	 * and several different sets of file_operations, but they all end up
1434	 * using the same ->copy_file_range() function pointer.
1435	 */
1436	if (flags & COPY_FILE_SPLICE) {
1437		/* cross sb splice is allowed */
1438	} else if (file_out->f_op->copy_file_range) {
1439		if (file_in->f_op->copy_file_range !=
1440		    file_out->f_op->copy_file_range)
1441			return -EXDEV;
1442	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1443		return -EXDEV;
1444	}
1445
1446	/* Don't touch certain kinds of inodes */
1447	if (IS_IMMUTABLE(inode_out))
1448		return -EPERM;
1449
1450	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1451		return -ETXTBSY;
1452
1453	/* Ensure offsets don't wrap. */
1454	if (pos_in + count < pos_in || pos_out + count < pos_out)
1455		return -EOVERFLOW;
1456
1457	/* Shorten the copy to EOF */
1458	size_in = i_size_read(inode_in);
1459	if (pos_in >= size_in)
1460		count = 0;
1461	else
1462		count = min(count, size_in - (uint64_t)pos_in);
1463
1464	ret = generic_write_check_limits(file_out, pos_out, &count);
1465	if (ret)
1466		return ret;
1467
1468	/* Don't allow overlapped copying within the same file. */
1469	if (inode_in == inode_out &&
1470	    pos_out + count > pos_in &&
1471	    pos_out < pos_in + count)
1472		return -EINVAL;
1473
1474	*req_count = count;
1475	return 0;
1476}
1477
1478/*
1479 * copy_file_range() differs from regular file read and write in that it
1480 * specifically allows returning partial success.  When it does so is up to
1481 * the copy_file_range method.
1482 */
1483ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1484			    struct file *file_out, loff_t pos_out,
1485			    size_t len, unsigned int flags)
1486{
1487	ssize_t ret;
1488	bool splice = flags & COPY_FILE_SPLICE;
1489	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
1490
1491	if (flags & ~COPY_FILE_SPLICE)
1492		return -EINVAL;
1493
1494	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1495				       flags);
1496	if (unlikely(ret))
1497		return ret;
1498
1499	ret = rw_verify_area(READ, file_in, &pos_in, len);
1500	if (unlikely(ret))
1501		return ret;
1502
1503	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1504	if (unlikely(ret))
1505		return ret;
1506
1507	if (len == 0)
1508		return 0;
1509
1510	file_start_write(file_out);
1511
1512	/*
1513	 * Cloning is supported by more file systems, so we implement copy on
1514	 * same sb using clone, but for filesystems where both clone and copy
1515	 * are supported (e.g. nfs,cifs), we only call the copy method.
1516	 */
1517	if (!splice && file_out->f_op->copy_file_range) {
1518		ret = file_out->f_op->copy_file_range(file_in, pos_in,
1519						      file_out, pos_out,
1520						      len, flags);
1521	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
1522		ret = file_in->f_op->remap_file_range(file_in, pos_in,
1523				file_out, pos_out,
1524				min_t(loff_t, MAX_RW_COUNT, len),
1525				REMAP_FILE_CAN_SHORTEN);
1526		/* fallback to splice */
1527		if (ret <= 0)
1528			splice = true;
1529	} else if (samesb) {
1530		/* Fallback to splice for same sb copy for backward compat */
1531		splice = true;
1532	}
1533
1534	file_end_write(file_out);
1535
1536	if (!splice)
1537		goto done;
1538
1539	/*
1540	 * We can get here for same sb copy of filesystems that do not implement
1541	 * ->copy_file_range() in case filesystem does not support clone or in
1542	 * case filesystem supports clone but rejected the clone request (e.g.
1543	 * because it was not block aligned).
1544	 *
1545	 * In both cases, fall back to kernel copy so we are able to maintain a
1546	 * consistent story about which filesystems support copy_file_range()
1547	 * and which filesystems do not, that will allow userspace tools to
1548 * make consistent decisions w.r.t. using copy_file_range().
1549	 *
1550	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
1551	 * for server-side-copy between any two sb.
1552	 *
1553	 * In any case, we call do_splice_direct() and not splice_file_range(),
1554	 * without file_start_write() held, to avoid possible deadlocks related
1555	 * to splicing from input file, while file_start_write() is held on
1556	 * the output file on a different sb.
1557	 */
1558	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1559			       min_t(size_t, len, MAX_RW_COUNT), 0);
1560done:
1561	if (ret > 0) {
1562		fsnotify_access(file_in);
1563		add_rchar(current, ret);
1564		fsnotify_modify(file_out);
1565		add_wchar(current, ret);
1566	}
1567
1568	inc_syscr(current);
1569	inc_syscw(current);
1570
1571	return ret;
1572}
1573EXPORT_SYMBOL(vfs_copy_file_range);
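
/*
 * Illustrative sketch of an in-kernel caller: a network file server doing a
 * server-side copy (the nfsd case mentioned in the comment above) can pass
 * COPY_FILE_SPLICE to allow the cross-sb splice fallback.  The example_*
 * name and the zero offsets are hypothetical.
 */
static ssize_t example_server_side_copy(struct file *src, struct file *dst,
					size_t count)
{
	loff_t pos_in = 0, pos_out = 0;

	return vfs_copy_file_range(src, pos_in, dst, pos_out, count,
				   COPY_FILE_SPLICE);
}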
1574
1575SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1576		int, fd_out, loff_t __user *, off_out,
1577		size_t, len, unsigned int, flags)
1578{
1579	loff_t pos_in;
1580	loff_t pos_out;
1581	struct fd f_in;
1582	struct fd f_out;
1583	ssize_t ret = -EBADF;
1584
1585	f_in = fdget(fd_in);
1586	if (!f_in.file)
1587		goto out2;
1588
1589	f_out = fdget(fd_out);
1590	if (!f_out.file)
1591		goto out1;
1592
1593	ret = -EFAULT;
1594	if (off_in) {
1595		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1596			goto out;
1597	} else {
1598		pos_in = f_in.file->f_pos;
1599	}
1600
1601	if (off_out) {
1602		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1603			goto out;
1604	} else {
1605		pos_out = f_out.file->f_pos;
1606	}
1607
1608	ret = -EINVAL;
1609	if (flags != 0)
1610		goto out;
1611
1612	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1613				  flags);
1614	if (ret > 0) {
1615		pos_in += ret;
1616		pos_out += ret;
1617
1618		if (off_in) {
1619			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1620				ret = -EFAULT;
1621		} else {
1622			f_in.file->f_pos = pos_in;
1623		}
1624
1625		if (off_out) {
1626			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1627				ret = -EFAULT;
1628		} else {
1629			f_out.file->f_pos = pos_out;
1630		}
1631	}
1632
1633out:
1634	fdput(f_out);
1635out1:
1636	fdput(f_in);
1637out2:
1638	return ret;
1639}
1640
1641/*
1642 * Don't operate on ranges the page cache doesn't support, and don't exceed the
1643 * LFS limits.  If pos is under the limit it becomes a short access.  If it
1644 * exceeds the limit we return -EFBIG.
1645 */
1646int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1647{
1648	struct inode *inode = file->f_mapping->host;
1649	loff_t max_size = inode->i_sb->s_maxbytes;
1650	loff_t limit = rlimit(RLIMIT_FSIZE);
1651
1652	if (limit != RLIM_INFINITY) {
1653		if (pos >= limit) {
1654			send_sig(SIGXFSZ, current, 0);
1655			return -EFBIG;
1656		}
1657		*count = min(*count, limit - pos);
1658	}
1659
1660	if (!(file->f_flags & O_LARGEFILE))
1661		max_size = MAX_NON_LFS;
1662
1663	if (unlikely(pos >= max_size))
1664		return -EFBIG;
1665
1666	*count = min(*count, max_size - pos);
1667
1668	return 0;
1669}
1670
1671/* Like generic_write_checks(), but takes size of write instead of iter. */
1672int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
1673{
1674	struct file *file = iocb->ki_filp;
1675	struct inode *inode = file->f_mapping->host;
1676
1677	if (IS_SWAPFILE(inode))
1678		return -ETXTBSY;
1679
1680	if (!*count)
1681		return 0;
1682
1683	if (iocb->ki_flags & IOCB_APPEND)
1684		iocb->ki_pos = i_size_read(inode);
1685
1686	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1687	    !((iocb->ki_flags & IOCB_DIRECT) ||
1688	      (file->f_mode & FMODE_BUF_WASYNC)))
1689		return -EINVAL;
1690
1691	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
1692}
1693EXPORT_SYMBOL(generic_write_checks_count);
1694
1695/*
1696 * Performs necessary checks before doing a write
1697 *
1698 * Can adjust the writing position or the number of bytes to write.
1699 * Returns appropriate error code that caller should return or
1700 * zero in case that write should be allowed.
1701 */
1702ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1703{
1704	loff_t count = iov_iter_count(from);
1705	int ret;
1706
1707	ret = generic_write_checks_count(iocb, &count);
1708	if (ret)
1709		return ret;
1710
1711	iov_iter_truncate(from, count);
1712	return iov_iter_count(from);
1713}
1714EXPORT_SYMBOL(generic_write_checks);
1715
1716/*
1717 * Performs common checks before doing a file copy/clone
1718 * from @file_in to @file_out.
1719 */
1720int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1721{
1722	struct inode *inode_in = file_inode(file_in);
1723	struct inode *inode_out = file_inode(file_out);
1724
1725	/* Don't copy dirs, pipes, sockets... */
1726	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1727		return -EISDIR;
1728	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1729		return -EINVAL;
1730
1731	if (!(file_in->f_mode & FMODE_READ) ||
1732	    !(file_out->f_mode & FMODE_WRITE) ||
1733	    (file_out->f_flags & O_APPEND))
1734		return -EBADF;
1735
1736	return 0;
1737}
v4.17
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29	.llseek		= generic_file_llseek,
  30	.read_iter	= generic_file_read_iter,
  31	.mmap		= generic_file_readonly_mmap,
  32	.splice_read	= generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39	return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:	file structure in question
  45 * @offset:	file offset to seek to
  46 * @maxsize:	maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56	if (offset < 0 && !unsigned_offsets(file))
  57		return -EINVAL;
  58	if (offset > maxsize)
  59		return -EINVAL;
  60
  61	if (offset != file->f_pos) {
  62		file->f_pos = offset;
  63		file->f_version = 0;
  64	}
  65	return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:	file structure to seek on
  72 * @offset:	file offset to seek to
  73 * @whence:	type of seek
  74 * @size:	max size of this file in file system
  75 * @eof:	offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87		loff_t maxsize, loff_t eof)
  88{
  89	switch (whence) {
  90	case SEEK_END:
  91		offset += eof;
  92		break;
  93	case SEEK_CUR:
  94		/*
  95		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96		 * position-querying operation.  Avoid rewriting the "same"
  97		 * f_pos value back to the file because a concurrent read(),
  98		 * write() or lseek() might have altered it
  99		 */
 100		if (offset == 0)
 101			return file->f_pos;
 102		/*
 103		 * f_lock protects against read/modify/write race with other
 104		 * SEEK_CURs. Note that parallel writes and reads behave
 105		 * like SEEK_SET.
 106		 */
 107		spin_lock(&file->f_lock);
 108		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109		spin_unlock(&file->f_lock);
 110		return offset;
 111	case SEEK_DATA:
 112		/*
 113		 * In the generic case the entire file is data, so as long as
 114		 * offset isn't at the end of the file then the offset is data.
 115		 */
 116		if ((unsigned long long)offset >= eof)
 117			return -ENXIO;
 118		break;
 119	case SEEK_HOLE:
 120		/*
 121		 * There is a virtual hole at the end of the file, so as long as
 122		 * offset isn't i_size or larger, return i_size.
 123		 */
 124		if ((unsigned long long)offset >= eof)
 125			return -ENXIO;
 126		offset = eof;
 127		break;
 128	}
 129
 130	return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:	file structure to seek on
 137 * @offset:	file offset to seek to
 138 * @whence:	type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146	struct inode *inode = file->f_mapping->host;
 147
 148	return generic_file_llseek_size(file, offset, whence,
 149					inode->i_sb->s_maxbytes,
 150					i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:	file structure to seek on
 157 * @offset:	file offset to seek to
 158 * @whence:	type of seek
 159 * @size:	size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164	switch (whence) {
 165	case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166		return generic_file_llseek_size(file, offset, whence,
 167						size, size);
 168	default:
 169		return -EINVAL;
 170	}
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:	file structure to seek on
 177 * @offset:	file offset to seek to
 178 * @whence:	type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183	switch (whence) {
 184	case SEEK_SET: case SEEK_CUR:
 185		return generic_file_llseek_size(file, offset, whence,
 186						OFFSET_MAX, 0);
 187	default:
 188		return -EINVAL;
 189	}
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:	file structure to seek on
 196 * @offset:	file offset to seek to
 197 * @whence:	type of seek
 198 * @size:	maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203	switch (whence) {
 204	case SEEK_SET: case SEEK_CUR:
 205		return generic_file_llseek_size(file, offset, whence,
 206						size, 0);
 207	default:
 208		return -EINVAL;
 209	}
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:	file structure to seek on
 216 * @offset:	file offset to seek to
 217 * @whence:	type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226	return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232	return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
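
/*
 * Usage sketch (illustrative, not part of the original file): the two
 * stubs above differ only in what userspace sees.  A driver whose
 * device cannot seek but whose callers still issue lseek() might pick
 * noop_llseek(); one that wants lseek() to fail outright would pick
 * no_llseek().  example_noseek_fops is a hypothetical example.
 */
static const struct file_operations example_noseek_fops = {
	.llseek		= noop_llseek,	/* lseek() "succeeds", f_pos never moves */
};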
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238	struct inode *inode = file_inode(file);
 239	loff_t retval;
 240
 241	inode_lock(inode);
 242	switch (whence) {
 243		case SEEK_END:
 244			offset += i_size_read(inode);
 245			break;
 246		case SEEK_CUR:
 247			if (offset == 0) {
 248				retval = file->f_pos;
 249				goto out;
 250			}
 251			offset += file->f_pos;
 252			break;
 253		case SEEK_DATA:
 254			/*
 255			 * In the generic case the entire file is data, so as
 256			 * long as offset isn't at the end of the file then the
 257			 * offset is data.
 258			 */
 259			if (offset >= inode->i_size) {
 260				retval = -ENXIO;
 261				goto out;
 262			}
 263			break;
 264		case SEEK_HOLE:
 265			/*
 266			 * There is a virtual hole at the end of the file, so
 267			 * as long as offset isn't i_size or larger, return
 268			 * i_size.
 269			 */
 270			if (offset >= inode->i_size) {
 271				retval = -ENXIO;
 272				goto out;
 273			}
 274			offset = inode->i_size;
 275			break;
 276	}
 277	retval = -EINVAL;
 278	if (offset >= 0 || unsigned_offsets(file)) {
 279		if (offset != file->f_pos) {
 280			file->f_pos = offset;
 281			file->f_version = 0;
 282		}
 283		retval = offset;
 284	}
 285out:
 286	inode_unlock(inode);
 287	return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293	loff_t (*fn)(struct file *, loff_t, int);
 294
 295	fn = no_llseek;
 296	if (file->f_mode & FMODE_LSEEK) {
 297		if (file->f_op->llseek)
 298			fn = file->f_op->llseek;
 299	}
 300	return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
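
/*
 * Caller sketch (illustrative, not part of the original file): kernel
 * code holding a struct file should reposition it through vfs_llseek()
 * rather than poking f_pos directly, so that FMODE_LSEEK and the
 * file's own ->llseek method are honoured.  example_rewind is a
 * hypothetical helper.
 */
static loff_t example_rewind(struct file *filp)
{
	return vfs_llseek(filp, 0, SEEK_SET);	/* new offset, or e.g. -ESPIPE */
}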
 303
 304off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306	off_t retval;
 307	struct fd f = fdget_pos(fd);
 308	if (!f.file)
 309		return -EBADF;
 310
 311	retval = -EINVAL;
 312	if (whence <= SEEK_MAX) {
 313		loff_t res = vfs_llseek(f.file, offset, whence);
 314		retval = res;
 315		if (res != (loff_t)retval)
 316			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 317	}
 318	fdput_pos(f);
 319	return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324	return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330	return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#ifdef __ARCH_WANT_SYS_LLSEEK
 335SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 336		unsigned long, offset_low, loff_t __user *, result,
 337		unsigned int, whence)
 338{
 339	int retval;
 340	struct fd f = fdget_pos(fd);
 341	loff_t offset;
 342
 343	if (!f.file)
 344		return -EBADF;
 345
 346	retval = -EINVAL;
 347	if (whence > SEEK_MAX)
 348		goto out_putf;
 349
 350	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 351			whence);
 352
 353	retval = (int)offset;
 354	if (offset >= 0) {
 355		retval = -EFAULT;
 356		if (!copy_to_user(result, &offset, sizeof(offset)))
 357			retval = 0;
 358	}
 359out_putf:
 360	fdput_pos(f);
 361	return retval;
 362}
 363#endif
 364
 365int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 366{
 367	struct inode *inode;
 368	loff_t pos;
 369	int retval = -EINVAL;
 370
 371	inode = file_inode(file);
 372	if (unlikely((ssize_t) count < 0))
 373		return retval;
 374	pos = *ppos;
 375	if (unlikely(pos < 0)) {
 376		if (!unsigned_offsets(file))
 377			return retval;
 378		if (count >= -pos) /* both values are in 0..LLONG_MAX */
 379			return -EOVERFLOW;
 380	} else if (unlikely((loff_t) (pos + count) < 0)) {
 381		if (!unsigned_offsets(file))
 382			return retval;
 383	}
 384
 385	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 386		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 387				read_write == READ ? F_RDLCK : F_WRLCK);
 388		if (retval < 0)
 389			return retval;
 390	}
 391	return security_file_permission(file,
 392				read_write == READ ? MAY_READ : MAY_WRITE);
 393}
 394
 395static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 396{
 397	struct iovec iov = { .iov_base = buf, .iov_len = len };
 398	struct kiocb kiocb;
 399	struct iov_iter iter;
 400	ssize_t ret;
 401
 402	init_sync_kiocb(&kiocb, filp);
 403	kiocb.ki_pos = *ppos;
 404	iov_iter_init(&iter, READ, &iov, 1, len);
 405
 406	ret = call_read_iter(filp, &kiocb, &iter);
 407	BUG_ON(ret == -EIOCBQUEUED);
 408	*ppos = kiocb.ki_pos;
 409	return ret;
 410}
 411
 412ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 413		   loff_t *pos)
 414{
 415	if (file->f_op->read)
 416		return file->f_op->read(file, buf, count, pos);
 417	else if (file->f_op->read_iter)
 418		return new_sync_read(file, buf, count, pos);
 419	else
 420		return -EINVAL;
 421}
 422
 423ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 424{
 425	mm_segment_t old_fs;
 426	ssize_t result;
 427
 428	old_fs = get_fs();
 429	set_fs(get_ds());
 430	/* The cast to a user pointer is valid due to the set_fs() */
 431	result = vfs_read(file, (void __user *)buf, count, pos);
 432	set_fs(old_fs);
 433	return result;
 434}
 435EXPORT_SYMBOL(kernel_read);
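
/*
 * Usage sketch (illustrative, not part of the original file): reading
 * the first bytes of a file from kernel context.  The path, buffer and
 * helper name are assumptions made for the example.
 */
static ssize_t example_read_head(const char *path, void *buf, size_t len)
{
	struct file *filp;
	loff_t pos = 0;
	ssize_t ret;

	filp = filp_open(path, O_RDONLY, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	ret = kernel_read(filp, buf, len, &pos);	/* pos is advanced on success */
	filp_close(filp, NULL);
	return ret;
}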
 436
 437ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 438{
 439	ssize_t ret;
 440
 441	if (!(file->f_mode & FMODE_READ))
 442		return -EBADF;
 443	if (!(file->f_mode & FMODE_CAN_READ))
 444		return -EINVAL;
 445	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 446		return -EFAULT;
 447
 448	ret = rw_verify_area(READ, file, pos, count);
 449	if (!ret) {
 450		if (count > MAX_RW_COUNT)
 451			count =  MAX_RW_COUNT;
 452		ret = __vfs_read(file, buf, count, pos);
 453		if (ret > 0) {
 454			fsnotify_access(file);
 455			add_rchar(current, ret);
 456		}
 457		inc_syscr(current);
 458	}
 459
 460	return ret;
 461}
 462
 463static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 464{
 465	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 466	struct kiocb kiocb;
 467	struct iov_iter iter;
 468	ssize_t ret;
 469
 470	init_sync_kiocb(&kiocb, filp);
 471	kiocb.ki_pos = *ppos;
 472	iov_iter_init(&iter, WRITE, &iov, 1, len);
 473
 474	ret = call_write_iter(filp, &kiocb, &iter);
 475	BUG_ON(ret == -EIOCBQUEUED);
 476	if (ret > 0)
 477		*ppos = kiocb.ki_pos;
 478	return ret;
 479}
 480
 481ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 482		    loff_t *pos)
 483{
 484	if (file->f_op->write)
 485		return file->f_op->write(file, p, count, pos);
 486	else if (file->f_op->write_iter)
 487		return new_sync_write(file, p, count, pos);
 488	else
 489		return -EINVAL;
 490}
 491
 492ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 493{
 494	mm_segment_t old_fs;
 495	const char __user *p;
 496	ssize_t ret;
 497
 498	if (!(file->f_mode & FMODE_CAN_WRITE))
 499		return -EINVAL;
 500
 501	old_fs = get_fs();
 502	set_fs(get_ds());
 503	p = (__force const char __user *)buf;
 504	if (count > MAX_RW_COUNT)
 505		count =  MAX_RW_COUNT;
 506	ret = __vfs_write(file, p, count, pos);
 507	set_fs(old_fs);
 508	if (ret > 0) {
 509		fsnotify_modify(file);
 510		add_wchar(current, ret);
 511	}
 512	inc_syscw(current);
 513	return ret;
 514}
 515EXPORT_SYMBOL(__kernel_write);
 516
 517ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 518			    loff_t *pos)
 519{
 520	mm_segment_t old_fs;
 521	ssize_t res;
 522
 523	old_fs = get_fs();
 524	set_fs(get_ds());
 525	/* The cast to a user pointer is valid due to the set_fs() */
 526	res = vfs_write(file, (__force const char __user *)buf, count, pos);
 527	set_fs(old_fs);
 528
 529	return res;
 530}
 531EXPORT_SYMBOL(kernel_write);
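
/*
 * Usage sketch (illustrative, not part of the original file):
 * kernel_write() may complete only part of the request, so a caller
 * that needs the whole buffer written typically loops until done.
 * example_write_all is a hypothetical helper.
 */
static int example_write_all(struct file *filp, const void *buf, size_t len,
			     loff_t *pos)
{
	const char *p = buf;

	while (len) {
		ssize_t n = kernel_write(filp, p, len, pos);	/* pos is advanced */

		if (n < 0)
			return n;
		if (n == 0)
			return -EIO;
		p += n;
		len -= n;
	}
	return 0;
}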
 532
 533ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 534{
 535	ssize_t ret;
 536
 537	if (!(file->f_mode & FMODE_WRITE))
 538		return -EBADF;
 539	if (!(file->f_mode & FMODE_CAN_WRITE))
 540		return -EINVAL;
 541	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 542		return -EFAULT;
 543
 544	ret = rw_verify_area(WRITE, file, pos, count);
 545	if (!ret) {
 546		if (count > MAX_RW_COUNT)
 547			count =  MAX_RW_COUNT;
 548		file_start_write(file);
 549		ret = __vfs_write(file, buf, count, pos);
 550		if (ret > 0) {
 551			fsnotify_modify(file);
 552			add_wchar(current, ret);
 553		}
 554		inc_syscw(current);
 555		file_end_write(file);
 556	}
 557
 558	return ret;
 559}
 560
 561static inline loff_t file_pos_read(struct file *file)
 562{
 563	return file->f_pos;
 564}
 565
 566static inline void file_pos_write(struct file *file, loff_t pos)
 567{
 568	file->f_pos = pos;
 569}
 570
 571ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 572{
 573	struct fd f = fdget_pos(fd);
 574	ssize_t ret = -EBADF;
 575
 576	if (f.file) {
 577		loff_t pos = file_pos_read(f.file);
 578		ret = vfs_read(f.file, buf, count, &pos);
 579		if (ret >= 0)
 580			file_pos_write(f.file, pos);
 581		fdput_pos(f);
 582	}
 583	return ret;
 584}
 585
 586SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 587{
 588	return ksys_read(fd, buf, count);
 589}
 590
 591ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 592{
 593	struct fd f = fdget_pos(fd);
 594	ssize_t ret = -EBADF;
 595
 596	if (f.file) {
 597		loff_t pos = file_pos_read(f.file);
 598		ret = vfs_write(f.file, buf, count, &pos);
 599		if (ret >= 0)
 600			file_pos_write(f.file, pos);
 601		fdput_pos(f);
 602	}
 603
 604	return ret;
 605}
 606
 607SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 608		size_t, count)
 609{
 610	return ksys_write(fd, buf, count);
 611}
 612
 613ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 614		     loff_t pos)
 615{
 616	struct fd f;
 617	ssize_t ret = -EBADF;
 618
 619	if (pos < 0)
 620		return -EINVAL;
 621
 622	f = fdget(fd);
 623	if (f.file) {
 624		ret = -ESPIPE;
 625		if (f.file->f_mode & FMODE_PREAD)
 626			ret = vfs_read(f.file, buf, count, &pos);
 627		fdput(f);
 628	}
 629
 630	return ret;
 631}
 632
 633SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 634			size_t, count, loff_t, pos)
 635{
 636	return ksys_pread64(fd, buf, count, pos);
 637}
 638
 639ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 640		      size_t count, loff_t pos)
 641{
 642	struct fd f;
 643	ssize_t ret = -EBADF;
 644
 645	if (pos < 0)
 646		return -EINVAL;
 647
 648	f = fdget(fd);
 649	if (f.file) {
 650		ret = -ESPIPE;
  651		if (f.file->f_mode & FMODE_PWRITE)
 652			ret = vfs_write(f.file, buf, count, &pos);
 653		fdput(f);
 654	}
 655
 656	return ret;
 657}
 658
 659SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 660			 size_t, count, loff_t, pos)
 661{
 662	return ksys_pwrite64(fd, buf, count, pos);
 663}
 664
 665static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 666		loff_t *ppos, int type, rwf_t flags)
 667{
 668	struct kiocb kiocb;
 669	ssize_t ret;
 670
 671	init_sync_kiocb(&kiocb, filp);
 672	ret = kiocb_set_rw_flags(&kiocb, flags);
 673	if (ret)
 674		return ret;
 675	kiocb.ki_pos = *ppos;
 676
 677	if (type == READ)
 678		ret = call_read_iter(filp, &kiocb, iter);
 679	else
 680		ret = call_write_iter(filp, &kiocb, iter);
 681	BUG_ON(ret == -EIOCBQUEUED);
 682	*ppos = kiocb.ki_pos;
 683	return ret;
 684}
 685
 686/* Do it by hand, with file-ops */
 687static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 688		loff_t *ppos, int type, rwf_t flags)
 689{
 690	ssize_t ret = 0;
 691
 692	if (flags & ~RWF_HIPRI)
 693		return -EOPNOTSUPP;
 694
 695	while (iov_iter_count(iter)) {
 696		struct iovec iovec = iov_iter_iovec(iter);
 697		ssize_t nr;
 698
 699		if (type == READ) {
 700			nr = filp->f_op->read(filp, iovec.iov_base,
 701					      iovec.iov_len, ppos);
 702		} else {
 703			nr = filp->f_op->write(filp, iovec.iov_base,
 704					       iovec.iov_len, ppos);
 705		}
 706
 707		if (nr < 0) {
 708			if (!ret)
 709				ret = nr;
 710			break;
 711		}
 712		ret += nr;
 713		if (nr != iovec.iov_len)
 714			break;
 715		iov_iter_advance(iter, nr);
 716	}
 717
 718	return ret;
 719}
 720
 721/* A write operation does a read from user space and vice versa */
 722#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 723
 724/**
 725 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 726 *     into the kernel and check that it is valid.
 727 *
 728 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 729 * @uvector: Pointer to the userspace array.
 730 * @nr_segs: Number of elements in userspace array.
 731 * @fast_segs: Number of elements in @fast_pointer.
 732 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 733 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 734 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 735 *     depending on which array was used.
 736 *
 737 * This function copies an array of &struct iovec of @nr_segs from
 738 * userspace into the kernel and checks that each element is valid (e.g.
 739 * it does not point to a kernel address or cause overflow by being too
 740 * large, etc.).
 741 *
 742 * As an optimization, the caller may provide a pointer to a small
 743 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 744 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 745 *
 746 * @ret_pointer will always point to the array that was used, so the
 747 * caller must take care not to call kfree() on it e.g. in case the
 748 * @fast_pointer array was used and it was allocated on the stack.
 749 *
 750 * Return: The total number of bytes covered by the iovec array on success
 751 *   or a negative error code on error.
 752 */
 753ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 754			      unsigned long nr_segs, unsigned long fast_segs,
 755			      struct iovec *fast_pointer,
 756			      struct iovec **ret_pointer)
 757{
 758	unsigned long seg;
 759	ssize_t ret;
 760	struct iovec *iov = fast_pointer;
 761
 762	/*
 763	 * SuS says "The readv() function *may* fail if the iovcnt argument
 764	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 765	 * traditionally returned zero for zero segments, so...
 766	 */
 767	if (nr_segs == 0) {
 768		ret = 0;
 769		goto out;
 770	}
 771
 772	/*
 773	 * First get the "struct iovec" from user memory and
 774	 * verify all the pointers
 775	 */
 776	if (nr_segs > UIO_MAXIOV) {
 777		ret = -EINVAL;
 778		goto out;
 779	}
 780	if (nr_segs > fast_segs) {
 781		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 782		if (iov == NULL) {
 783			ret = -ENOMEM;
 784			goto out;
 785		}
 786	}
 787	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 788		ret = -EFAULT;
 789		goto out;
 790	}
 791
 792	/*
 793	 * According to the Single Unix Specification we should return EINVAL
 794	 * if an element length is < 0 when cast to ssize_t or if the
 795	 * total length would overflow the ssize_t return value of the
 796	 * system call.
 797	 *
 798	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 799	 * overflow case.
 800	 */
 801	ret = 0;
 802	for (seg = 0; seg < nr_segs; seg++) {
 803		void __user *buf = iov[seg].iov_base;
 804		ssize_t len = (ssize_t)iov[seg].iov_len;
 805
  806		/* see if we're about to use an invalid len or if
 807		 * it's about to overflow ssize_t */
 808		if (len < 0) {
 809			ret = -EINVAL;
 810			goto out;
 811		}
 812		if (type >= 0
 813		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 814			ret = -EFAULT;
 815			goto out;
 816		}
 817		if (len > MAX_RW_COUNT - ret) {
 818			len = MAX_RW_COUNT - ret;
 819			iov[seg].iov_len = len;
 820		}
 821		ret += len;
 822	}
 823out:
 824	*ret_pointer = iov;
 825	return ret;
 826}
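
/*
 * Caller sketch (illustrative, not part of the original file): the usual
 * pattern pairs a small on-stack array with a conditional kfree(), since
 * on return *ret_pointer may point either at the stack array or at a
 * kmalloc()ed one.  example_copy_iovec is a hypothetical wrapper.
 */
static ssize_t example_copy_iovec(const struct iovec __user *uvector,
				  unsigned long nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t len;

	len = rw_copy_check_uvector(READ, uvector, nr_segs, UIO_FASTIOV,
				    iovstack, &iov);
	/* on success iov[] holds nr_segs checked segments, len bytes total */
	if (iov != iovstack)
		kfree(iov);
	return len;
}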
 827
 828#ifdef CONFIG_COMPAT
 829ssize_t compat_rw_copy_check_uvector(int type,
 830		const struct compat_iovec __user *uvector, unsigned long nr_segs,
 831		unsigned long fast_segs, struct iovec *fast_pointer,
 832		struct iovec **ret_pointer)
 833{
 834	compat_ssize_t tot_len;
 835	struct iovec *iov = *ret_pointer = fast_pointer;
 836	ssize_t ret = 0;
 837	int seg;
 838
 839	/*
 840	 * SuS says "The readv() function *may* fail if the iovcnt argument
 841	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 842	 * traditionally returned zero for zero segments, so...
 843	 */
 844	if (nr_segs == 0)
 845		goto out;
 846
 847	ret = -EINVAL;
 848	if (nr_segs > UIO_MAXIOV)
 849		goto out;
 850	if (nr_segs > fast_segs) {
 851		ret = -ENOMEM;
 852		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 853		if (iov == NULL)
 854			goto out;
 855	}
 856	*ret_pointer = iov;
 857
 858	ret = -EFAULT;
 859	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 860		goto out;
 861
 862	/*
 863	 * Single unix specification:
  864 * We should return -EINVAL if an element length is negative or does
  865 * not fit in an ssize_t.
 866	 *
 867	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
 868	 * no overflow possibility.
 869	 */
 870	tot_len = 0;
 871	ret = -EINVAL;
 872	for (seg = 0; seg < nr_segs; seg++) {
 873		compat_uptr_t buf;
 874		compat_ssize_t len;
 875
 876		if (__get_user(len, &uvector->iov_len) ||
 877		   __get_user(buf, &uvector->iov_base)) {
 878			ret = -EFAULT;
 879			goto out;
 880		}
 881		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 882			goto out;
 883		if (type >= 0 &&
 884		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 885			ret = -EFAULT;
 886			goto out;
 887		}
 888		if (len > MAX_RW_COUNT - tot_len)
 889			len = MAX_RW_COUNT - tot_len;
 890		tot_len += len;
 891		iov->iov_base = compat_ptr(buf);
 892		iov->iov_len = (compat_size_t) len;
 893		uvector++;
 894		iov++;
 895	}
 896	ret = tot_len;
 897
 898out:
 899	return ret;
 900}
 901#endif
 902
 903static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 904		loff_t *pos, rwf_t flags)
 905{
 906	size_t tot_len;
 907	ssize_t ret = 0;
 908
 909	if (!(file->f_mode & FMODE_READ))
 910		return -EBADF;
 911	if (!(file->f_mode & FMODE_CAN_READ))
 912		return -EINVAL;
 913
 914	tot_len = iov_iter_count(iter);
 915	if (!tot_len)
 916		goto out;
 917	ret = rw_verify_area(READ, file, pos, tot_len);
 918	if (ret < 0)
 919		return ret;
 920
 921	if (file->f_op->read_iter)
 922		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 923	else
 924		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 925out:
 926	if (ret >= 0)
 927		fsnotify_access(file);
 928	return ret;
 929}
 930
 931ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 932		rwf_t flags)
 933{
 934	if (!file->f_op->read_iter)
 935		return -EINVAL;
 936	return do_iter_read(file, iter, ppos, flags);
 937}
 938EXPORT_SYMBOL(vfs_iter_read);
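
/*
 * Caller sketch (illustrative, not part of the original file): building
 * a single-segment iov_iter over a user buffer and reading through
 * vfs_iter_read(), mirroring what new_sync_read() does above.
 * example_iter_read is a hypothetical helper.
 */
static ssize_t example_iter_read(struct file *filp, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;

	iov_iter_init(&iter, READ, &iov, 1, len);
	return vfs_iter_read(filp, &iter, ppos, 0);
}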
 939
 940static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 941		loff_t *pos, rwf_t flags)
 942{
 943	size_t tot_len;
 944	ssize_t ret = 0;
 945
 946	if (!(file->f_mode & FMODE_WRITE))
 947		return -EBADF;
 948	if (!(file->f_mode & FMODE_CAN_WRITE))
 949		return -EINVAL;
 950
 951	tot_len = iov_iter_count(iter);
 952	if (!tot_len)
 953		return 0;
 954	ret = rw_verify_area(WRITE, file, pos, tot_len);
 955	if (ret < 0)
 956		return ret;
 957
 958	if (file->f_op->write_iter)
 959		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 960	else
 961		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 962	if (ret > 0)
 963		fsnotify_modify(file);
 964	return ret;
 965}
 966
 967ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 968		rwf_t flags)
 969{
 970	if (!file->f_op->write_iter)
 971		return -EINVAL;
 972	return do_iter_write(file, iter, ppos, flags);
 973}
 974EXPORT_SYMBOL(vfs_iter_write);
 975
 976ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 977		  unsigned long vlen, loff_t *pos, rwf_t flags)
 978{
 979	struct iovec iovstack[UIO_FASTIOV];
 980	struct iovec *iov = iovstack;
 981	struct iov_iter iter;
 982	ssize_t ret;
 983
 984	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 985	if (ret >= 0) {
 986		ret = do_iter_read(file, &iter, pos, flags);
 987		kfree(iov);
 988	}
 989
 990	return ret;
 991}
 992
 993static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 994		   unsigned long vlen, loff_t *pos, rwf_t flags)
 995{
 996	struct iovec iovstack[UIO_FASTIOV];
 997	struct iovec *iov = iovstack;
 998	struct iov_iter iter;
 999	ssize_t ret;
1000
1001	ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1002	if (ret >= 0) {
1003		file_start_write(file);
1004		ret = do_iter_write(file, &iter, pos, flags);
1005		file_end_write(file);
1006		kfree(iov);
1007	}
1008	return ret;
1009}
1010
1011static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1012			unsigned long vlen, rwf_t flags)
1013{
1014	struct fd f = fdget_pos(fd);
1015	ssize_t ret = -EBADF;
1016
1017	if (f.file) {
1018		loff_t pos = file_pos_read(f.file);
1019		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1020		if (ret >= 0)
1021			file_pos_write(f.file, pos);
1022		fdput_pos(f);
1023	}
1024
1025	if (ret > 0)
1026		add_rchar(current, ret);
1027	inc_syscr(current);
1028	return ret;
1029}
1030
1031static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1032			 unsigned long vlen, rwf_t flags)
1033{
1034	struct fd f = fdget_pos(fd);
1035	ssize_t ret = -EBADF;
1036
1037	if (f.file) {
1038		loff_t pos = file_pos_read(f.file);
1039		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1040		if (ret >= 0)
1041			file_pos_write(f.file, pos);
1042		fdput_pos(f);
1043	}
1044
1045	if (ret > 0)
1046		add_wchar(current, ret);
1047	inc_syscw(current);
1048	return ret;
1049}
1050
1051static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1052{
1053#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1054	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1055}
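
/*
 * Illustrative note (not in the original file): shifting twice by
 * HALF_LONG_BITS sidesteps a single shift by BITS_PER_LONG, which would
 * be undefined for the 64-bit case, while still reassembling the offset
 * on 32-bit kernels where the halves really are split.  For example, a
 * 32-bit preadv() of offset 0x180000000 passes pos_h = 0x1 and
 * pos_l = 0x80000000:
 *
 *	(((loff_t)0x1 << 16) << 16) | 0x80000000 == 0x180000000
 *
 * On 64-bit kernels pos_l already carries the full offset and the high
 * half is shifted out entirely.
 */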
1056
1057static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1058			 unsigned long vlen, loff_t pos, rwf_t flags)
1059{
1060	struct fd f;
1061	ssize_t ret = -EBADF;
1062
1063	if (pos < 0)
1064		return -EINVAL;
1065
1066	f = fdget(fd);
1067	if (f.file) {
1068		ret = -ESPIPE;
1069		if (f.file->f_mode & FMODE_PREAD)
1070			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1071		fdput(f);
1072	}
1073
1074	if (ret > 0)
1075		add_rchar(current, ret);
1076	inc_syscr(current);
1077	return ret;
1078}
1079
1080static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1081			  unsigned long vlen, loff_t pos, rwf_t flags)
1082{
1083	struct fd f;
1084	ssize_t ret = -EBADF;
1085
1086	if (pos < 0)
1087		return -EINVAL;
1088
1089	f = fdget(fd);
1090	if (f.file) {
1091		ret = -ESPIPE;
1092		if (f.file->f_mode & FMODE_PWRITE)
1093			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1094		fdput(f);
1095	}
1096
1097	if (ret > 0)
1098		add_wchar(current, ret);
1099	inc_syscw(current);
1100	return ret;
1101}
1102
1103SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1104		unsigned long, vlen)
1105{
1106	return do_readv(fd, vec, vlen, 0);
1107}
1108
1109SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1110		unsigned long, vlen)
1111{
1112	return do_writev(fd, vec, vlen, 0);
1113}
1114
1115SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1116		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1117{
1118	loff_t pos = pos_from_hilo(pos_h, pos_l);
1119
1120	return do_preadv(fd, vec, vlen, pos, 0);
1121}
1122
1123SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1124		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1125		rwf_t, flags)
1126{
1127	loff_t pos = pos_from_hilo(pos_h, pos_l);
1128
1129	if (pos == -1)
1130		return do_readv(fd, vec, vlen, flags);
1131
1132	return do_preadv(fd, vec, vlen, pos, flags);
1133}
1134
1135SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1136		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1137{
1138	loff_t pos = pos_from_hilo(pos_h, pos_l);
1139
1140	return do_pwritev(fd, vec, vlen, pos, 0);
1141}
1142
1143SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1144		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1145		rwf_t, flags)
1146{
1147	loff_t pos = pos_from_hilo(pos_h, pos_l);
1148
1149	if (pos == -1)
1150		return do_writev(fd, vec, vlen, flags);
1151
1152	return do_pwritev(fd, vec, vlen, pos, flags);
1153}
1154
1155#ifdef CONFIG_COMPAT
1156static size_t compat_readv(struct file *file,
1157			   const struct compat_iovec __user *vec,
1158			   unsigned long vlen, loff_t *pos, rwf_t flags)
1159{
1160	struct iovec iovstack[UIO_FASTIOV];
1161	struct iovec *iov = iovstack;
1162	struct iov_iter iter;
1163	ssize_t ret;
1164
1165	ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1166	if (ret >= 0) {
1167		ret = do_iter_read(file, &iter, pos, flags);
1168		kfree(iov);
1169	}
1170	if (ret > 0)
1171		add_rchar(current, ret);
1172	inc_syscr(current);
1173	return ret;
1174}
1175
1176static size_t do_compat_readv(compat_ulong_t fd,
1177				 const struct compat_iovec __user *vec,
1178				 compat_ulong_t vlen, rwf_t flags)
1179{
1180	struct fd f = fdget_pos(fd);
1181	ssize_t ret;
1182	loff_t pos;
1183
1184	if (!f.file)
1185		return -EBADF;
1186	pos = f.file->f_pos;
1187	ret = compat_readv(f.file, vec, vlen, &pos, flags);
1188	if (ret >= 0)
1189		f.file->f_pos = pos;
1190	fdput_pos(f);
1191	return ret;
1192
1193}
1194
1195COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1196		const struct compat_iovec __user *,vec,
1197		compat_ulong_t, vlen)
1198{
1199	return do_compat_readv(fd, vec, vlen, 0);
1200}
1201
1202static long do_compat_preadv64(unsigned long fd,
1203				  const struct compat_iovec __user *vec,
1204				  unsigned long vlen, loff_t pos, rwf_t flags)
1205{
1206	struct fd f;
1207	ssize_t ret;
1208
1209	if (pos < 0)
1210		return -EINVAL;
1211	f = fdget(fd);
1212	if (!f.file)
1213		return -EBADF;
1214	ret = -ESPIPE;
1215	if (f.file->f_mode & FMODE_PREAD)
1216		ret = compat_readv(f.file, vec, vlen, &pos, flags);
1217	fdput(f);
1218	return ret;
1219}
1220
1221#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1222COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1223		const struct compat_iovec __user *,vec,
1224		unsigned long, vlen, loff_t, pos)
1225{
1226	return do_compat_preadv64(fd, vec, vlen, pos, 0);
1227}
1228#endif
1229
1230COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1231		const struct compat_iovec __user *,vec,
1232		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1233{
1234	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1235
1236	return do_compat_preadv64(fd, vec, vlen, pos, 0);
1237}
1238
1239#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1240COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1241		const struct compat_iovec __user *,vec,
1242		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1243{
1244	return do_compat_preadv64(fd, vec, vlen, pos, flags);
1245}
1246#endif
1247
1248COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1249		const struct compat_iovec __user *,vec,
1250		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1251		rwf_t, flags)
1252{
1253	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1254
1255	if (pos == -1)
1256		return do_compat_readv(fd, vec, vlen, flags);
1257
1258	return do_compat_preadv64(fd, vec, vlen, pos, flags);
1259}
1260
1261static size_t compat_writev(struct file *file,
1262			    const struct compat_iovec __user *vec,
1263			    unsigned long vlen, loff_t *pos, rwf_t flags)
1264{
1265	struct iovec iovstack[UIO_FASTIOV];
1266	struct iovec *iov = iovstack;
1267	struct iov_iter iter;
1268	ssize_t ret;
1269
1270	ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1271	if (ret >= 0) {
1272		file_start_write(file);
1273		ret = do_iter_write(file, &iter, pos, flags);
1274		file_end_write(file);
1275		kfree(iov);
1276	}
1277	if (ret > 0)
1278		add_wchar(current, ret);
1279	inc_syscw(current);
1280	return ret;
1281}
1282
1283static size_t do_compat_writev(compat_ulong_t fd,
1284				  const struct compat_iovec __user* vec,
1285				  compat_ulong_t vlen, rwf_t flags)
1286{
1287	struct fd f = fdget_pos(fd);
1288	ssize_t ret;
1289	loff_t pos;
1290
1291	if (!f.file)
1292		return -EBADF;
1293	pos = f.file->f_pos;
1294	ret = compat_writev(f.file, vec, vlen, &pos, flags);
1295	if (ret >= 0)
1296		f.file->f_pos = pos;
1297	fdput_pos(f);
1298	return ret;
1299}
1300
1301COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1302		const struct compat_iovec __user *, vec,
1303		compat_ulong_t, vlen)
1304{
1305	return do_compat_writev(fd, vec, vlen, 0);
1306}
1307
1308static long do_compat_pwritev64(unsigned long fd,
1309				   const struct compat_iovec __user *vec,
1310				   unsigned long vlen, loff_t pos, rwf_t flags)
1311{
1312	struct fd f;
1313	ssize_t ret;
1314
1315	if (pos < 0)
1316		return -EINVAL;
1317	f = fdget(fd);
1318	if (!f.file)
1319		return -EBADF;
1320	ret = -ESPIPE;
1321	if (f.file->f_mode & FMODE_PWRITE)
1322		ret = compat_writev(f.file, vec, vlen, &pos, flags);
1323	fdput(f);
1324	return ret;
1325}
1326
1327#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1328COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1329		const struct compat_iovec __user *,vec,
1330		unsigned long, vlen, loff_t, pos)
1331{
1332	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1333}
1334#endif
1335
1336COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1337		const struct compat_iovec __user *,vec,
1338		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1339{
1340	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1341
1342	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1343}
1344
1345#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1346COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1347		const struct compat_iovec __user *,vec,
1348		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1349{
1350	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1351}
1352#endif
1353
1354COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1355		const struct compat_iovec __user *,vec,
1356		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1357{
1358	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1359
1360	if (pos == -1)
1361		return do_compat_writev(fd, vec, vlen, flags);
1362
1363	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1364}
1365
1366#endif
1367
1368static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1369		  	   size_t count, loff_t max)
1370{
1371	struct fd in, out;
1372	struct inode *in_inode, *out_inode;
1373	loff_t pos;
1374	loff_t out_pos;
1375	ssize_t retval;
1376	int fl;
1377
1378	/*
1379	 * Get input file, and verify that it is ok..
1380	 */
1381	retval = -EBADF;
1382	in = fdget(in_fd);
1383	if (!in.file)
1384		goto out;
1385	if (!(in.file->f_mode & FMODE_READ))
1386		goto fput_in;
1387	retval = -ESPIPE;
1388	if (!ppos) {
1389		pos = in.file->f_pos;
1390	} else {
1391		pos = *ppos;
1392		if (!(in.file->f_mode & FMODE_PREAD))
1393			goto fput_in;
1394	}
1395	retval = rw_verify_area(READ, in.file, &pos, count);
1396	if (retval < 0)
1397		goto fput_in;
1398	if (count > MAX_RW_COUNT)
1399		count =  MAX_RW_COUNT;
1400
1401	/*
1402	 * Get output file, and verify that it is ok..
1403	 */
1404	retval = -EBADF;
1405	out = fdget(out_fd);
1406	if (!out.file)
1407		goto fput_in;
1408	if (!(out.file->f_mode & FMODE_WRITE))
1409		goto fput_out;
1410	retval = -EINVAL;
1411	in_inode = file_inode(in.file);
1412	out_inode = file_inode(out.file);
1413	out_pos = out.file->f_pos;
1414	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1415	if (retval < 0)
1416		goto fput_out;
1417
1418	if (!max)
1419		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1420
1421	if (unlikely(pos + count > max)) {
1422		retval = -EOVERFLOW;
1423		if (pos >= max)
1424			goto fput_out;
1425		count = max - pos;
1426	}
1427
1428	fl = 0;
1429#if 0
1430	/*
1431	 * We need to debate whether we can enable this or not. The
1432	 * man page documents EAGAIN return for the output at least,
1433	 * and the application is arguably buggy if it doesn't expect
1434	 * EAGAIN on a non-blocking file descriptor.
1435	 */
1436	if (in.file->f_flags & O_NONBLOCK)
1437		fl = SPLICE_F_NONBLOCK;
1438#endif
1439	file_start_write(out.file);
1440	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1441	file_end_write(out.file);
1442
1443	if (retval > 0) {
1444		add_rchar(current, retval);
1445		add_wchar(current, retval);
1446		fsnotify_access(in.file);
1447		fsnotify_modify(out.file);
1448		out.file->f_pos = out_pos;
1449		if (ppos)
1450			*ppos = pos;
1451		else
1452			in.file->f_pos = pos;
1453	}
1454
1455	inc_syscr(current);
1456	inc_syscw(current);
1457	if (pos > max)
1458		retval = -EOVERFLOW;
1459
1460fput_out:
1461	fdput(out);
1462fput_in:
1463	fdput(in);
1464out:
1465	return retval;
1466}
1467
1468SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1469{
1470	loff_t pos;
1471	off_t off;
1472	ssize_t ret;
1473
1474	if (offset) {
1475		if (unlikely(get_user(off, offset)))
1476			return -EFAULT;
1477		pos = off;
1478		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1479		if (unlikely(put_user(pos, offset)))
1480			return -EFAULT;
1481		return ret;
1482	}
1483
1484	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1485}
1486
1487SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1488{
1489	loff_t pos;
1490	ssize_t ret;
1491
1492	if (offset) {
1493		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1494			return -EFAULT;
1495		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1496		if (unlikely(put_user(pos, offset)))
1497			return -EFAULT;
1498		return ret;
1499	}
1500
1501	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1502}
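
/*
 * Userspace usage sketch (illustrative, not part of the kernel source):
 * a typical sendfile(2) loop copying a regular file to a socket; the
 * kernel side of each call is do_sendfile() above.  When a non-NULL
 * offset is passed, the kernel updates it and leaves the input file's
 * f_pos alone.
 */
#if 0	/* userspace example, shown for context only */
#include <sys/sendfile.h>

static int copy_file_to_socket(int sock_fd, int file_fd, size_t size)
{
	off_t off = 0;

	while ((size_t)off < size) {
		ssize_t n = sendfile(sock_fd, file_fd, &off, size - (size_t)off);

		if (n < 0)
			return -1;	/* errno describes the failure */
		if (n == 0)
			break;		/* file shorter than expected */
	}
	return 0;
}
#endif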
1503
1504#ifdef CONFIG_COMPAT
1505COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1506		compat_off_t __user *, offset, compat_size_t, count)
1507{
1508	loff_t pos;
1509	off_t off;
1510	ssize_t ret;
1511
1512	if (offset) {
1513		if (unlikely(get_user(off, offset)))
1514			return -EFAULT;
1515		pos = off;
1516		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1517		if (unlikely(put_user(pos, offset)))
1518			return -EFAULT;
1519		return ret;
1520	}
1521
1522	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1523}
1524
1525COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1526		compat_loff_t __user *, offset, compat_size_t, count)
1527{
1528	loff_t pos;
1529	ssize_t ret;
1530
1531	if (offset) {
1532		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1533			return -EFAULT;
1534		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1535		if (unlikely(put_user(pos, offset)))
1536			return -EFAULT;
1537		return ret;
1538	}
1539
1540	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1541}
1542#endif
1543
1544/*
1545 * copy_file_range() differs from regular file read and write in that it
 1546 * specifically allows returning partial success.  When it does so is up to
1547 * the copy_file_range method.
1548 */
1549ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1550			    struct file *file_out, loff_t pos_out,
1551			    size_t len, unsigned int flags)
1552{
1553	struct inode *inode_in = file_inode(file_in);
1554	struct inode *inode_out = file_inode(file_out);
1555	ssize_t ret;
1556
1557	if (flags != 0)
1558		return -EINVAL;
1559
1560	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1561		return -EISDIR;
1562	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1563		return -EINVAL;
1564
1565	ret = rw_verify_area(READ, file_in, &pos_in, len);
1566	if (unlikely(ret))
1567		return ret;
1568
1569	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1570	if (unlikely(ret))
1571		return ret;
1572
1573	if (!(file_in->f_mode & FMODE_READ) ||
1574	    !(file_out->f_mode & FMODE_WRITE) ||
1575	    (file_out->f_flags & O_APPEND))
1576		return -EBADF;
1577
1578	/* this could be relaxed once a method supports cross-fs copies */
1579	if (inode_in->i_sb != inode_out->i_sb)
1580		return -EXDEV;
1581
1582	if (len == 0)
1583		return 0;
1584
1585	file_start_write(file_out);
1586
1587	/*
1588	 * Try cloning first, this is supported by more file systems, and
1589	 * more efficient if both clone and copy are supported (e.g. NFS).
1590	 */
1591	if (file_in->f_op->clone_file_range) {
1592		ret = file_in->f_op->clone_file_range(file_in, pos_in,
1593				file_out, pos_out, len);
1594		if (ret == 0) {
1595			ret = len;
1596			goto done;
1597		}
1598	}
1599
1600	if (file_out->f_op->copy_file_range) {
1601		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1602						      pos_out, len, flags);
1603		if (ret != -EOPNOTSUPP)
1604			goto done;
1605	}
1606
1607	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1608			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1609
1610done:
1611	if (ret > 0) {
1612		fsnotify_access(file_in);
1613		add_rchar(current, ret);
1614		fsnotify_modify(file_out);
1615		add_wchar(current, ret);
1616	}
1617
1618	inc_syscr(current);
1619	inc_syscw(current);
1620
1621	file_end_write(file_out);
1622
1623	return ret;
1624}
1625EXPORT_SYMBOL(vfs_copy_file_range);
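
/*
 * Caller sketch (illustrative, not part of the original file): since a
 * short copy is a legitimate result, in-kernel callers that want the
 * whole range duplicated loop until done.  example_copy_all is a
 * hypothetical helper.
 */
static ssize_t example_copy_all(struct file *src, struct file *dst, size_t len)
{
	loff_t pos_in = 0, pos_out = 0;
	size_t done = 0;

	while (done < len) {
		ssize_t n = vfs_copy_file_range(src, pos_in, dst, pos_out,
						len - done, 0);

		if (n < 0)
			return n;
		if (n == 0)
			break;		/* e.g. hit EOF on the source */
		pos_in += n;
		pos_out += n;
		done += n;
	}
	return done;
}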
1626
1627SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1628		int, fd_out, loff_t __user *, off_out,
1629		size_t, len, unsigned int, flags)
1630{
1631	loff_t pos_in;
1632	loff_t pos_out;
1633	struct fd f_in;
1634	struct fd f_out;
1635	ssize_t ret = -EBADF;
1636
1637	f_in = fdget(fd_in);
1638	if (!f_in.file)
1639		goto out2;
1640
1641	f_out = fdget(fd_out);
1642	if (!f_out.file)
1643		goto out1;
1644
1645	ret = -EFAULT;
1646	if (off_in) {
1647		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1648			goto out;
1649	} else {
1650		pos_in = f_in.file->f_pos;
1651	}
1652
1653	if (off_out) {
1654		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1655			goto out;
1656	} else {
1657		pos_out = f_out.file->f_pos;
1658	}
1659
1660	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1661				  flags);
1662	if (ret > 0) {
1663		pos_in += ret;
1664		pos_out += ret;
1665
1666		if (off_in) {
1667			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1668				ret = -EFAULT;
1669		} else {
1670			f_in.file->f_pos = pos_in;
1671		}
1672
1673		if (off_out) {
1674			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1675				ret = -EFAULT;
1676		} else {
1677			f_out.file->f_pos = pos_out;
1678		}
1679	}
1680
1681out:
1682	fdput(f_out);
1683out1:
1684	fdput(f_in);
1685out2:
1686	return ret;
1687}
1688
1689static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1690{
1691	struct inode *inode = file_inode(file);
1692
1693	if (unlikely(pos < 0))
1694		return -EINVAL;
1695
 1696	if (unlikely((loff_t) (pos + len) < 0))
1697		return -EINVAL;
1698
1699	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1700		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1701		int retval;
1702
1703		retval = locks_mandatory_area(inode, file, pos, end,
1704				write ? F_WRLCK : F_RDLCK);
1705		if (retval < 0)
1706			return retval;
1707	}
1708
1709	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1710}
1711
1712/*
1713 * Check that the two inodes are eligible for cloning, the ranges make
1714 * sense, and then flush all dirty data.  Caller must ensure that the
1715 * inodes have been locked against any other modifications.
1716 *
1717 * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1718 * the usual negative error code.
1719 */
1720int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1721			       struct inode *inode_out, loff_t pos_out,
1722			       u64 *len, bool is_dedupe)
1723{
1724	loff_t bs = inode_out->i_sb->s_blocksize;
1725	loff_t blen;
1726	loff_t isize;
1727	bool same_inode = (inode_in == inode_out);
1728	int ret;
1729
1730	/* Don't touch certain kinds of inodes */
1731	if (IS_IMMUTABLE(inode_out))
1732		return -EPERM;
1733
1734	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1735		return -ETXTBSY;
1736
1737	/* Don't reflink dirs, pipes, sockets... */
1738	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1739		return -EISDIR;
1740	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1741		return -EINVAL;
1742
1743	/* Are we going all the way to the end? */
1744	isize = i_size_read(inode_in);
1745	if (isize == 0)
1746		return 0;
1747
1748	/* Zero length dedupe exits immediately; reflink goes to EOF. */
1749	if (*len == 0) {
1750		if (is_dedupe || pos_in == isize)
1751			return 0;
1752		if (pos_in > isize)
1753			return -EINVAL;
1754		*len = isize - pos_in;
1755	}
1756
1757	/* Ensure offsets don't wrap and the input is inside i_size */
1758	if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1759	    pos_in + *len > isize)
1760		return -EINVAL;
1761
1762	/* Don't allow dedupe past EOF in the dest file */
1763	if (is_dedupe) {
1764		loff_t	disize;
1765
1766		disize = i_size_read(inode_out);
1767		if (pos_out >= disize || pos_out + *len > disize)
1768			return -EINVAL;
1769	}
1770
1771	/* If we're linking to EOF, continue to the block boundary. */
1772	if (pos_in + *len == isize)
1773		blen = ALIGN(isize, bs) - pos_in;
1774	else
1775		blen = *len;
1776
1777	/* Only reflink if we're aligned to block boundaries */
1778	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1779	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1780		return -EINVAL;
1781
1782	/* Don't allow overlapped reflink within the same file */
1783	if (same_inode) {
1784		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1785			return -EINVAL;
1786	}
1787
1788	/* Wait for the completion of any pending IOs on both files */
1789	inode_dio_wait(inode_in);
1790	if (!same_inode)
1791		inode_dio_wait(inode_out);
1792
1793	ret = filemap_write_and_wait_range(inode_in->i_mapping,
1794			pos_in, pos_in + *len - 1);
1795	if (ret)
1796		return ret;
1797
1798	ret = filemap_write_and_wait_range(inode_out->i_mapping,
1799			pos_out, pos_out + *len - 1);
1800	if (ret)
1801		return ret;
1802
1803	/*
1804	 * Check that the extents are the same.
1805	 */
1806	if (is_dedupe) {
1807		bool		is_same = false;
1808
1809		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1810				inode_out, pos_out, *len, &is_same);
1811		if (ret)
1812			return ret;
1813		if (!is_same)
1814			return -EBADE;
1815	}
1816
1817	return 1;
1818}
1819EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1820
1821int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1822		struct file *file_out, loff_t pos_out, u64 len)
1823{
1824	struct inode *inode_in = file_inode(file_in);
1825	struct inode *inode_out = file_inode(file_out);
1826	int ret;
1827
1828	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1829		return -EISDIR;
1830	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1831		return -EINVAL;
1832
1833	/*
1834	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1835	 * the same mount. Practically, they only need to be on the same file
1836	 * system.
1837	 */
1838	if (inode_in->i_sb != inode_out->i_sb)
1839		return -EXDEV;
1840
1841	if (!(file_in->f_mode & FMODE_READ) ||
1842	    !(file_out->f_mode & FMODE_WRITE) ||
1843	    (file_out->f_flags & O_APPEND))
1844		return -EBADF;
1845
1846	if (!file_in->f_op->clone_file_range)
1847		return -EOPNOTSUPP;
1848
1849	ret = clone_verify_area(file_in, pos_in, len, false);
1850	if (ret)
1851		return ret;
1852
1853	ret = clone_verify_area(file_out, pos_out, len, true);
1854	if (ret)
1855		return ret;
1856
1857	if (pos_in + len > i_size_read(inode_in))
1858		return -EINVAL;
1859
1860	ret = file_in->f_op->clone_file_range(file_in, pos_in,
1861			file_out, pos_out, len);
1862	if (!ret) {
1863		fsnotify_access(file_in);
1864		fsnotify_modify(file_out);
1865	}
1866
1867	return ret;
1868}
1869EXPORT_SYMBOL(vfs_clone_file_range);
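
/*
 * Userspace usage sketch (illustrative, not part of the kernel source):
 * the FICLONE ioctl used by "cp --reflink=always" reaches this helper
 * through the VFS ioctl code; it clones all of src_fd over dest_fd.
 */
#if 0	/* userspace example, shown for context only */
#include <sys/ioctl.h>
#include <linux/fs.h>

static int reflink_whole_file(int dest_fd, int src_fd)
{
	return ioctl(dest_fd, FICLONE, src_fd);	/* 0 on success, -1 + errno */
}
#endif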
1870
1871/*
1872 * Read a page's worth of file data into the page cache.  Return the page
1873 * locked.
1874 */
1875static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1876{
1877	struct address_space *mapping;
1878	struct page *page;
1879	pgoff_t n;
1880
1881	n = offset >> PAGE_SHIFT;
1882	mapping = inode->i_mapping;
1883	page = read_mapping_page(mapping, n, NULL);
1884	if (IS_ERR(page))
1885		return page;
1886	if (!PageUptodate(page)) {
1887		put_page(page);
1888		return ERR_PTR(-EIO);
1889	}
1890	lock_page(page);
1891	return page;
1892}
1893
1894/*
1895 * Compare extents of two files to see if they are the same.
1896 * Caller must have locked both inodes to prevent write races.
1897 */
1898int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1899				  struct inode *dest, loff_t destoff,
1900				  loff_t len, bool *is_same)
1901{
1902	loff_t src_poff;
1903	loff_t dest_poff;
1904	void *src_addr;
1905	void *dest_addr;
1906	struct page *src_page;
1907	struct page *dest_page;
1908	loff_t cmp_len;
1909	bool same;
1910	int error;
1911
1912	error = -EINVAL;
1913	same = true;
1914	while (len) {
1915		src_poff = srcoff & (PAGE_SIZE - 1);
1916		dest_poff = destoff & (PAGE_SIZE - 1);
1917		cmp_len = min(PAGE_SIZE - src_poff,
1918			      PAGE_SIZE - dest_poff);
1919		cmp_len = min(cmp_len, len);
1920		if (cmp_len <= 0)
1921			goto out_error;
1922
1923		src_page = vfs_dedupe_get_page(src, srcoff);
1924		if (IS_ERR(src_page)) {
1925			error = PTR_ERR(src_page);
1926			goto out_error;
1927		}
1928		dest_page = vfs_dedupe_get_page(dest, destoff);
1929		if (IS_ERR(dest_page)) {
1930			error = PTR_ERR(dest_page);
1931			unlock_page(src_page);
1932			put_page(src_page);
1933			goto out_error;
1934		}
1935		src_addr = kmap_atomic(src_page);
1936		dest_addr = kmap_atomic(dest_page);
1937
1938		flush_dcache_page(src_page);
1939		flush_dcache_page(dest_page);
1940
1941		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1942			same = false;
1943
1944		kunmap_atomic(dest_addr);
1945		kunmap_atomic(src_addr);
1946		unlock_page(dest_page);
1947		unlock_page(src_page);
1948		put_page(dest_page);
1949		put_page(src_page);
1950
1951		if (!same)
1952			break;
1953
1954		srcoff += cmp_len;
1955		destoff += cmp_len;
1956		len -= cmp_len;
1957	}
1958
1959	*is_same = same;
1960	return 0;
1961
1962out_error:
1963	return error;
1964}
1965EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1966
1967int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1968{
1969	struct file_dedupe_range_info *info;
1970	struct inode *src = file_inode(file);
1971	u64 off;
1972	u64 len;
1973	int i;
1974	int ret;
1975	bool is_admin = capable(CAP_SYS_ADMIN);
1976	u16 count = same->dest_count;
1977	struct file *dst_file;
1978	loff_t dst_off;
1979	ssize_t deduped;
1980
1981	if (!(file->f_mode & FMODE_READ))
1982		return -EINVAL;
1983
1984	if (same->reserved1 || same->reserved2)
1985		return -EINVAL;
1986
1987	off = same->src_offset;
1988	len = same->src_length;
1989
1990	ret = -EISDIR;
1991	if (S_ISDIR(src->i_mode))
1992		goto out;
1993
1994	ret = -EINVAL;
1995	if (!S_ISREG(src->i_mode))
1996		goto out;
1997
1998	ret = clone_verify_area(file, off, len, false);
1999	if (ret < 0)
2000		goto out;
2001	ret = 0;
2002
2003	if (off + len > i_size_read(src))
2004		return -EINVAL;
2005
2006	/* pre-format output fields to sane values */
2007	for (i = 0; i < count; i++) {
2008		same->info[i].bytes_deduped = 0ULL;
2009		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2010	}
2011
2012	for (i = 0, info = same->info; i < count; i++, info++) {
2013		struct inode *dst;
2014		struct fd dst_fd = fdget(info->dest_fd);
2015
2016		dst_file = dst_fd.file;
2017		if (!dst_file) {
2018			info->status = -EBADF;
2019			goto next_loop;
2020		}
2021		dst = file_inode(dst_file);
2022
2023		ret = mnt_want_write_file(dst_file);
2024		if (ret) {
2025			info->status = ret;
2026			goto next_loop;
2027		}
2028
2029		dst_off = info->dest_offset;
2030		ret = clone_verify_area(dst_file, dst_off, len, true);
2031		if (ret < 0) {
2032			info->status = ret;
2033			goto next_file;
2034		}
2035		ret = 0;
2036
2037		if (info->reserved) {
2038			info->status = -EINVAL;
2039		} else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
2040			info->status = -EINVAL;
2041		} else if (file->f_path.mnt != dst_file->f_path.mnt) {
2042			info->status = -EXDEV;
2043		} else if (S_ISDIR(dst->i_mode)) {
2044			info->status = -EISDIR;
2045		} else if (dst_file->f_op->dedupe_file_range == NULL) {
2046			info->status = -EINVAL;
2047		} else {
2048			deduped = dst_file->f_op->dedupe_file_range(file, off,
2049							len, dst_file,
2050							info->dest_offset);
2051			if (deduped == -EBADE)
2052				info->status = FILE_DEDUPE_RANGE_DIFFERS;
2053			else if (deduped < 0)
2054				info->status = deduped;
2055			else
2056				info->bytes_deduped += deduped;
2057		}
2058
2059next_file:
2060		mnt_drop_write_file(dst_file);
2061next_loop:
2062		fdput(dst_fd);
2063
2064		if (fatal_signal_pending(current))
2065			goto out;
2066	}
2067
2068out:
2069	return ret;
2070}
2071EXPORT_SYMBOL(vfs_dedupe_file_range);
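
/*
 * Userspace usage sketch (illustrative, not part of the kernel source):
 * FIDEDUPERANGE hands this function a struct file_dedupe_range with a
 * flexible array of destinations.  The sketch below asks the filesystem
 * to dedupe "len" bytes at offset 0 of src_fd against offset 0 of a
 * single destination fd.
 */
#if 0	/* userspace example, shown for context only */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int dedupe_one(int src_fd, int dest_fd, __u64 len)
{
	struct file_dedupe_range *range;
	int ret;

	range = calloc(1, sizeof(*range) + sizeof(range->info[0]));
	if (!range)
		return -1;
	range->src_offset = 0;
	range->src_length = len;
	range->dest_count = 1;
	range->info[0].dest_fd = dest_fd;
	range->info[0].dest_offset = 0;

	ret = ioctl(src_fd, FIDEDUPERANGE, range);
	if (ret == 0 && range->info[0].status != FILE_DEDUPE_RANGE_SAME)
		ret = -1;	/* range differed or a per-destination error */
	free(range);
	return ret;
}
#endif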