v4.10.11
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/fsnotify.h>
  13#include <linux/security.h>
  14#include <linux/export.h>
  15#include <linux/syscalls.h>
  16#include <linux/pagemap.h>
  17#include <linux/splice.h>
  18#include <linux/compat.h>
  19#include <linux/mount.h>
  20#include <linux/fs.h>
  21#include "internal.h"
  22
  23#include <linux/uaccess.h>
  24#include <asm/unistd.h>
  25
  26typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  27typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  28
  29const struct file_operations generic_ro_fops = {
  30	.llseek		= generic_file_llseek,
  31	.read_iter	= generic_file_read_iter,
  32	.mmap		= generic_file_readonly_mmap,
  33	.splice_read	= generic_file_splice_read,
  34};
  35
  36EXPORT_SYMBOL(generic_ro_fops);
  37
  38static inline int unsigned_offsets(struct file *file)
  39{
  40	return file->f_mode & FMODE_UNSIGNED_OFFSET;
  41}
  42
  43/**
  44 * vfs_setpos - update the file offset for lseek
  45 * @file:	file structure in question
  46 * @offset:	file offset to seek to
  47 * @maxsize:	maximum file size
  48 *
  49 * This is a low-level filesystem helper for updating the file offset to
  50 * the value specified by @offset if the given offset is valid and it is
  51 * not equal to the current file offset.
  52 *
  53 * Return the specified offset on success and -EINVAL on invalid offset.
  54 */
  55loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  56{
  57	if (offset < 0 && !unsigned_offsets(file))
  58		return -EINVAL;
  59	if (offset > maxsize)
  60		return -EINVAL;
  61
  62	if (offset != file->f_pos) {
  63		file->f_pos = offset;
  64		file->f_version = 0;
  65	}
  66	return offset;
  67}
  68EXPORT_SYMBOL(vfs_setpos);
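/*
 * Editor-added illustrative sketch, not part of read_write.c: a driver
 * ->llseek method can resolve @whence itself and let vfs_setpos() do the
 * range check and f_pos update.  The exdev_* names and EXDEV_SIZE are
 * hypothetical.
 */
#include <linux/fs.h>

#define EXDEV_SIZE	4096	/* fixed size of the hypothetical device */

static loff_t exdev_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_SET:
		break;
	case SEEK_CUR:
		offset += file->f_pos;
		break;
	case SEEK_END:
		offset += EXDEV_SIZE;
		break;
	default:
		return -EINVAL;
	}
	/* rejects negative or too-large offsets, updates f_pos only on change */
	return vfs_setpos(file, offset, EXDEV_SIZE);
}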
  69
  70/**
  71 * generic_file_llseek_size - generic llseek implementation for regular files
  72 * @file:	file structure to seek on
  73 * @offset:	file offset to seek to
  74 * @whence:	type of seek
   75 * @maxsize:	max size of this file in file system
  76 * @eof:	offset used for SEEK_END position
  77 *
  78 * This is a variant of generic_file_llseek that allows passing in a custom
  79 * maximum file size and a custom EOF position, for e.g. hashed directories
  80 *
  81 * Synchronization:
  82 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  83 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  84 * read/writes behave like SEEK_SET against seeks.
  85 */
  86loff_t
  87generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  88		loff_t maxsize, loff_t eof)
  89{
  90	switch (whence) {
  91	case SEEK_END:
  92		offset += eof;
  93		break;
  94	case SEEK_CUR:
  95		/*
  96		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  97		 * position-querying operation.  Avoid rewriting the "same"
  98		 * f_pos value back to the file because a concurrent read(),
  99		 * write() or lseek() might have altered it
 100		 */
 101		if (offset == 0)
 102			return file->f_pos;
 103		/*
 104		 * f_lock protects against read/modify/write race with other
 105		 * SEEK_CURs. Note that parallel writes and reads behave
 106		 * like SEEK_SET.
 107		 */
 108		spin_lock(&file->f_lock);
 109		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 110		spin_unlock(&file->f_lock);
 111		return offset;
 112	case SEEK_DATA:
 113		/*
 114		 * In the generic case the entire file is data, so as long as
 115		 * offset isn't at the end of the file then the offset is data.
 116		 */
 117		if (offset >= eof)
 118			return -ENXIO;
 119		break;
 120	case SEEK_HOLE:
 121		/*
 122		 * There is a virtual hole at the end of the file, so as long as
 123		 * offset isn't i_size or larger, return i_size.
 124		 */
 125		if (offset >= eof)
 126			return -ENXIO;
 127		offset = eof;
 128		break;
 129	}
 130
 131	return vfs_setpos(file, offset, maxsize);
 132}
 133EXPORT_SYMBOL(generic_file_llseek_size);
 134
 135/**
 136 * generic_file_llseek - generic llseek implementation for regular files
 137 * @file:	file structure to seek on
 138 * @offset:	file offset to seek to
 139 * @whence:	type of seek
 140 *
  141 * This is a generic implementation of ->llseek usable for all normal local
 142 * filesystems.  It just updates the file offset to the value specified by
 143 * @offset and @whence.
 144 */
 145loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 146{
 147	struct inode *inode = file->f_mapping->host;
 148
 149	return generic_file_llseek_size(file, offset, whence,
 150					inode->i_sb->s_maxbytes,
 151					i_size_read(inode));
 152}
 153EXPORT_SYMBOL(generic_file_llseek);
 154
 155/**
 156 * fixed_size_llseek - llseek implementation for fixed-sized devices
 157 * @file:	file structure to seek on
 158 * @offset:	file offset to seek to
 159 * @whence:	type of seek
 160 * @size:	size of the file
 161 *
 162 */
 163loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 164{
 165	switch (whence) {
 166	case SEEK_SET: case SEEK_CUR: case SEEK_END:
 167		return generic_file_llseek_size(file, offset, whence,
 168						size, size);
 169	default:
 170		return -EINVAL;
 171	}
 172}
 173EXPORT_SYMBOL(fixed_size_llseek);
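/*
 * Editor-added illustrative sketch, not part of read_write.c: a device that
 * exposes a fixed 64 KiB window typically just forwards ->llseek to
 * fixed_size_llseek().  The exwin_* names are hypothetical.
 */
#include <linux/fs.h>
#include <linux/module.h>

static loff_t exwin_llseek(struct file *file, loff_t offset, int whence)
{
	return fixed_size_llseek(file, offset, whence, 64 * 1024);
}

static const struct file_operations exwin_fops = {
	.owner	= THIS_MODULE,
	.llseek	= exwin_llseek,
	/* plus the driver's read/write methods */
};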
 174
 175/**
 176 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 177 * @file:	file structure to seek on
 178 * @offset:	file offset to seek to
 179 * @whence:	type of seek
 180 *
 181 */
 182loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 183{
 184	switch (whence) {
 185	case SEEK_SET: case SEEK_CUR:
 186		return generic_file_llseek_size(file, offset, whence,
 187						OFFSET_MAX, 0);
 188	default:
 189		return -EINVAL;
 190	}
 191}
 192EXPORT_SYMBOL(no_seek_end_llseek);
 193
 194/**
 195 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 196 * @file:	file structure to seek on
 197 * @offset:	file offset to seek to
 198 * @whence:	type of seek
 199 * @size:	maximal offset allowed
 200 *
 201 */
 202loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 203{
 204	switch (whence) {
 205	case SEEK_SET: case SEEK_CUR:
 206		return generic_file_llseek_size(file, offset, whence,
 207						size, 0);
 208	default:
 209		return -EINVAL;
 210	}
 211}
 212EXPORT_SYMBOL(no_seek_end_llseek_size);
 213
 214/**
 215 * noop_llseek - No Operation Performed llseek implementation
 216 * @file:	file structure to seek on
 217 * @offset:	file offset to seek to
 218 * @whence:	type of seek
 219 *
  220 * This is an implementation of ->llseek usable for the rare special case when
 221 * userspace expects the seek to succeed but the (device) file is actually not
 222 * able to perform the seek. In this case you use noop_llseek() instead of
 223 * falling back to the default implementation of ->llseek.
 224 */
 225loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 226{
 227	return file->f_pos;
 228}
 229EXPORT_SYMBOL(noop_llseek);
 230
 231loff_t no_llseek(struct file *file, loff_t offset, int whence)
 232{
 233	return -ESPIPE;
 234}
 235EXPORT_SYMBOL(no_llseek);
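/*
 * Editor-added illustrative sketch, not part of read_write.c: how the two
 * stubs above differ from a driver's point of view.  exnull_read() is a
 * hypothetical read method, only declared here.
 */
#include <linux/fs.h>

static ssize_t exnull_read(struct file *, char __user *, size_t, loff_t *);

static const struct file_operations exnull_stream_fops = {
	.read	= exnull_read,
	.llseek	= no_llseek,	/* lseek(2) fails with -ESPIPE */
};

static const struct file_operations exnull_quiet_fops = {
	.read	= exnull_read,
	.llseek	= noop_llseek,	/* lseek(2) "succeeds" but f_pos never moves */
};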
 236
 237loff_t default_llseek(struct file *file, loff_t offset, int whence)
 238{
 239	struct inode *inode = file_inode(file);
 240	loff_t retval;
 241
 242	inode_lock(inode);
 243	switch (whence) {
 244		case SEEK_END:
 245			offset += i_size_read(inode);
 246			break;
 247		case SEEK_CUR:
 248			if (offset == 0) {
 249				retval = file->f_pos;
 250				goto out;
 251			}
 252			offset += file->f_pos;
 253			break;
 254		case SEEK_DATA:
 255			/*
 256			 * In the generic case the entire file is data, so as
 257			 * long as offset isn't at the end of the file then the
 258			 * offset is data.
 259			 */
 260			if (offset >= inode->i_size) {
 261				retval = -ENXIO;
 262				goto out;
 263			}
 264			break;
 265		case SEEK_HOLE:
 266			/*
 267			 * There is a virtual hole at the end of the file, so
 268			 * as long as offset isn't i_size or larger, return
 269			 * i_size.
 270			 */
 271			if (offset >= inode->i_size) {
 272				retval = -ENXIO;
 273				goto out;
 274			}
 275			offset = inode->i_size;
 276			break;
 277	}
 278	retval = -EINVAL;
 279	if (offset >= 0 || unsigned_offsets(file)) {
 280		if (offset != file->f_pos) {
 281			file->f_pos = offset;
 282			file->f_version = 0;
 283		}
 284		retval = offset;
 285	}
 286out:
 287	inode_unlock(inode);
 288	return retval;
 289}
 290EXPORT_SYMBOL(default_llseek);
 291
 292loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 293{
 294	loff_t (*fn)(struct file *, loff_t, int);
 295
 296	fn = no_llseek;
 297	if (file->f_mode & FMODE_LSEEK) {
 298		if (file->f_op->llseek)
 299			fn = file->f_op->llseek;
 300	}
 301	return fn(file, offset, whence);
 302}
 303EXPORT_SYMBOL(vfs_llseek);
 304
 305SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 306{
 307	off_t retval;
 308	struct fd f = fdget_pos(fd);
 309	if (!f.file)
 310		return -EBADF;
 311
 312	retval = -EINVAL;
 313	if (whence <= SEEK_MAX) {
 314		loff_t res = vfs_llseek(f.file, offset, whence);
 315		retval = res;
 316		if (res != (loff_t)retval)
 317			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 318	}
 319	fdput_pos(f);
 320	return retval;
 321}
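/*
 * Editor-added usage sketch, not part of read_write.c: the lseek(2) paths
 * implemented above, exercised from user space.  "somefile" is a placeholder
 * path; SEEK_HOLE needs _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("somefile", O_RDONLY);
	if (fd < 0)
		return 1;

	/* offset 0 + SEEK_CUR only reads f_pos (the fast path in SEEK_CUR above) */
	off_t cur = lseek(fd, 0, SEEK_CUR);

	/* SEEK_END adds the offset to the end-of-file position */
	off_t size = lseek(fd, 0, SEEK_END);

	/* SEEK_HOLE at or past EOF fails with ENXIO in the generic implementation */
	off_t hole = lseek(fd, size, SEEK_HOLE);

	printf("cur=%lld size=%lld hole=%lld\n",
	       (long long)cur, (long long)size, (long long)hole);
	close(fd);
	return 0;
}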
 322
 323#ifdef CONFIG_COMPAT
 324COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 325{
 326	return sys_lseek(fd, offset, whence);
 327}
 328#endif
 329
 330#ifdef __ARCH_WANT_SYS_LLSEEK
 331SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 332		unsigned long, offset_low, loff_t __user *, result,
 333		unsigned int, whence)
 334{
 335	int retval;
 336	struct fd f = fdget_pos(fd);
 337	loff_t offset;
 338
 339	if (!f.file)
 340		return -EBADF;
 341
 342	retval = -EINVAL;
 343	if (whence > SEEK_MAX)
 344		goto out_putf;
 345
 346	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 347			whence);
 348
 349	retval = (int)offset;
 350	if (offset >= 0) {
 351		retval = -EFAULT;
 352		if (!copy_to_user(result, &offset, sizeof(offset)))
 353			retval = 0;
 354	}
 355out_putf:
 356	fdput_pos(f);
 357	return retval;
 358}
 359#endif
 360
 361ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
 362{
 363	struct kiocb kiocb;
 364	ssize_t ret;
 365
 366	if (!file->f_op->read_iter)
 367		return -EINVAL;
 368
 369	init_sync_kiocb(&kiocb, file);
 370	kiocb.ki_pos = *ppos;
 371
 372	iter->type |= READ;
 373	ret = file->f_op->read_iter(&kiocb, iter);
 374	BUG_ON(ret == -EIOCBQUEUED);
 375	if (ret > 0)
 376		*ppos = kiocb.ki_pos;
 377	return ret;
 378}
 379EXPORT_SYMBOL(vfs_iter_read);
 380
 381ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 382{
 383	struct kiocb kiocb;
 384	ssize_t ret;
 385
 386	if (!file->f_op->write_iter)
 387		return -EINVAL;
 388
 389	init_sync_kiocb(&kiocb, file);
 390	kiocb.ki_pos = *ppos;
 391
 392	iter->type |= WRITE;
 393	ret = file->f_op->write_iter(&kiocb, iter);
 394	BUG_ON(ret == -EIOCBQUEUED);
 395	if (ret > 0)
 396		*ppos = kiocb.ki_pos;
 397	return ret;
 398}
 399EXPORT_SYMBOL(vfs_iter_write);
 400
 401int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 402{
 403	struct inode *inode;
 404	loff_t pos;
 405	int retval = -EINVAL;
 406
 407	inode = file_inode(file);
 408	if (unlikely((ssize_t) count < 0))
 409		return retval;
 410	pos = *ppos;
 411	if (unlikely(pos < 0)) {
 412		if (!unsigned_offsets(file))
 413			return retval;
 414		if (count >= -pos) /* both values are in 0..LLONG_MAX */
 415			return -EOVERFLOW;
 416	} else if (unlikely((loff_t) (pos + count) < 0)) {
 417		if (!unsigned_offsets(file))
 418			return retval;
 419	}
 420
 421	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 422		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 423				read_write == READ ? F_RDLCK : F_WRLCK);
 424		if (retval < 0)
 425			return retval;
 426	}
 427	return security_file_permission(file,
 428				read_write == READ ? MAY_READ : MAY_WRITE);
 429}
 430
 431static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 432{
 433	struct iovec iov = { .iov_base = buf, .iov_len = len };
 434	struct kiocb kiocb;
 435	struct iov_iter iter;
 436	ssize_t ret;
 437
 438	init_sync_kiocb(&kiocb, filp);
 439	kiocb.ki_pos = *ppos;
 440	iov_iter_init(&iter, READ, &iov, 1, len);
 441
 442	ret = filp->f_op->read_iter(&kiocb, &iter);
 443	BUG_ON(ret == -EIOCBQUEUED);
 444	*ppos = kiocb.ki_pos;
 445	return ret;
 446}
 447
 448ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 449		   loff_t *pos)
 450{
 451	if (file->f_op->read)
 452		return file->f_op->read(file, buf, count, pos);
 453	else if (file->f_op->read_iter)
 454		return new_sync_read(file, buf, count, pos);
 455	else
 456		return -EINVAL;
 457}
 458EXPORT_SYMBOL(__vfs_read);
 459
 460ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 461{
 462	ssize_t ret;
 463
 464	if (!(file->f_mode & FMODE_READ))
 465		return -EBADF;
 466	if (!(file->f_mode & FMODE_CAN_READ))
 467		return -EINVAL;
 468	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 469		return -EFAULT;
 470
 471	ret = rw_verify_area(READ, file, pos, count);
 472	if (!ret) {
 473		if (count > MAX_RW_COUNT)
 474			count =  MAX_RW_COUNT;
 475		ret = __vfs_read(file, buf, count, pos);
 476		if (ret > 0) {
 477			fsnotify_access(file);
 478			add_rchar(current, ret);
 479		}
 480		inc_syscr(current);
 481	}
 482
 483	return ret;
 484}
 485
 486EXPORT_SYMBOL(vfs_read);
 487
 488static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 489{
 490	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 491	struct kiocb kiocb;
 492	struct iov_iter iter;
 493	ssize_t ret;
 494
 495	init_sync_kiocb(&kiocb, filp);
 496	kiocb.ki_pos = *ppos;
 497	iov_iter_init(&iter, WRITE, &iov, 1, len);
 498
 499	ret = filp->f_op->write_iter(&kiocb, &iter);
 500	BUG_ON(ret == -EIOCBQUEUED);
 501	if (ret > 0)
 502		*ppos = kiocb.ki_pos;
 503	return ret;
 504}
 505
 506ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 507		    loff_t *pos)
 508{
 509	if (file->f_op->write)
 510		return file->f_op->write(file, p, count, pos);
 511	else if (file->f_op->write_iter)
 512		return new_sync_write(file, p, count, pos);
 513	else
 514		return -EINVAL;
 515}
 516EXPORT_SYMBOL(__vfs_write);
 517
 518ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 519{
 520	mm_segment_t old_fs;
 521	const char __user *p;
 522	ssize_t ret;
 523
 524	if (!(file->f_mode & FMODE_CAN_WRITE))
 525		return -EINVAL;
 526
 527	old_fs = get_fs();
 528	set_fs(get_ds());
 529	p = (__force const char __user *)buf;
 530	if (count > MAX_RW_COUNT)
 531		count =  MAX_RW_COUNT;
 532	ret = __vfs_write(file, p, count, pos);
 533	set_fs(old_fs);
 534	if (ret > 0) {
 535		fsnotify_modify(file);
 536		add_wchar(current, ret);
 537	}
 538	inc_syscw(current);
 539	return ret;
 540}
 541
 542EXPORT_SYMBOL(__kernel_write);
 543
 544ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 545{
 546	ssize_t ret;
 547
 548	if (!(file->f_mode & FMODE_WRITE))
 549		return -EBADF;
 550	if (!(file->f_mode & FMODE_CAN_WRITE))
 551		return -EINVAL;
 552	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 553		return -EFAULT;
 554
 555	ret = rw_verify_area(WRITE, file, pos, count);
 556	if (!ret) {
 557		if (count > MAX_RW_COUNT)
 558			count =  MAX_RW_COUNT;
 559		file_start_write(file);
 560		ret = __vfs_write(file, buf, count, pos);
 561		if (ret > 0) {
 562			fsnotify_modify(file);
 563			add_wchar(current, ret);
 564		}
 565		inc_syscw(current);
 566		file_end_write(file);
 567	}
 568
 569	return ret;
 570}
 571
 572EXPORT_SYMBOL(vfs_write);
 573
 574static inline loff_t file_pos_read(struct file *file)
 575{
 576	return file->f_pos;
 577}
 578
 579static inline void file_pos_write(struct file *file, loff_t pos)
 580{
 581	file->f_pos = pos;
 582}
 583
 584SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 585{
 586	struct fd f = fdget_pos(fd);
 587	ssize_t ret = -EBADF;
 588
 589	if (f.file) {
 590		loff_t pos = file_pos_read(f.file);
 591		ret = vfs_read(f.file, buf, count, &pos);
 592		if (ret >= 0)
 593			file_pos_write(f.file, pos);
 594		fdput_pos(f);
 595	}
 596	return ret;
 597}
 598
 599SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 600		size_t, count)
 601{
 602	struct fd f = fdget_pos(fd);
 603	ssize_t ret = -EBADF;
 604
 605	if (f.file) {
 606		loff_t pos = file_pos_read(f.file);
 607		ret = vfs_write(f.file, buf, count, &pos);
 608		if (ret >= 0)
 609			file_pos_write(f.file, pos);
 610		fdput_pos(f);
 611	}
 612
 613	return ret;
 614}
 615
 616SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 617			size_t, count, loff_t, pos)
 618{
 619	struct fd f;
 620	ssize_t ret = -EBADF;
 621
 622	if (pos < 0)
 623		return -EINVAL;
 624
 625	f = fdget(fd);
 626	if (f.file) {
 627		ret = -ESPIPE;
 628		if (f.file->f_mode & FMODE_PREAD)
 629			ret = vfs_read(f.file, buf, count, &pos);
 630		fdput(f);
 631	}
 632
 633	return ret;
 634}
 635
 636SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 637			 size_t, count, loff_t, pos)
 638{
 639	struct fd f;
 640	ssize_t ret = -EBADF;
 641
 642	if (pos < 0)
 643		return -EINVAL;
 644
 645	f = fdget(fd);
 646	if (f.file) {
 647		ret = -ESPIPE;
  648		if (f.file->f_mode & FMODE_PWRITE)
 649			ret = vfs_write(f.file, buf, count, &pos);
 650		fdput(f);
 651	}
 652
 653	return ret;
 654}
 655
 656/*
 657 * Reduce an iovec's length in-place.  Return the resulting number of segments
 658 */
 659unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 660{
 661	unsigned long seg = 0;
 662	size_t len = 0;
 663
 664	while (seg < nr_segs) {
 665		seg++;
 666		if (len + iov->iov_len >= to) {
 667			iov->iov_len = to - len;
 668			break;
 669		}
 670		len += iov->iov_len;
 671		iov++;
 672	}
 673	return seg;
 674}
 675EXPORT_SYMBOL(iov_shorten);
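/*
 * Editor-added worked example, not part of read_write.c: with three
 * segments of 100, 200 and 300 bytes and to == 250, iov_shorten() leaves
 * the first segment at 100 bytes, trims the second to 150 (100 + 150 == 250),
 * never looks at the third, and returns 2.
 */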
 676
 677static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 678		loff_t *ppos, iter_fn_t fn, int flags)
 679{
 680	struct kiocb kiocb;
 681	ssize_t ret;
 682
 683	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
 684		return -EOPNOTSUPP;
 685
 686	init_sync_kiocb(&kiocb, filp);
 687	if (flags & RWF_HIPRI)
 688		kiocb.ki_flags |= IOCB_HIPRI;
 689	if (flags & RWF_DSYNC)
 690		kiocb.ki_flags |= IOCB_DSYNC;
 691	if (flags & RWF_SYNC)
 692		kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
 693	kiocb.ki_pos = *ppos;
 694
 695	ret = fn(&kiocb, iter);
 696	BUG_ON(ret == -EIOCBQUEUED);
 697	*ppos = kiocb.ki_pos;
 698	return ret;
 699}
 700
 701/* Do it by hand, with file-ops */
 702static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 703		loff_t *ppos, io_fn_t fn, int flags)
 704{
 705	ssize_t ret = 0;
 706
 707	if (flags & ~RWF_HIPRI)
 708		return -EOPNOTSUPP;
 709
 710	while (iov_iter_count(iter)) {
 711		struct iovec iovec = iov_iter_iovec(iter);
 712		ssize_t nr;
 713
 714		nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
 715
 716		if (nr < 0) {
 717			if (!ret)
 718				ret = nr;
 719			break;
 720		}
 721		ret += nr;
 722		if (nr != iovec.iov_len)
 723			break;
 724		iov_iter_advance(iter, nr);
 725	}
 726
 727	return ret;
 728}
 729
 730/* A write operation does a read from user space and vice versa */
 731#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 732
 733/**
 734 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 735 *     into the kernel and check that it is valid.
 736 *
 737 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 738 * @uvector: Pointer to the userspace array.
 739 * @nr_segs: Number of elements in userspace array.
 740 * @fast_segs: Number of elements in @fast_pointer.
 741 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 742 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 743 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 744 *     depending on which array was used.
 745 *
 746 * This function copies an array of &struct iovec of @nr_segs from
 747 * userspace into the kernel and checks that each element is valid (e.g.
 748 * it does not point to a kernel address or cause overflow by being too
 749 * large, etc.).
 750 *
 751 * As an optimization, the caller may provide a pointer to a small
 752 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 753 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 754 *
 755 * @ret_pointer will always point to the array that was used, so the
 756 * caller must take care not to call kfree() on it e.g. in case the
 757 * @fast_pointer array was used and it was allocated on the stack.
 758 *
 759 * Return: The total number of bytes covered by the iovec array on success
 760 *   or a negative error code on error.
 761 */
 762ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 763			      unsigned long nr_segs, unsigned long fast_segs,
 764			      struct iovec *fast_pointer,
 765			      struct iovec **ret_pointer)
 766{
 767	unsigned long seg;
 768	ssize_t ret;
 769	struct iovec *iov = fast_pointer;
 770
 771	/*
 772	 * SuS says "The readv() function *may* fail if the iovcnt argument
 773	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 774	 * traditionally returned zero for zero segments, so...
 775	 */
 776	if (nr_segs == 0) {
 777		ret = 0;
 778		goto out;
 779	}
 780
 781	/*
 782	 * First get the "struct iovec" from user memory and
 783	 * verify all the pointers
 784	 */
 785	if (nr_segs > UIO_MAXIOV) {
 786		ret = -EINVAL;
 787		goto out;
 788	}
 789	if (nr_segs > fast_segs) {
 790		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 791		if (iov == NULL) {
 792			ret = -ENOMEM;
 793			goto out;
 794		}
 795	}
 796	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 797		ret = -EFAULT;
 798		goto out;
 799	}
 800
 801	/*
 802	 * According to the Single Unix Specification we should return EINVAL
 803	 * if an element length is < 0 when cast to ssize_t or if the
 804	 * total length would overflow the ssize_t return value of the
 805	 * system call.
 806	 *
 807	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 808	 * overflow case.
 809	 */
 810	ret = 0;
 811	for (seg = 0; seg < nr_segs; seg++) {
 812		void __user *buf = iov[seg].iov_base;
 813		ssize_t len = (ssize_t)iov[seg].iov_len;
 814
  815		/* see if we're about to use an invalid len or if
 816		 * it's about to overflow ssize_t */
 817		if (len < 0) {
 818			ret = -EINVAL;
 819			goto out;
 820		}
 821		if (type >= 0
 822		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 823			ret = -EFAULT;
 824			goto out;
 825		}
 826		if (len > MAX_RW_COUNT - ret) {
 827			len = MAX_RW_COUNT - ret;
 828			iov[seg].iov_len = len;
 829		}
 830		ret += len;
 831	}
 832out:
 833	*ret_pointer = iov;
 834	return ret;
 835}
 836
 837static ssize_t do_readv_writev(int type, struct file *file,
 838			       const struct iovec __user * uvector,
 839			       unsigned long nr_segs, loff_t *pos,
 840			       int flags)
 841{
 842	size_t tot_len;
 843	struct iovec iovstack[UIO_FASTIOV];
 844	struct iovec *iov = iovstack;
 845	struct iov_iter iter;
 846	ssize_t ret;
 847	io_fn_t fn;
 848	iter_fn_t iter_fn;
 849
 850	ret = import_iovec(type, uvector, nr_segs,
 851			   ARRAY_SIZE(iovstack), &iov, &iter);
 852	if (ret < 0)
 853		return ret;
 854
 855	tot_len = iov_iter_count(&iter);
 856	if (!tot_len)
 857		goto out;
 858	ret = rw_verify_area(type, file, pos, tot_len);
 859	if (ret < 0)
 860		goto out;
 861
 862	if (type == READ) {
 863		fn = file->f_op->read;
 864		iter_fn = file->f_op->read_iter;
 865	} else {
 866		fn = (io_fn_t)file->f_op->write;
 867		iter_fn = file->f_op->write_iter;
 868		file_start_write(file);
 869	}
 870
 871	if (iter_fn)
 872		ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
 873	else
 874		ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
 875
 876	if (type != READ)
 877		file_end_write(file);
 878
 879out:
 880	kfree(iov);
 881	if ((ret + (type == READ)) > 0) {
 882		if (type == READ)
 883			fsnotify_access(file);
 884		else
 885			fsnotify_modify(file);
 886	}
 887	return ret;
 888}
 889
 890ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 891		  unsigned long vlen, loff_t *pos, int flags)
 892{
 893	if (!(file->f_mode & FMODE_READ))
 894		return -EBADF;
 895	if (!(file->f_mode & FMODE_CAN_READ))
 896		return -EINVAL;
 897
 898	return do_readv_writev(READ, file, vec, vlen, pos, flags);
 899}
 900
 901EXPORT_SYMBOL(vfs_readv);
 902
 903ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 904		   unsigned long vlen, loff_t *pos, int flags)
 905{
 906	if (!(file->f_mode & FMODE_WRITE))
 907		return -EBADF;
 908	if (!(file->f_mode & FMODE_CAN_WRITE))
 909		return -EINVAL;
 910
 911	return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
 912}
 913
 914EXPORT_SYMBOL(vfs_writev);
 915
 916static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 917			unsigned long vlen, int flags)
 918{
 919	struct fd f = fdget_pos(fd);
 920	ssize_t ret = -EBADF;
 921
 922	if (f.file) {
 923		loff_t pos = file_pos_read(f.file);
 924		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
 925		if (ret >= 0)
 926			file_pos_write(f.file, pos);
 927		fdput_pos(f);
 928	}
 929
 930	if (ret > 0)
 931		add_rchar(current, ret);
 932	inc_syscr(current);
 933	return ret;
 934}
 935
 936static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 937			 unsigned long vlen, int flags)
 938{
 939	struct fd f = fdget_pos(fd);
 940	ssize_t ret = -EBADF;
 941
 942	if (f.file) {
 943		loff_t pos = file_pos_read(f.file);
 944		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
 945		if (ret >= 0)
 946			file_pos_write(f.file, pos);
 947		fdput_pos(f);
 948	}
 949
 950	if (ret > 0)
 951		add_wchar(current, ret);
 952	inc_syscw(current);
 953	return ret;
 954}
 955
 956static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 957{
 958#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 959	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 960}
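/*
 * Editor-added worked example, not part of read_write.c: with
 * BITS_PER_LONG == 32, HALF_LONG_BITS is 16, so pos_from_hilo(0x12,
 * 0x34567890) yields ((0x12 << 16) << 16) | 0x34567890 == 0x1234567890.
 * With BITS_PER_LONG == 64 the low word already carries the whole offset
 * and the two half-word shifts simply discard the (redundant) high word,
 * avoiding the undefined behaviour a single shift by 64 would invoke.
 */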
 961
 962static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
 963			 unsigned long vlen, loff_t pos, int flags)
 964{
 965	struct fd f;
 966	ssize_t ret = -EBADF;
 967
 968	if (pos < 0)
 969		return -EINVAL;
 970
 971	f = fdget(fd);
 972	if (f.file) {
 973		ret = -ESPIPE;
 974		if (f.file->f_mode & FMODE_PREAD)
 975			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
 976		fdput(f);
 977	}
 978
 979	if (ret > 0)
 980		add_rchar(current, ret);
 981	inc_syscr(current);
 982	return ret;
 983}
 984
 985static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
 986			  unsigned long vlen, loff_t pos, int flags)
 987{
 988	struct fd f;
 989	ssize_t ret = -EBADF;
 990
 991	if (pos < 0)
 992		return -EINVAL;
 993
 994	f = fdget(fd);
 995	if (f.file) {
 996		ret = -ESPIPE;
 997		if (f.file->f_mode & FMODE_PWRITE)
 998			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
 999		fdput(f);
1000	}
1001
1002	if (ret > 0)
1003		add_wchar(current, ret);
1004	inc_syscw(current);
1005	return ret;
1006}
1007
1008SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1009		unsigned long, vlen)
1010{
1011	return do_readv(fd, vec, vlen, 0);
1012}
1013
1014SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1015		unsigned long, vlen)
1016{
1017	return do_writev(fd, vec, vlen, 0);
1018}
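/*
 * Editor-added usage sketch, not part of read_write.c: gathering two
 * buffers into one write with writev(2), which lands in do_writev()
 * above.  The output path is a placeholder.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/uio.h>

int main(void)
{
	const char *hdr = "header: ";
	const char *body = "payload\n";
	struct iovec iov[2] = {
		{ .iov_base = (void *)hdr,  .iov_len = strlen(hdr)  },
		{ .iov_base = (void *)body, .iov_len = strlen(body) },
	};
	int fd = open("/tmp/writev-demo", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;

	/* one system call, both segments; the file offset advances by the total */
	ssize_t n = writev(fd, iov, 2);
	printf("wrote %zd bytes\n", n);
	close(fd);
	return 0;
}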
1019
1020SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1021		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1022{
1023	loff_t pos = pos_from_hilo(pos_h, pos_l);
1024
1025	return do_preadv(fd, vec, vlen, pos, 0);
1026}
1027
1028SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1029		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1030		int, flags)
1031{
1032	loff_t pos = pos_from_hilo(pos_h, pos_l);
1033
1034	if (pos == -1)
1035		return do_readv(fd, vec, vlen, flags);
1036
1037	return do_preadv(fd, vec, vlen, pos, flags);
1038}
1039
1040SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1041		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1042{
1043	loff_t pos = pos_from_hilo(pos_h, pos_l);
1044
1045	return do_pwritev(fd, vec, vlen, pos, 0);
1046}
1047
1048SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1049		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1050		int, flags)
1051{
1052	loff_t pos = pos_from_hilo(pos_h, pos_l);
1053
1054	if (pos == -1)
1055		return do_writev(fd, vec, vlen, flags);
1056
1057	return do_pwritev(fd, vec, vlen, pos, flags);
1058}
1059
1060#ifdef CONFIG_COMPAT
1061
1062static ssize_t compat_do_readv_writev(int type, struct file *file,
1063			       const struct compat_iovec __user *uvector,
1064			       unsigned long nr_segs, loff_t *pos,
1065			       int flags)
1066{
1067	compat_ssize_t tot_len;
1068	struct iovec iovstack[UIO_FASTIOV];
1069	struct iovec *iov = iovstack;
1070	struct iov_iter iter;
1071	ssize_t ret;
1072	io_fn_t fn;
1073	iter_fn_t iter_fn;
1074
1075	ret = compat_import_iovec(type, uvector, nr_segs,
1076				  UIO_FASTIOV, &iov, &iter);
1077	if (ret < 0)
1078		return ret;
1079
1080	tot_len = iov_iter_count(&iter);
1081	if (!tot_len)
1082		goto out;
1083	ret = rw_verify_area(type, file, pos, tot_len);
1084	if (ret < 0)
1085		goto out;
1086
1087	if (type == READ) {
1088		fn = file->f_op->read;
1089		iter_fn = file->f_op->read_iter;
1090	} else {
1091		fn = (io_fn_t)file->f_op->write;
1092		iter_fn = file->f_op->write_iter;
1093		file_start_write(file);
1094	}
1095
1096	if (iter_fn)
1097		ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
1098	else
1099		ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
1100
1101	if (type != READ)
1102		file_end_write(file);
1103
1104out:
1105	kfree(iov);
1106	if ((ret + (type == READ)) > 0) {
1107		if (type == READ)
1108			fsnotify_access(file);
1109		else
1110			fsnotify_modify(file);
1111	}
1112	return ret;
1113}
1114
1115static size_t compat_readv(struct file *file,
1116			   const struct compat_iovec __user *vec,
1117			   unsigned long vlen, loff_t *pos, int flags)
1118{
1119	ssize_t ret = -EBADF;
1120
1121	if (!(file->f_mode & FMODE_READ))
1122		goto out;
1123
1124	ret = -EINVAL;
1125	if (!(file->f_mode & FMODE_CAN_READ))
1126		goto out;
1127
1128	ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
1129
1130out:
1131	if (ret > 0)
1132		add_rchar(current, ret);
1133	inc_syscr(current);
1134	return ret;
1135}
1136
1137static size_t do_compat_readv(compat_ulong_t fd,
1138				 const struct compat_iovec __user *vec,
1139				 compat_ulong_t vlen, int flags)
1140{
1141	struct fd f = fdget_pos(fd);
1142	ssize_t ret;
1143	loff_t pos;
1144
1145	if (!f.file)
1146		return -EBADF;
1147	pos = f.file->f_pos;
1148	ret = compat_readv(f.file, vec, vlen, &pos, flags);
1149	if (ret >= 0)
1150		f.file->f_pos = pos;
1151	fdput_pos(f);
1152	return ret;
1153
1154}
1155
1156COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1157		const struct compat_iovec __user *,vec,
1158		compat_ulong_t, vlen)
1159{
1160	return do_compat_readv(fd, vec, vlen, 0);
1161}
1162
1163static long do_compat_preadv64(unsigned long fd,
1164				  const struct compat_iovec __user *vec,
1165				  unsigned long vlen, loff_t pos, int flags)
1166{
1167	struct fd f;
1168	ssize_t ret;
1169
1170	if (pos < 0)
1171		return -EINVAL;
1172	f = fdget(fd);
1173	if (!f.file)
1174		return -EBADF;
1175	ret = -ESPIPE;
1176	if (f.file->f_mode & FMODE_PREAD)
1177		ret = compat_readv(f.file, vec, vlen, &pos, flags);
1178	fdput(f);
1179	return ret;
1180}
1181
1182#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1183COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1184		const struct compat_iovec __user *,vec,
1185		unsigned long, vlen, loff_t, pos)
1186{
1187	return do_compat_preadv64(fd, vec, vlen, pos, 0);
1188}
1189#endif
1190
1191COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1192		const struct compat_iovec __user *,vec,
1193		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1194{
1195	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1196
1197	return do_compat_preadv64(fd, vec, vlen, pos, 0);
1198}
1199
1200#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1201COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1202		const struct compat_iovec __user *,vec,
1203		unsigned long, vlen, loff_t, pos, int, flags)
1204{
1205	return do_compat_preadv64(fd, vec, vlen, pos, flags);
1206}
1207#endif
1208
1209COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1210		const struct compat_iovec __user *,vec,
1211		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1212		int, flags)
1213{
1214	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1215
1216	if (pos == -1)
1217		return do_compat_readv(fd, vec, vlen, flags);
1218
1219	return do_compat_preadv64(fd, vec, vlen, pos, flags);
1220}
1221
1222static size_t compat_writev(struct file *file,
1223			    const struct compat_iovec __user *vec,
1224			    unsigned long vlen, loff_t *pos, int flags)
1225{
1226	ssize_t ret = -EBADF;
1227
1228	if (!(file->f_mode & FMODE_WRITE))
1229		goto out;
1230
1231	ret = -EINVAL;
1232	if (!(file->f_mode & FMODE_CAN_WRITE))
1233		goto out;
1234
 1235	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
1236
1237out:
1238	if (ret > 0)
1239		add_wchar(current, ret);
1240	inc_syscw(current);
1241	return ret;
1242}
1243
1244static size_t do_compat_writev(compat_ulong_t fd,
1245				  const struct compat_iovec __user* vec,
1246				  compat_ulong_t vlen, int flags)
1247{
1248	struct fd f = fdget_pos(fd);
1249	ssize_t ret;
1250	loff_t pos;
1251
1252	if (!f.file)
1253		return -EBADF;
1254	pos = f.file->f_pos;
1255	ret = compat_writev(f.file, vec, vlen, &pos, flags);
1256	if (ret >= 0)
1257		f.file->f_pos = pos;
1258	fdput_pos(f);
1259	return ret;
1260}
1261
1262COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1263		const struct compat_iovec __user *, vec,
1264		compat_ulong_t, vlen)
1265{
1266	return do_compat_writev(fd, vec, vlen, 0);
1267}
1268
1269static long do_compat_pwritev64(unsigned long fd,
1270				   const struct compat_iovec __user *vec,
1271				   unsigned long vlen, loff_t pos, int flags)
1272{
1273	struct fd f;
1274	ssize_t ret;
1275
1276	if (pos < 0)
1277		return -EINVAL;
1278	f = fdget(fd);
1279	if (!f.file)
1280		return -EBADF;
1281	ret = -ESPIPE;
1282	if (f.file->f_mode & FMODE_PWRITE)
1283		ret = compat_writev(f.file, vec, vlen, &pos, flags);
1284	fdput(f);
1285	return ret;
1286}
1287
1288#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1289COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1290		const struct compat_iovec __user *,vec,
1291		unsigned long, vlen, loff_t, pos)
1292{
1293	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1294}
1295#endif
1296
1297COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1298		const struct compat_iovec __user *,vec,
1299		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1300{
1301	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1302
1303	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1304}
1305
1306#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1307COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1308		const struct compat_iovec __user *,vec,
1309		unsigned long, vlen, loff_t, pos, int, flags)
1310{
1311	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1312}
1313#endif
1314
1315COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1316		const struct compat_iovec __user *,vec,
1317		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
1318{
1319	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1320
1321	if (pos == -1)
1322		return do_compat_writev(fd, vec, vlen, flags);
1323
1324	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1325}
1326
1327#endif
1328
1329static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1330		  	   size_t count, loff_t max)
1331{
1332	struct fd in, out;
1333	struct inode *in_inode, *out_inode;
1334	loff_t pos;
1335	loff_t out_pos;
1336	ssize_t retval;
1337	int fl;
1338
1339	/*
1340	 * Get input file, and verify that it is ok..
1341	 */
1342	retval = -EBADF;
1343	in = fdget(in_fd);
1344	if (!in.file)
1345		goto out;
1346	if (!(in.file->f_mode & FMODE_READ))
1347		goto fput_in;
1348	retval = -ESPIPE;
1349	if (!ppos) {
1350		pos = in.file->f_pos;
1351	} else {
1352		pos = *ppos;
1353		if (!(in.file->f_mode & FMODE_PREAD))
1354			goto fput_in;
1355	}
1356	retval = rw_verify_area(READ, in.file, &pos, count);
1357	if (retval < 0)
1358		goto fput_in;
1359	if (count > MAX_RW_COUNT)
1360		count =  MAX_RW_COUNT;
1361
1362	/*
1363	 * Get output file, and verify that it is ok..
1364	 */
1365	retval = -EBADF;
1366	out = fdget(out_fd);
1367	if (!out.file)
1368		goto fput_in;
1369	if (!(out.file->f_mode & FMODE_WRITE))
1370		goto fput_out;
1371	retval = -EINVAL;
1372	in_inode = file_inode(in.file);
1373	out_inode = file_inode(out.file);
1374	out_pos = out.file->f_pos;
1375	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1376	if (retval < 0)
1377		goto fput_out;
1378
1379	if (!max)
1380		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1381
1382	if (unlikely(pos + count > max)) {
1383		retval = -EOVERFLOW;
1384		if (pos >= max)
1385			goto fput_out;
1386		count = max - pos;
1387	}
1388
1389	fl = 0;
1390#if 0
1391	/*
1392	 * We need to debate whether we can enable this or not. The
1393	 * man page documents EAGAIN return for the output at least,
1394	 * and the application is arguably buggy if it doesn't expect
1395	 * EAGAIN on a non-blocking file descriptor.
1396	 */
1397	if (in.file->f_flags & O_NONBLOCK)
1398		fl = SPLICE_F_NONBLOCK;
1399#endif
1400	file_start_write(out.file);
1401	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1402	file_end_write(out.file);
1403
1404	if (retval > 0) {
1405		add_rchar(current, retval);
1406		add_wchar(current, retval);
1407		fsnotify_access(in.file);
1408		fsnotify_modify(out.file);
1409		out.file->f_pos = out_pos;
1410		if (ppos)
1411			*ppos = pos;
1412		else
1413			in.file->f_pos = pos;
1414	}
1415
1416	inc_syscr(current);
1417	inc_syscw(current);
1418	if (pos > max)
1419		retval = -EOVERFLOW;
1420
1421fput_out:
1422	fdput(out);
1423fput_in:
1424	fdput(in);
1425out:
1426	return retval;
1427}
1428
1429SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1430{
1431	loff_t pos;
1432	off_t off;
1433	ssize_t ret;
1434
1435	if (offset) {
1436		if (unlikely(get_user(off, offset)))
1437			return -EFAULT;
1438		pos = off;
1439		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1440		if (unlikely(put_user(pos, offset)))
1441			return -EFAULT;
1442		return ret;
1443	}
1444
1445	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1446}
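/*
 * Editor-added usage sketch, not part of read_write.c: copying a file
 * with sendfile(2), which reaches do_sendfile() above.  Paths are
 * placeholders, and real code would loop on short copies.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#include <sys/stat.h>

int main(void)
{
	int in = open("src.bin", O_RDONLY);
	int out = open("dst.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return 1;

	off_t off = 0;	/* explicit offset: the input fd's f_pos is left untouched */
	ssize_t sent = sendfile(out, in, &off, st.st_size);
	printf("copied %zd of %lld bytes\n", sent, (long long)st.st_size);

	close(in);
	close(out);
	return 0;
}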
1447
1448SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1449{
1450	loff_t pos;
1451	ssize_t ret;
1452
1453	if (offset) {
1454		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1455			return -EFAULT;
1456		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1457		if (unlikely(put_user(pos, offset)))
1458			return -EFAULT;
1459		return ret;
1460	}
1461
1462	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1463}
1464
1465#ifdef CONFIG_COMPAT
1466COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1467		compat_off_t __user *, offset, compat_size_t, count)
1468{
1469	loff_t pos;
1470	off_t off;
1471	ssize_t ret;
1472
1473	if (offset) {
1474		if (unlikely(get_user(off, offset)))
1475			return -EFAULT;
1476		pos = off;
1477		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1478		if (unlikely(put_user(pos, offset)))
1479			return -EFAULT;
1480		return ret;
1481	}
1482
1483	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1484}
1485
1486COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1487		compat_loff_t __user *, offset, compat_size_t, count)
1488{
1489	loff_t pos;
1490	ssize_t ret;
1491
1492	if (offset) {
1493		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1494			return -EFAULT;
1495		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1496		if (unlikely(put_user(pos, offset)))
1497			return -EFAULT;
1498		return ret;
1499	}
1500
1501	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1502}
1503#endif
1504
1505/*
1506 * copy_file_range() differs from regular file read and write in that it
 1507 * specifically allows returning partial success.  When it does so is up to
1508 * the copy_file_range method.
1509 */
1510ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1511			    struct file *file_out, loff_t pos_out,
1512			    size_t len, unsigned int flags)
1513{
1514	struct inode *inode_in = file_inode(file_in);
1515	struct inode *inode_out = file_inode(file_out);
1516	ssize_t ret;
1517
1518	if (flags != 0)
1519		return -EINVAL;
1520
1521	ret = rw_verify_area(READ, file_in, &pos_in, len);
1522	if (unlikely(ret))
1523		return ret;
1524
1525	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1526	if (unlikely(ret))
1527		return ret;
1528
1529	if (!(file_in->f_mode & FMODE_READ) ||
1530	    !(file_out->f_mode & FMODE_WRITE) ||
1531	    (file_out->f_flags & O_APPEND))
1532		return -EBADF;
1533
1534	/* this could be relaxed once a method supports cross-fs copies */
1535	if (inode_in->i_sb != inode_out->i_sb)
1536		return -EXDEV;
1537
1538	if (len == 0)
1539		return 0;
1540
1541	sb_start_write(inode_out->i_sb);
1542
1543	/*
1544	 * Try cloning first, this is supported by more file systems, and
1545	 * more efficient if both clone and copy are supported (e.g. NFS).
1546	 */
1547	if (file_in->f_op->clone_file_range) {
1548		ret = file_in->f_op->clone_file_range(file_in, pos_in,
1549				file_out, pos_out, len);
1550		if (ret == 0) {
1551			ret = len;
1552			goto done;
1553		}
1554	}
1555
1556	if (file_out->f_op->copy_file_range) {
1557		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1558						      pos_out, len, flags);
1559		if (ret != -EOPNOTSUPP)
1560			goto done;
1561	}
1562
1563	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1564			len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1565
1566done:
1567	if (ret > 0) {
1568		fsnotify_access(file_in);
1569		add_rchar(current, ret);
1570		fsnotify_modify(file_out);
1571		add_wchar(current, ret);
1572	}
1573
1574	inc_syscr(current);
1575	inc_syscw(current);
1576
1577	sb_end_write(inode_out->i_sb);
1578
1579	return ret;
1580}
1581EXPORT_SYMBOL(vfs_copy_file_range);
1582
1583SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1584		int, fd_out, loff_t __user *, off_out,
1585		size_t, len, unsigned int, flags)
1586{
1587	loff_t pos_in;
1588	loff_t pos_out;
1589	struct fd f_in;
1590	struct fd f_out;
1591	ssize_t ret = -EBADF;
1592
1593	f_in = fdget(fd_in);
1594	if (!f_in.file)
1595		goto out2;
1596
1597	f_out = fdget(fd_out);
1598	if (!f_out.file)
1599		goto out1;
1600
1601	ret = -EFAULT;
1602	if (off_in) {
1603		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1604			goto out;
1605	} else {
1606		pos_in = f_in.file->f_pos;
1607	}
1608
1609	if (off_out) {
1610		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1611			goto out;
1612	} else {
1613		pos_out = f_out.file->f_pos;
1614	}
1615
1616	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1617				  flags);
1618	if (ret > 0) {
1619		pos_in += ret;
1620		pos_out += ret;
1621
1622		if (off_in) {
1623			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1624				ret = -EFAULT;
1625		} else {
1626			f_in.file->f_pos = pos_in;
1627		}
1628
1629		if (off_out) {
1630			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1631				ret = -EFAULT;
1632		} else {
1633			f_out.file->f_pos = pos_out;
1634		}
1635	}
1636
1637out:
1638	fdput(f_out);
1639out1:
1640	fdput(f_in);
1641out2:
1642	return ret;
1643}
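/*
 * Editor-added usage sketch, not part of read_write.c: a user-space copy
 * loop over copy_file_range(2).  It uses the raw syscall so it does not
 * depend on a libc wrapper; the paths are placeholders.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/syscall.h>

static ssize_t cfr(int fd_in, int fd_out, size_t len)
{
	/* NULL offsets: the kernel updates both file positions, as above */
	return syscall(__NR_copy_file_range, fd_in, NULL, fd_out, NULL, len, 0);
}

int main(void)
{
	int in = open("src.bin", O_RDONLY);
	int out = open("dst.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return 1;

	size_t left = st.st_size;
	while (left > 0) {
		ssize_t n = cfr(in, out, left);
		if (n <= 0)
			break;	/* error or nothing copied; real code checks errno */
		left -= n;
	}
	close(in);
	close(out);
	return 0;
}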
1644
1645static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1646{
1647	struct inode *inode = file_inode(file);
1648
1649	if (unlikely(pos < 0))
1650		return -EINVAL;
1651
1652	 if (unlikely((loff_t) (pos + len) < 0))
1653		return -EINVAL;
1654
1655	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1656		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1657		int retval;
1658
1659		retval = locks_mandatory_area(inode, file, pos, end,
1660				write ? F_WRLCK : F_RDLCK);
1661		if (retval < 0)
1662			return retval;
1663	}
1664
1665	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1666}
1667
1668/*
1669 * Check that the two inodes are eligible for cloning, the ranges make
1670 * sense, and then flush all dirty data.  Caller must ensure that the
1671 * inodes have been locked against any other modifications.
1672 *
1673 * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1674 * the usual negative error code.
1675 */
1676int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1677			       struct inode *inode_out, loff_t pos_out,
1678			       u64 *len, bool is_dedupe)
1679{
1680	loff_t bs = inode_out->i_sb->s_blocksize;
1681	loff_t blen;
1682	loff_t isize;
1683	bool same_inode = (inode_in == inode_out);
1684	int ret;
1685
1686	/* Don't touch certain kinds of inodes */
1687	if (IS_IMMUTABLE(inode_out))
1688		return -EPERM;
1689
1690	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1691		return -ETXTBSY;
1692
1693	/* Don't reflink dirs, pipes, sockets... */
1694	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1695		return -EISDIR;
1696	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1697		return -EINVAL;
1698
1699	/* Are we going all the way to the end? */
1700	isize = i_size_read(inode_in);
1701	if (isize == 0)
1702		return 0;
1703
1704	/* Zero length dedupe exits immediately; reflink goes to EOF. */
1705	if (*len == 0) {
1706		if (is_dedupe || pos_in == isize)
1707			return 0;
1708		if (pos_in > isize)
1709			return -EINVAL;
1710		*len = isize - pos_in;
1711	}
1712
1713	/* Ensure offsets don't wrap and the input is inside i_size */
1714	if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1715	    pos_in + *len > isize)
1716		return -EINVAL;
1717
1718	/* Don't allow dedupe past EOF in the dest file */
1719	if (is_dedupe) {
1720		loff_t	disize;
1721
1722		disize = i_size_read(inode_out);
1723		if (pos_out >= disize || pos_out + *len > disize)
1724			return -EINVAL;
1725	}
1726
1727	/* If we're linking to EOF, continue to the block boundary. */
1728	if (pos_in + *len == isize)
1729		blen = ALIGN(isize, bs) - pos_in;
1730	else
1731		blen = *len;
1732
1733	/* Only reflink if we're aligned to block boundaries */
1734	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1735	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1736		return -EINVAL;
1737
1738	/* Don't allow overlapped reflink within the same file */
1739	if (same_inode) {
1740		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1741			return -EINVAL;
1742	}
1743
1744	/* Wait for the completion of any pending IOs on both files */
1745	inode_dio_wait(inode_in);
1746	if (!same_inode)
1747		inode_dio_wait(inode_out);
1748
1749	ret = filemap_write_and_wait_range(inode_in->i_mapping,
1750			pos_in, pos_in + *len - 1);
1751	if (ret)
1752		return ret;
1753
1754	ret = filemap_write_and_wait_range(inode_out->i_mapping,
1755			pos_out, pos_out + *len - 1);
1756	if (ret)
1757		return ret;
1758
1759	/*
1760	 * Check that the extents are the same.
1761	 */
1762	if (is_dedupe) {
1763		bool		is_same = false;
1764
1765		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1766				inode_out, pos_out, *len, &is_same);
1767		if (ret)
1768			return ret;
1769		if (!is_same)
1770			return -EBADE;
1771	}
1772
1773	return 1;
1774}
1775EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1776
1777int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1778		struct file *file_out, loff_t pos_out, u64 len)
1779{
1780	struct inode *inode_in = file_inode(file_in);
1781	struct inode *inode_out = file_inode(file_out);
1782	int ret;
1783
1784	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1785		return -EISDIR;
1786	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1787		return -EINVAL;
1788
1789	/*
1790	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1791	 * the same mount. Practically, they only need to be on the same file
1792	 * system.
1793	 */
1794	if (inode_in->i_sb != inode_out->i_sb)
1795		return -EXDEV;
1796
1797	if (!(file_in->f_mode & FMODE_READ) ||
1798	    !(file_out->f_mode & FMODE_WRITE) ||
1799	    (file_out->f_flags & O_APPEND))
1800		return -EBADF;
1801
1802	if (!file_in->f_op->clone_file_range)
1803		return -EOPNOTSUPP;
1804
1805	ret = clone_verify_area(file_in, pos_in, len, false);
1806	if (ret)
1807		return ret;
1808
1809	ret = clone_verify_area(file_out, pos_out, len, true);
1810	if (ret)
1811		return ret;
1812
1813	if (pos_in + len > i_size_read(inode_in))
1814		return -EINVAL;
1815
1816	ret = file_in->f_op->clone_file_range(file_in, pos_in,
1817			file_out, pos_out, len);
1818	if (!ret) {
1819		fsnotify_access(file_in);
1820		fsnotify_modify(file_out);
1821	}
1822
1823	return ret;
1824}
1825EXPORT_SYMBOL(vfs_clone_file_range);
1826
1827/*
1828 * Read a page's worth of file data into the page cache.  Return the page
1829 * locked.
1830 */
1831static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1832{
1833	struct address_space *mapping;
1834	struct page *page;
1835	pgoff_t n;
1836
1837	n = offset >> PAGE_SHIFT;
1838	mapping = inode->i_mapping;
1839	page = read_mapping_page(mapping, n, NULL);
1840	if (IS_ERR(page))
1841		return page;
1842	if (!PageUptodate(page)) {
1843		put_page(page);
1844		return ERR_PTR(-EIO);
1845	}
1846	lock_page(page);
1847	return page;
1848}
1849
1850/*
1851 * Compare extents of two files to see if they are the same.
1852 * Caller must have locked both inodes to prevent write races.
1853 */
1854int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1855				  struct inode *dest, loff_t destoff,
1856				  loff_t len, bool *is_same)
1857{
1858	loff_t src_poff;
1859	loff_t dest_poff;
1860	void *src_addr;
1861	void *dest_addr;
1862	struct page *src_page;
1863	struct page *dest_page;
1864	loff_t cmp_len;
1865	bool same;
1866	int error;
1867
1868	error = -EINVAL;
1869	same = true;
1870	while (len) {
1871		src_poff = srcoff & (PAGE_SIZE - 1);
1872		dest_poff = destoff & (PAGE_SIZE - 1);
1873		cmp_len = min(PAGE_SIZE - src_poff,
1874			      PAGE_SIZE - dest_poff);
1875		cmp_len = min(cmp_len, len);
1876		if (cmp_len <= 0)
1877			goto out_error;
1878
1879		src_page = vfs_dedupe_get_page(src, srcoff);
1880		if (IS_ERR(src_page)) {
1881			error = PTR_ERR(src_page);
1882			goto out_error;
1883		}
1884		dest_page = vfs_dedupe_get_page(dest, destoff);
1885		if (IS_ERR(dest_page)) {
1886			error = PTR_ERR(dest_page);
1887			unlock_page(src_page);
1888			put_page(src_page);
1889			goto out_error;
1890		}
1891		src_addr = kmap_atomic(src_page);
1892		dest_addr = kmap_atomic(dest_page);
1893
1894		flush_dcache_page(src_page);
1895		flush_dcache_page(dest_page);
1896
1897		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1898			same = false;
1899
1900		kunmap_atomic(dest_addr);
1901		kunmap_atomic(src_addr);
1902		unlock_page(dest_page);
1903		unlock_page(src_page);
1904		put_page(dest_page);
1905		put_page(src_page);
1906
1907		if (!same)
1908			break;
1909
1910		srcoff += cmp_len;
1911		destoff += cmp_len;
1912		len -= cmp_len;
1913	}
1914
1915	*is_same = same;
1916	return 0;
1917
1918out_error:
1919	return error;
1920}
1921EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1922
1923int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1924{
1925	struct file_dedupe_range_info *info;
1926	struct inode *src = file_inode(file);
1927	u64 off;
1928	u64 len;
1929	int i;
1930	int ret;
1931	bool is_admin = capable(CAP_SYS_ADMIN);
1932	u16 count = same->dest_count;
1933	struct file *dst_file;
1934	loff_t dst_off;
1935	ssize_t deduped;
1936
1937	if (!(file->f_mode & FMODE_READ))
1938		return -EINVAL;
1939
1940	if (same->reserved1 || same->reserved2)
1941		return -EINVAL;
1942
1943	off = same->src_offset;
1944	len = same->src_length;
1945
1946	ret = -EISDIR;
1947	if (S_ISDIR(src->i_mode))
1948		goto out;
1949
1950	ret = -EINVAL;
1951	if (!S_ISREG(src->i_mode))
1952		goto out;
1953
1954	ret = clone_verify_area(file, off, len, false);
1955	if (ret < 0)
1956		goto out;
1957	ret = 0;
1958
1959	if (off + len > i_size_read(src))
1960		return -EINVAL;
1961
1962	/* pre-format output fields to sane values */
1963	for (i = 0; i < count; i++) {
1964		same->info[i].bytes_deduped = 0ULL;
1965		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
1966	}
1967
1968	for (i = 0, info = same->info; i < count; i++, info++) {
1969		struct inode *dst;
1970		struct fd dst_fd = fdget(info->dest_fd);
1971
1972		dst_file = dst_fd.file;
1973		if (!dst_file) {
1974			info->status = -EBADF;
1975			goto next_loop;
1976		}
1977		dst = file_inode(dst_file);
1978
1979		ret = mnt_want_write_file(dst_file);
1980		if (ret) {
1981			info->status = ret;
1982			goto next_loop;
1983		}
1984
1985		dst_off = info->dest_offset;
1986		ret = clone_verify_area(dst_file, dst_off, len, true);
1987		if (ret < 0) {
1988			info->status = ret;
1989			goto next_file;
1990		}
1991		ret = 0;
1992
1993		if (info->reserved) {
1994			info->status = -EINVAL;
1995		} else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
1996			info->status = -EINVAL;
1997		} else if (file->f_path.mnt != dst_file->f_path.mnt) {
1998			info->status = -EXDEV;
1999		} else if (S_ISDIR(dst->i_mode)) {
2000			info->status = -EISDIR;
2001		} else if (dst_file->f_op->dedupe_file_range == NULL) {
2002			info->status = -EINVAL;
2003		} else {
2004			deduped = dst_file->f_op->dedupe_file_range(file, off,
2005							len, dst_file,
2006							info->dest_offset);
2007			if (deduped == -EBADE)
2008				info->status = FILE_DEDUPE_RANGE_DIFFERS;
2009			else if (deduped < 0)
2010				info->status = deduped;
2011			else
2012				info->bytes_deduped += deduped;
2013		}
2014
2015next_file:
2016		mnt_drop_write_file(dst_file);
2017next_loop:
2018		fdput(dst_fd);
2019
2020		if (fatal_signal_pending(current))
2021			goto out;
2022	}
2023
2024out:
2025	return ret;
2026}
2027EXPORT_SYMBOL(vfs_dedupe_file_range);
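/*
 * Editor-added usage sketch, not part of read_write.c: driving
 * vfs_dedupe_file_range() from user space through the FIDEDUPERANGE
 * ioctl.  Paths and the 1 MiB length are placeholders; the filesystem
 * must implement ->dedupe_file_range.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int src = open("a.img", O_RDONLY);
	int dst = open("b.img", O_RDWR);
	if (src < 0 || dst < 0)
		return 1;

	struct file_dedupe_range *arg =
		calloc(1, sizeof(*arg) + sizeof(struct file_dedupe_range_info));
	if (!arg)
		return 1;
	arg->src_offset = 0;
	arg->src_length = 1 << 20;		/* candidate range to deduplicate */
	arg->dest_count = 1;
	arg->info[0].dest_fd = dst;
	arg->info[0].dest_offset = 0;

	if (ioctl(src, FIDEDUPERANGE, arg) == 0)
		printf("status=%d deduped=%llu\n", arg->info[0].status,
		       (unsigned long long)arg->info[0].bytes_deduped);

	free(arg);
	close(src);
	close(dst);
	return 0;
}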
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29	.llseek		= generic_file_llseek,
  30	.read_iter	= generic_file_read_iter,
  31	.mmap		= generic_file_readonly_mmap,
  32	.splice_read	= filemap_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39	return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
  44 * @file:	file structure in question
  45 * @offset:	file offset to seek to
  46 * @maxsize:	maximum file size
  47 * @cookie:	cookie to reset
  48 *
  49 * Update the file offset to the value specified by @offset if the given
  50 * offset is valid and it is not equal to the current file offset and
  51 * reset the specified cookie to indicate that a seek happened.
  52 *
  53 * Return the specified offset on success and -EINVAL on invalid offset.
  54 */
  55static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
  56				loff_t maxsize, u64 *cookie)
  57{
  58	if (offset < 0 && !unsigned_offsets(file))
  59		return -EINVAL;
  60	if (offset > maxsize)
  61		return -EINVAL;
  62
  63	if (offset != file->f_pos) {
  64		file->f_pos = offset;
  65		if (cookie)
  66			*cookie = 0;
  67	}
  68	return offset;
  69}
  70
  71/**
  72 * vfs_setpos - update the file offset for lseek
  73 * @file:	file structure in question
  74 * @offset:	file offset to seek to
  75 * @maxsize:	maximum file size
  76 *
  77 * This is a low-level filesystem helper for updating the file offset to
  78 * the value specified by @offset if the given offset is valid and it is
  79 * not equal to the current file offset.
  80 *
  81 * Return the specified offset on success and -EINVAL on invalid offset.
  82 */
  83loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  84{
  85	return vfs_setpos_cookie(file, offset, maxsize, NULL);
  86}
  87EXPORT_SYMBOL(vfs_setpos);
  88
  89/**
  90 * must_set_pos - check whether f_pos has to be updated
  91 * @file: file to seek on
  92 * @offset: offset to use
  93 * @whence: type of seek operation
  94 * @eof: end of file
  95 *
  96 * Check whether f_pos needs to be updated and update @offset according
  97 * to @whence.
  98 *
  99 * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
 100 * updated, and negative error code on failure.
 101 */
 102static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
 103{
 104	switch (whence) {
 105	case SEEK_END:
 106		*offset += eof;
 107		break;
 108	case SEEK_CUR:
 109		/*
 110		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
 111		 * position-querying operation.  Avoid rewriting the "same"
 112		 * f_pos value back to the file because a concurrent read(),
 113		 * write() or lseek() might have altered it
 114		 */
 115		if (*offset == 0) {
 116			*offset = file->f_pos;
 117			return 0;
 118		}
 119		break;
 120	case SEEK_DATA:
 121		/*
 122		 * In the generic case the entire file is data, so as long as
 123		 * offset isn't at the end of the file then the offset is data.
 124		 */
 125		if ((unsigned long long)*offset >= eof)
 126			return -ENXIO;
 127		break;
 128	case SEEK_HOLE:
 129		/*
 130		 * There is a virtual hole at the end of the file, so as long as
 131		 * offset isn't i_size or larger, return i_size.
 132		 */
 133		if ((unsigned long long)*offset >= eof)
 134			return -ENXIO;
 135		*offset = eof;
 136		break;
 137	}
 138
 139	return 1;
 140}
 141
 142/**
 143 * generic_file_llseek_size - generic llseek implementation for regular files
 144 * @file:	file structure to seek on
 145 * @offset:	file offset to seek to
 146 * @whence:	type of seek
 147 * @maxsize:	max size of this file in file system
 148 * @eof:	offset used for SEEK_END position
 149 *
 150 * This is a variant of generic_file_llseek that allows passing in a custom
 151 * maximum file size and a custom EOF position, for e.g. hashed directories
 152 *
 153 * Synchronization:
 154 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 155 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 156 * read/writes behave like SEEK_SET against seeks.
 157 */
 158loff_t
 159generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 160		loff_t maxsize, loff_t eof)
 161{
 162	int ret;
 163
 164	ret = must_set_pos(file, &offset, whence, eof);
 165	if (ret < 0)
 166		return ret;
 167	if (ret == 0)
 168		return offset;
 169
 170	if (whence == SEEK_CUR) {
 171		/*
 172		 * f_lock protects against read/modify/write race with
 173		 * other SEEK_CURs. Note that parallel writes and reads
 174		 * behave like SEEK_SET.
 175		 */
 176		guard(spinlock)(&file->f_lock);
 177		return vfs_setpos(file, file->f_pos + offset, maxsize);
 178	}
 179
 180	return vfs_setpos(file, offset, maxsize);
 181}
 182EXPORT_SYMBOL(generic_file_llseek_size);
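/*
 * Example (illustrative sketch, not part of read_write.c): a filesystem that
 * enforces its own offset limit - the "hashed directories" case mentioned in
 * the comment above - typically wraps generic_file_llseek_size() like this.
 * The examplefs_* names and the 1 TiB cap are hypothetical.
 */
#define EXAMPLEFS_MAX_FILE_SIZE		((loff_t)1 << 40)

static loff_t examplefs_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	/* custom maximum offset, EOF taken from the in-core inode size */
	return generic_file_llseek_size(file, offset, whence,
					EXAMPLEFS_MAX_FILE_SIZE,
					i_size_read(inode));
}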
 183
 184/**
 185 * generic_llseek_cookie - versioned llseek implementation
 186 * @file:	file structure to seek on
 187 * @offset:	file offset to seek to
 188 * @whence:	type of seek
 189 * @cookie:	cookie to update
 190 *
 191 * See generic_file_llseek for a general description and locking assumptions.
 192 *
 193 * In contrast to generic_file_llseek, this function also resets a
 194 * specified cookie to indicate a seek took place.
 195 */
 196loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
 197			     u64 *cookie)
 198{
 199	struct inode *inode = file->f_mapping->host;
 200	loff_t maxsize = inode->i_sb->s_maxbytes;
 201	loff_t eof = i_size_read(inode);
 202	int ret;
 203
 204	if (WARN_ON_ONCE(!cookie))
 205		return -EINVAL;
 206
 207	/*
 208	 * Require that this is only used for directories that guarantee
 209	 * synchronization between readdir and seek so that an update to
 210	 * @cookie is correctly synchronized with concurrent readdir.
 211	 */
 212	if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
 213		return -EINVAL;
 214
 215	ret = must_set_pos(file, &offset, whence, eof);
 216	if (ret < 0)
 217		return ret;
 218	if (ret == 0)
 219		return offset;
 220
 221	/* No need to hold f_lock because we know that f_pos_lock is held. */
 222	if (whence == SEEK_CUR)
 223		return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);
 224
 225	return vfs_setpos_cookie(file, offset, maxsize, cookie);
 226}
 227EXPORT_SYMBOL(generic_llseek_cookie);
 228
 229/**
 230 * generic_file_llseek - generic llseek implementation for regular files
 231 * @file:	file structure to seek on
 232 * @offset:	file offset to seek to
 233 * @whence:	type of seek
 234 *
 235 * This is a generic implementation of ->llseek usable for all normal local
 236 * filesystems.  It just updates the file offset to the value specified by
 237 * @offset and @whence.
 238 */
 239loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 240{
 241	struct inode *inode = file->f_mapping->host;
 242
 243	return generic_file_llseek_size(file, offset, whence,
 244					inode->i_sb->s_maxbytes,
 245					i_size_read(inode));
 246}
 247EXPORT_SYMBOL(generic_file_llseek);
 248
 249/**
 250 * fixed_size_llseek - llseek implementation for fixed-sized devices
 251 * @file:	file structure to seek on
 252 * @offset:	file offset to seek to
 253 * @whence:	type of seek
 254 * @size:	size of the file
 255 *
 256 */
 257loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 258{
 259	switch (whence) {
 260	case SEEK_SET: case SEEK_CUR: case SEEK_END:
 261		return generic_file_llseek_size(file, offset, whence,
 262						size, size);
 263	default:
 264		return -EINVAL;
 265	}
 266}
 267EXPORT_SYMBOL(fixed_size_llseek);
 268
 269/**
 270 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 271 * @file:	file structure to seek on
 272 * @offset:	file offset to seek to
 273 * @whence:	type of seek
 274 *
 275 */
 276loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 277{
 278	switch (whence) {
 279	case SEEK_SET: case SEEK_CUR:
 280		return generic_file_llseek_size(file, offset, whence,
 281						OFFSET_MAX, 0);
 282	default:
 283		return -EINVAL;
 284	}
 285}
 286EXPORT_SYMBOL(no_seek_end_llseek);
 287
 288/**
 289 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 290 * @file:	file structure to seek on
 291 * @offset:	file offset to seek to
 292 * @whence:	type of seek
 293 * @size:	maximal offset allowed
 294 *
 295 */
 296loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 297{
 298	switch (whence) {
 299	case SEEK_SET: case SEEK_CUR:
 300		return generic_file_llseek_size(file, offset, whence,
 301						size, 0);
 302	default:
 303		return -EINVAL;
 304	}
 305}
 306EXPORT_SYMBOL(no_seek_end_llseek_size);
 307
 308/**
 309 * noop_llseek - No Operation Performed llseek implementation
 310 * @file:	file structure to seek on
 311 * @offset:	file offset to seek to
 312 * @whence:	type of seek
 313 *
 314 * This is an implementation of ->llseek usable for the rare special case when
 315 * userspace expects the seek to succeed but the (device) file is actually not
 316 * able to perform the seek. In this case you use noop_llseek() instead of
 317 * falling back to the default implementation of ->llseek.
 318 */
 319loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 320{
 321	return file->f_pos;
 322}
 323EXPORT_SYMBOL(noop_llseek);
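/*
 * Example (illustrative sketch, not part of read_write.c): a driver whose
 * device cannot seek but whose users still expect lseek() to succeed simply
 * plugs noop_llseek() into its file_operations.  exampledev_fops is a
 * hypothetical name; a real driver would also wire up its I/O methods.
 */
static const struct file_operations exampledev_fops = {
	.owner	= THIS_MODULE,
	.llseek	= noop_llseek,	/* lseek() reports f_pos but never moves it */
};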
 324
 325loff_t default_llseek(struct file *file, loff_t offset, int whence)
 326{
 327	struct inode *inode = file_inode(file);
 328	loff_t retval;
 329
 330	inode_lock(inode);
 331	switch (whence) {
 332		case SEEK_END:
 333			offset += i_size_read(inode);
 334			break;
 335		case SEEK_CUR:
 336			if (offset == 0) {
 337				retval = file->f_pos;
 338				goto out;
 339			}
 340			offset += file->f_pos;
 341			break;
 342		case SEEK_DATA:
 343			/*
 344			 * In the generic case the entire file is data, so as
 345			 * long as offset isn't at the end of the file then the
 346			 * offset is data.
 347			 */
 348			if (offset >= inode->i_size) {
 349				retval = -ENXIO;
 350				goto out;
 351			}
 352			break;
 353		case SEEK_HOLE:
 354			/*
 355			 * There is a virtual hole at the end of the file, so
 356			 * as long as offset isn't i_size or larger, return
 357			 * i_size.
 358			 */
 359			if (offset >= inode->i_size) {
 360				retval = -ENXIO;
 361				goto out;
 362			}
 363			offset = inode->i_size;
 364			break;
 365	}
 366	retval = -EINVAL;
 367	if (offset >= 0 || unsigned_offsets(file)) {
 368		if (offset != file->f_pos)
 369			file->f_pos = offset;
 370		retval = offset;
 371	}
 372out:
 373	inode_unlock(inode);
 374	return retval;
 375}
 376EXPORT_SYMBOL(default_llseek);
 377
 378loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 379{
 380	if (!(file->f_mode & FMODE_LSEEK))
 381		return -ESPIPE;
 382	return file->f_op->llseek(file, offset, whence);
 383}
 384EXPORT_SYMBOL(vfs_llseek);
 385
 386static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 387{
 388	off_t retval;
 389	CLASS(fd_pos, f)(fd);
 390	if (fd_empty(f))
 391		return -EBADF;
 392
 393	retval = -EINVAL;
 394	if (whence <= SEEK_MAX) {
 395		loff_t res = vfs_llseek(fd_file(f), offset, whence);
 396		retval = res;
 397		if (res != (loff_t)retval)
 398			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 399	}
 400	return retval;
 401}
 402
 403SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 404{
 405	return ksys_lseek(fd, offset, whence);
 406}
 407
 408#ifdef CONFIG_COMPAT
 409COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 410{
 411	return ksys_lseek(fd, offset, whence);
 412}
 413#endif
 414
 415#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
 416	defined(__ARCH_WANT_SYS_LLSEEK)
 417SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 418		unsigned long, offset_low, loff_t __user *, result,
 419		unsigned int, whence)
 420{
 421	int retval;
 422	CLASS(fd_pos, f)(fd);
 423	loff_t offset;
 424
 425	if (fd_empty(f))
 426		return -EBADF;
 427
 428	if (whence > SEEK_MAX)
 429		return -EINVAL;
 430
 431	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
 432			whence);
 433
 434	retval = (int)offset;
 435	if (offset >= 0) {
 436		retval = -EFAULT;
 437		if (!copy_to_user(result, &offset, sizeof(offset)))
 438			retval = 0;
 439	}
 440	return retval;
 441}
 442#endif
 443
 444int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 445{
 446	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
 447	int ret;
 448
 449	if (unlikely((ssize_t) count < 0))
 450		return -EINVAL;
 451
 452	if (ppos) {
 453		loff_t pos = *ppos;
 454
 455		if (unlikely(pos < 0)) {
 456			if (!unsigned_offsets(file))
 457				return -EINVAL;
 458			if (count >= -pos) /* both values are in 0..LLONG_MAX */
 459				return -EOVERFLOW;
 460		} else if (unlikely((loff_t) (pos + count) < 0)) {
 461			if (!unsigned_offsets(file))
 462				return -EINVAL;
 463		}
 464	}
 465
 466	ret = security_file_permission(file, mask);
 467	if (ret)
 468		return ret;
 469
 470	return fsnotify_file_area_perm(file, mask, ppos, count);
 471}
 472EXPORT_SYMBOL(rw_verify_area);
 473
 474static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 475{
 476	struct kiocb kiocb;
 477	struct iov_iter iter;
 478	ssize_t ret;
 479
 480	init_sync_kiocb(&kiocb, filp);
 481	kiocb.ki_pos = (ppos ? *ppos : 0);
 482	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
 483
 484	ret = filp->f_op->read_iter(&kiocb, &iter);
 485	BUG_ON(ret == -EIOCBQUEUED);
 486	if (ppos)
 487		*ppos = kiocb.ki_pos;
 488	return ret;
 489}
 490
 491static int warn_unsupported(struct file *file, const char *op)
 492{
 493	pr_warn_ratelimited(
 494		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
 495		op, file, current->pid, current->comm);
 496	return -EINVAL;
 497}
 498
 499ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 500{
 501	struct kvec iov = {
 502		.iov_base	= buf,
 503		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 504	};
 505	struct kiocb kiocb;
 506	struct iov_iter iter;
 507	ssize_t ret;
 508
 509	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 510		return -EINVAL;
 511	if (!(file->f_mode & FMODE_CAN_READ))
 512		return -EINVAL;
 513	/*
 514	 * Also fail if ->read_iter and ->read are both wired up as that
 515	 * implies very convoluted semantics.
 516	 */
 517	if (unlikely(!file->f_op->read_iter || file->f_op->read))
 518		return warn_unsupported(file, "read");
 519
 520	init_sync_kiocb(&kiocb, file);
 521	kiocb.ki_pos = pos ? *pos : 0;
 522	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
 523	ret = file->f_op->read_iter(&kiocb, &iter);
 524	if (ret > 0) {
 525		if (pos)
 526			*pos = kiocb.ki_pos;
 527		fsnotify_access(file);
 528		add_rchar(current, ret);
 529	}
 530	inc_syscr(current);
 531	return ret;
 532}
 533
 534ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 535{
 536	ssize_t ret;
 537
 538	ret = rw_verify_area(READ, file, pos, count);
 539	if (ret)
 540		return ret;
 541	return __kernel_read(file, buf, count, pos);
 542}
 543EXPORT_SYMBOL(kernel_read);
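/*
 * Example (illustrative sketch, not part of read_write.c): reading the first
 * bytes of a file from kernel context with kernel_read().  The helper name is
 * hypothetical and error handling is kept minimal.
 */
static int example_read_head(const char *path, void *buf, size_t len)
{
	struct file *filp;
	loff_t pos = 0;
	ssize_t n;

	filp = filp_open(path, O_RDONLY, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	n = kernel_read(filp, buf, len, &pos);	/* advances pos on success */
	filp_close(filp, NULL);

	return n < 0 ? n : 0;
}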
 544
 545ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 546{
 547	ssize_t ret;
 548
 549	if (!(file->f_mode & FMODE_READ))
 550		return -EBADF;
 551	if (!(file->f_mode & FMODE_CAN_READ))
 552		return -EINVAL;
 553	if (unlikely(!access_ok(buf, count)))
 554		return -EFAULT;
 555
 556	ret = rw_verify_area(READ, file, pos, count);
 557	if (ret)
 558		return ret;
 559	if (count > MAX_RW_COUNT)
 560		count =  MAX_RW_COUNT;
 561
 562	if (file->f_op->read)
 563		ret = file->f_op->read(file, buf, count, pos);
 564	else if (file->f_op->read_iter)
 565		ret = new_sync_read(file, buf, count, pos);
 566	else
 567		ret = -EINVAL;
 568	if (ret > 0) {
 569		fsnotify_access(file);
 570		add_rchar(current, ret);
 571	}
 572	inc_syscr(current);
 573	return ret;
 574}
 575
 576static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 577{
 578	struct kiocb kiocb;
 579	struct iov_iter iter;
 580	ssize_t ret;
 581
 582	init_sync_kiocb(&kiocb, filp);
 583	kiocb.ki_pos = (ppos ? *ppos : 0);
 584	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
 585
 586	ret = filp->f_op->write_iter(&kiocb, &iter);
 587	BUG_ON(ret == -EIOCBQUEUED);
 588	if (ret > 0 && ppos)
 589		*ppos = kiocb.ki_pos;
 590	return ret;
 591}
 592
 593/* caller is responsible for file_start_write/file_end_write */
 594ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
 595{
 596	struct kiocb kiocb;
 597	ssize_t ret;
 598
 599	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 600		return -EBADF;
 601	if (!(file->f_mode & FMODE_CAN_WRITE))
 602		return -EINVAL;
 603	/*
 604	 * Also fail if ->write_iter and ->write are both wired up as that
 605	 * implies very convoluted semantics.
 606	 */
 607	if (unlikely(!file->f_op->write_iter || file->f_op->write))
 608		return warn_unsupported(file, "write");
 609
 610	init_sync_kiocb(&kiocb, file);
 611	kiocb.ki_pos = pos ? *pos : 0;
 612	ret = file->f_op->write_iter(&kiocb, from);
 613	if (ret > 0) {
 614		if (pos)
 615			*pos = kiocb.ki_pos;
 616		fsnotify_modify(file);
 617		add_wchar(current, ret);
 618	}
 619	inc_syscw(current);
 620	return ret;
 621}
 622
 623/* caller is responsible for file_start_write/file_end_write */
 624ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 625{
 626	struct kvec iov = {
 627		.iov_base	= (void *)buf,
 628		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 629	};
 630	struct iov_iter iter;
 631	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
 632	return __kernel_write_iter(file, &iter, pos);
 633}
 634/*
 635 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 636 * but autofs is one of the few internal kernel users that actually
 637 * wants this _and_ can be built as a module. So we need to export
 638 * this symbol for autofs, even though it really isn't appropriate
 639 * for any other kernel modules.
 640 */
 641EXPORT_SYMBOL_GPL(__kernel_write);
 642
 643ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 644			    loff_t *pos)
 645{
 646	ssize_t ret;
 647
 648	ret = rw_verify_area(WRITE, file, pos, count);
 649	if (ret)
 650		return ret;
 651
 652	file_start_write(file);
 653	ret =  __kernel_write(file, buf, count, pos);
 654	file_end_write(file);
 655	return ret;
 656}
 657EXPORT_SYMBOL(kernel_write);
 658
 659ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 660{
 661	ssize_t ret;
 662
 663	if (!(file->f_mode & FMODE_WRITE))
 664		return -EBADF;
 665	if (!(file->f_mode & FMODE_CAN_WRITE))
 666		return -EINVAL;
 667	if (unlikely(!access_ok(buf, count)))
 668		return -EFAULT;
 669
 670	ret = rw_verify_area(WRITE, file, pos, count);
 671	if (ret)
 672		return ret;
 673	if (count > MAX_RW_COUNT)
 674		count =  MAX_RW_COUNT;
 675	file_start_write(file);
 676	if (file->f_op->write)
 677		ret = file->f_op->write(file, buf, count, pos);
 678	else if (file->f_op->write_iter)
 679		ret = new_sync_write(file, buf, count, pos);
 680	else
 681		ret = -EINVAL;
 682	if (ret > 0) {
 683		fsnotify_modify(file);
 684		add_wchar(current, ret);
 685	}
 686	inc_syscw(current);
 687	file_end_write(file);
 688	return ret;
 689}
 690
 691/* file_ppos returns &file->f_pos or NULL if file is stream */
 692static inline loff_t *file_ppos(struct file *file)
 693{
 694	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 695}
 696
 697ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 698{
 699	CLASS(fd_pos, f)(fd);
 700	ssize_t ret = -EBADF;
 701
 702	if (!fd_empty(f)) {
 703		loff_t pos, *ppos = file_ppos(fd_file(f));
 704		if (ppos) {
 705			pos = *ppos;
 706			ppos = &pos;
 707		}
 708		ret = vfs_read(fd_file(f), buf, count, ppos);
 709		if (ret >= 0 && ppos)
 710			fd_file(f)->f_pos = pos;
 711	}
 712	return ret;
 713}
 714
 715SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 716{
 717	return ksys_read(fd, buf, count);
 718}
 719
 720ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 721{
 722	CLASS(fd_pos, f)(fd);
 723	ssize_t ret = -EBADF;
 724
 725	if (!fd_empty(f)) {
 726		loff_t pos, *ppos = file_ppos(fd_file(f));
 727		if (ppos) {
 728			pos = *ppos;
 729			ppos = &pos;
 730		}
 731		ret = vfs_write(fd_file(f), buf, count, ppos);
 732		if (ret >= 0 && ppos)
 733			fd_file(f)->f_pos = pos;
 734	}
 735
 736	return ret;
 737}
 738
 739SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 740		size_t, count)
 741{
 742	return ksys_write(fd, buf, count);
 743}
 744
 745ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 746		     loff_t pos)
 747{
 748	if (pos < 0)
 749		return -EINVAL;
 750
 751	CLASS(fd, f)(fd);
 752	if (fd_empty(f))
 753		return -EBADF;
 754
 755	if (fd_file(f)->f_mode & FMODE_PREAD)
 756		return vfs_read(fd_file(f), buf, count, &pos);
 757
 758	return -ESPIPE;
 759}
 760
 761SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 762			size_t, count, loff_t, pos)
 763{
 764	return ksys_pread64(fd, buf, count, pos);
 765}
 766
 767#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
 768COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
 769		       size_t, count, compat_arg_u64_dual(pos))
 770{
 771	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
 772}
 773#endif
 774
 775ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 776		      size_t count, loff_t pos)
 777{
 778	if (pos < 0)
 779		return -EINVAL;
 780
 781	CLASS(fd, f)(fd);
 782	if (fd_empty(f))
 783		return -EBADF;
 784
 785	if (fd_file(f)->f_mode & FMODE_PWRITE)
 786		return vfs_write(fd_file(f), buf, count, &pos);
 787
 788	return -ESPIPE;
 789}
 790
 791SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 792			 size_t, count, loff_t, pos)
 793{
 794	return ksys_pwrite64(fd, buf, count, pos);
 795}
 796
 797#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
 798COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
 799		       size_t, count, compat_arg_u64_dual(pos))
 800{
 801	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
 802}
 803#endif
 804
 805static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 806		loff_t *ppos, int type, rwf_t flags)
 807{
 808	struct kiocb kiocb;
 809	ssize_t ret;
 810
 811	init_sync_kiocb(&kiocb, filp);
 812	ret = kiocb_set_rw_flags(&kiocb, flags, type);
 813	if (ret)
 814		return ret;
 815	kiocb.ki_pos = (ppos ? *ppos : 0);
 816
 817	if (type == READ)
 818		ret = filp->f_op->read_iter(&kiocb, iter);
 819	else
 820		ret = filp->f_op->write_iter(&kiocb, iter);
 821	BUG_ON(ret == -EIOCBQUEUED);
 822	if (ppos)
 823		*ppos = kiocb.ki_pos;
 824	return ret;
 825}
 826
 827/* Do it by hand, with file-ops */
 828static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 829		loff_t *ppos, int type, rwf_t flags)
 830{
 831	ssize_t ret = 0;
 832
 833	if (flags & ~RWF_HIPRI)
 834		return -EOPNOTSUPP;
 835
 836	while (iov_iter_count(iter)) {
 837		ssize_t nr;
 838
 839		if (type == READ) {
 840			nr = filp->f_op->read(filp, iter_iov_addr(iter),
 841						iter_iov_len(iter), ppos);
 842		} else {
 843			nr = filp->f_op->write(filp, iter_iov_addr(iter),
 844						iter_iov_len(iter), ppos);
 845		}
 846
 847		if (nr < 0) {
 848			if (!ret)
 849				ret = nr;
 850			break;
 851		}
 852		ret += nr;
 853		if (nr != iter_iov_len(iter))
 854			break;
 855		iov_iter_advance(iter, nr);
 856	}
 857
 858	return ret;
 859}
 860
 861ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 862			   struct iov_iter *iter)
 863{
 864	size_t tot_len;
 865	ssize_t ret = 0;
 866
 867	if (!file->f_op->read_iter)
 868		return -EINVAL;
 869	if (!(file->f_mode & FMODE_READ))
 870		return -EBADF;
 871	if (!(file->f_mode & FMODE_CAN_READ))
 872		return -EINVAL;
 873
 874	tot_len = iov_iter_count(iter);
 875	if (!tot_len)
 876		goto out;
 877	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
 878	if (ret < 0)
 879		return ret;
 880
 881	ret = file->f_op->read_iter(iocb, iter);
 882out:
 883	if (ret >= 0)
 884		fsnotify_access(file);
 885	return ret;
 886}
 887EXPORT_SYMBOL(vfs_iocb_iter_read);
 888
 889ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 890		      rwf_t flags)
 891{
 892	size_t tot_len;
 893	ssize_t ret = 0;
 894
 895	if (!file->f_op->read_iter)
 896		return -EINVAL;
 897	if (!(file->f_mode & FMODE_READ))
 898		return -EBADF;
 899	if (!(file->f_mode & FMODE_CAN_READ))
 900		return -EINVAL;
 901
 902	tot_len = iov_iter_count(iter);
 903	if (!tot_len)
 904		goto out;
 905	ret = rw_verify_area(READ, file, ppos, tot_len);
 906	if (ret < 0)
 907		return ret;
 908
 909	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
 910out:
 911	if (ret >= 0)
 912		fsnotify_access(file);
 913	return ret;
 914}
 915EXPORT_SYMBOL(vfs_iter_read);
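/*
 * Example (illustrative sketch, not part of read_write.c): in-kernel callers
 * such as the loop driver hand vfs_iter_read() an iov_iter they built
 * themselves; here a single kvec over a kernel buffer.  The helper name is
 * hypothetical.
 */
static ssize_t example_kvec_read(struct file *file, void *buf, size_t len,
				 loff_t *pos)
{
	struct kvec kv = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, len);
	return vfs_iter_read(file, &iter, pos, 0);	/* no RWF_* flags */
}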
 916
 917/*
 918 * Caller is responsible for calling kiocb_end_write() on completion
 919 * if async iocb was queued.
 920 */
 921ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 922			    struct iov_iter *iter)
 923{
 924	size_t tot_len;
 925	ssize_t ret = 0;
 926
 927	if (!file->f_op->write_iter)
 928		return -EINVAL;
 929	if (!(file->f_mode & FMODE_WRITE))
 930		return -EBADF;
 931	if (!(file->f_mode & FMODE_CAN_WRITE))
 932		return -EINVAL;
 933
 934	tot_len = iov_iter_count(iter);
 935	if (!tot_len)
 936		return 0;
 937	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
 938	if (ret < 0)
 939		return ret;
 940
 941	kiocb_start_write(iocb);
 942	ret = file->f_op->write_iter(iocb, iter);
 943	if (ret != -EIOCBQUEUED)
 944		kiocb_end_write(iocb);
 945	if (ret > 0)
 946		fsnotify_modify(file);
 947
 948	return ret;
 949}
 950EXPORT_SYMBOL(vfs_iocb_iter_write);
 951
 952ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 953		       rwf_t flags)
 954{
 955	size_t tot_len;
 956	ssize_t ret;
 957
 958	if (!(file->f_mode & FMODE_WRITE))
 959		return -EBADF;
 960	if (!(file->f_mode & FMODE_CAN_WRITE))
 961		return -EINVAL;
 962	if (!file->f_op->write_iter)
 963		return -EINVAL;
 964
 965	tot_len = iov_iter_count(iter);
 966	if (!tot_len)
 967		return 0;
 968
 969	ret = rw_verify_area(WRITE, file, ppos, tot_len);
 970	if (ret < 0)
 971		return ret;
 972
 973	file_start_write(file);
 974	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
 975	if (ret > 0)
 976		fsnotify_modify(file);
 977	file_end_write(file);
 978
 979	return ret;
 980}
 981EXPORT_SYMBOL(vfs_iter_write);
 982
 983static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 984			 unsigned long vlen, loff_t *pos, rwf_t flags)
 985{
 986	struct iovec iovstack[UIO_FASTIOV];
 987	struct iovec *iov = iovstack;
 988	struct iov_iter iter;
 989	size_t tot_len;
 990	ssize_t ret = 0;
 991
 992	if (!(file->f_mode & FMODE_READ))
 993		return -EBADF;
 994	if (!(file->f_mode & FMODE_CAN_READ))
 995		return -EINVAL;
 996
 997	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
 998			   &iter);
 999	if (ret < 0)
1000		return ret;
1001
1002	tot_len = iov_iter_count(&iter);
1003	if (!tot_len)
1004		goto out;
1005
1006	ret = rw_verify_area(READ, file, pos, tot_len);
1007	if (ret < 0)
1008		goto out;
1009
1010	if (file->f_op->read_iter)
1011		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
1012	else
1013		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
1014out:
1015	if (ret >= 0)
1016		fsnotify_access(file);
1017	kfree(iov);
1018	return ret;
1019}
1020
1021static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1022			  unsigned long vlen, loff_t *pos, rwf_t flags)
1023{
1024	struct iovec iovstack[UIO_FASTIOV];
1025	struct iovec *iov = iovstack;
1026	struct iov_iter iter;
1027	size_t tot_len;
1028	ssize_t ret = 0;
1029
1030	if (!(file->f_mode & FMODE_WRITE))
1031		return -EBADF;
1032	if (!(file->f_mode & FMODE_CAN_WRITE))
1033		return -EINVAL;
1034
1035	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
1036			   &iter);
1037	if (ret < 0)
1038		return ret;
1039
1040	tot_len = iov_iter_count(&iter);
1041	if (!tot_len)
1042		goto out;
1043
1044	ret = rw_verify_area(WRITE, file, pos, tot_len);
1045	if (ret < 0)
1046		goto out;
1047
1048	file_start_write(file);
1049	if (file->f_op->write_iter)
1050		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
1051	else
1052		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
1053	if (ret > 0)
1054		fsnotify_modify(file);
1055	file_end_write(file);
1056out:
1057	kfree(iov);
1058	return ret;
1059}
1060
1061static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1062			unsigned long vlen, rwf_t flags)
1063{
1064	CLASS(fd_pos, f)(fd);
1065	ssize_t ret = -EBADF;
1066
1067	if (!fd_empty(f)) {
1068		loff_t pos, *ppos = file_ppos(fd_file(f));
1069		if (ppos) {
1070			pos = *ppos;
1071			ppos = &pos;
1072		}
1073		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
1074		if (ret >= 0 && ppos)
1075			fd_file(f)->f_pos = pos;
1076	}
1077
1078	if (ret > 0)
1079		add_rchar(current, ret);
1080	inc_syscr(current);
1081	return ret;
1082}
1083
1084static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1085			 unsigned long vlen, rwf_t flags)
1086{
1087	CLASS(fd_pos, f)(fd);
1088	ssize_t ret = -EBADF;
1089
1090	if (!fd_empty(f)) {
1091		loff_t pos, *ppos = file_ppos(fd_file(f));
1092		if (ppos) {
1093			pos = *ppos;
1094			ppos = &pos;
1095		}
1096		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
1097		if (ret >= 0 && ppos)
1098			fd_file(f)->f_pos = pos;
1099	}
1100
1101	if (ret > 0)
1102		add_wchar(current, ret);
1103	inc_syscw(current);
1104	return ret;
1105}
1106
1107static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1108{
1109#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1110	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1111}
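/*
 * Worked example (illustrative, not part of read_write.c): on a 32-bit kernel
 * BITS_PER_LONG is 32, so HALF_LONG_BITS is 16 and the double shift moves
 * @high up by 32 bits without ever shifting a value by its full width:
 *
 *	pos_from_hilo(0x1, 0x2) == 0x100000002
 *
 * On a 64-bit kernel HALF_LONG_BITS is 32, the two shifts push @high out of
 * the 64-bit result entirely, and the full offset arrives in @low, which is
 * already a 64-bit unsigned long there.
 */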
1112
1113static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1114			 unsigned long vlen, loff_t pos, rwf_t flags)
1115{
1116	ssize_t ret = -EBADF;
1117
1118	if (pos < 0)
1119		return -EINVAL;
1120
1121	CLASS(fd, f)(fd);
1122	if (!fd_empty(f)) {
1123		ret = -ESPIPE;
1124		if (fd_file(f)->f_mode & FMODE_PREAD)
1125			ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
1126	}
1127
1128	if (ret > 0)
1129		add_rchar(current, ret);
1130	inc_syscr(current);
1131	return ret;
1132}
1133
1134static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1135			  unsigned long vlen, loff_t pos, rwf_t flags)
1136{
1137	ssize_t ret = -EBADF;
1138
1139	if (pos < 0)
1140		return -EINVAL;
1141
1142	CLASS(fd, f)(fd);
1143	if (!fd_empty(f)) {
1144		ret = -ESPIPE;
1145		if (fd_file(f)->f_mode & FMODE_PWRITE)
1146			ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
1147	}
1148
1149	if (ret > 0)
1150		add_wchar(current, ret);
1151	inc_syscw(current);
1152	return ret;
1153}
1154
1155SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1156		unsigned long, vlen)
1157{
1158	return do_readv(fd, vec, vlen, 0);
1159}
1160
1161SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1162		unsigned long, vlen)
1163{
1164	return do_writev(fd, vec, vlen, 0);
1165}
1166
1167SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1168		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1169{
1170	loff_t pos = pos_from_hilo(pos_h, pos_l);
1171
1172	return do_preadv(fd, vec, vlen, pos, 0);
1173}
1174
1175SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1176		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1177		rwf_t, flags)
1178{
1179	loff_t pos = pos_from_hilo(pos_h, pos_l);
1180
1181	if (pos == -1)
1182		return do_readv(fd, vec, vlen, flags);
1183
1184	return do_preadv(fd, vec, vlen, pos, flags);
1185}
1186
1187SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1188		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1189{
1190	loff_t pos = pos_from_hilo(pos_h, pos_l);
1191
1192	return do_pwritev(fd, vec, vlen, pos, 0);
1193}
1194
1195SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1196		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1197		rwf_t, flags)
1198{
1199	loff_t pos = pos_from_hilo(pos_h, pos_l);
1200
1201	if (pos == -1)
1202		return do_writev(fd, vec, vlen, flags);
1203
1204	return do_pwritev(fd, vec, vlen, pos, flags);
1205}
1206
1207/*
1208 * Various compat syscalls.  Note that they all pretend to take a native
1209 * iovec - import_iovec will properly treat those as compat_iovecs based on
1210 * in_compat_syscall().
1211 */
1212#ifdef CONFIG_COMPAT
1213#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1214COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1215		const struct iovec __user *, vec,
1216		unsigned long, vlen, loff_t, pos)
1217{
1218	return do_preadv(fd, vec, vlen, pos, 0);
1219}
1220#endif
1221
1222COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1223		const struct iovec __user *, vec,
1224		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1225{
1226	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1227
1228	return do_preadv(fd, vec, vlen, pos, 0);
1229}
1230
1231#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1232COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1233		const struct iovec __user *, vec,
1234		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1235{
1236	if (pos == -1)
1237		return do_readv(fd, vec, vlen, flags);
1238	return do_preadv(fd, vec, vlen, pos, flags);
1239}
1240#endif
1241
1242COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1243		const struct iovec __user *, vec,
1244		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1245		rwf_t, flags)
1246{
1247	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1248
1249	if (pos == -1)
1250		return do_readv(fd, vec, vlen, flags);
1251	return do_preadv(fd, vec, vlen, pos, flags);
1252}
1253
1254#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1255COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1256		const struct iovec __user *, vec,
1257		unsigned long, vlen, loff_t, pos)
1258{
1259	return do_pwritev(fd, vec, vlen, pos, 0);
1260}
1261#endif
1262
1263COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1264		const struct iovec __user *,vec,
1265		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1266{
1267	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1268
1269	return do_pwritev(fd, vec, vlen, pos, 0);
1270}
1271
1272#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1273COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1274		const struct iovec __user *, vec,
1275		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1276{
1277	if (pos == -1)
1278		return do_writev(fd, vec, vlen, flags);
1279	return do_pwritev(fd, vec, vlen, pos, flags);
1280}
1281#endif
1282
1283COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1284		const struct iovec __user *,vec,
1285		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1286{
1287	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1288
1289	if (pos == -1)
1290		return do_writev(fd, vec, vlen, flags);
1291	return do_pwritev(fd, vec, vlen, pos, flags);
1292}
1293#endif /* CONFIG_COMPAT */
1294
1295static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1296			   size_t count, loff_t max)
1297{
1298	struct inode *in_inode, *out_inode;
1299	struct pipe_inode_info *opipe;
1300	loff_t pos;
1301	loff_t out_pos;
1302	ssize_t retval;
1303	int fl;
1304
1305	/*
1306	 * Get input file, and verify that it is ok..
1307	 */
1308	CLASS(fd, in)(in_fd);
1309	if (fd_empty(in))
1310		return -EBADF;
1311	if (!(fd_file(in)->f_mode & FMODE_READ))
1312		return -EBADF;
1313	if (!ppos) {
1314		pos = fd_file(in)->f_pos;
1315	} else {
1316		pos = *ppos;
1317		if (!(fd_file(in)->f_mode & FMODE_PREAD))
1318			return -ESPIPE;
1319	}
1320	retval = rw_verify_area(READ, fd_file(in), &pos, count);
1321	if (retval < 0)
1322		return retval;
1323	if (count > MAX_RW_COUNT)
1324		count =  MAX_RW_COUNT;
1325
1326	/*
1327	 * Get output file, and verify that it is ok..
1328	 */
1329	CLASS(fd, out)(out_fd);
1330	if (fd_empty(out))
1331		return -EBADF;
1332	if (!(fd_file(out)->f_mode & FMODE_WRITE))
1333		return -EBADF;
1334	in_inode = file_inode(fd_file(in));
1335	out_inode = file_inode(fd_file(out));
1336	out_pos = fd_file(out)->f_pos;
1337
1338	if (!max)
1339		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1340
1341	if (unlikely(pos + count > max)) {
1342		if (pos >= max)
1343			return -EOVERFLOW;
1344		count = max - pos;
1345	}
1346
1347	fl = 0;
1348#if 0
1349	/*
1350	 * We need to debate whether we can enable this or not. The
1351	 * man page documents EAGAIN return for the output at least,
1352	 * and the application is arguably buggy if it doesn't expect
1353	 * EAGAIN on a non-blocking file descriptor.
1354	 */
1355	if (fd_file(in)->f_flags & O_NONBLOCK)
1356		fl = SPLICE_F_NONBLOCK;
1357#endif
1358	opipe = get_pipe_info(fd_file(out), true);
1359	if (!opipe) {
1360		retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
1361		if (retval < 0)
1362			return retval;
1363		retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
1364					  count, fl);
1365	} else {
1366		if (fd_file(out)->f_flags & O_NONBLOCK)
1367			fl |= SPLICE_F_NONBLOCK;
1368
1369		retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
1370	}
1371
1372	if (retval > 0) {
1373		add_rchar(current, retval);
1374		add_wchar(current, retval);
1375		fsnotify_access(fd_file(in));
1376		fsnotify_modify(fd_file(out));
1377		fd_file(out)->f_pos = out_pos;
1378		if (ppos)
1379			*ppos = pos;
1380		else
1381			fd_file(in)->f_pos = pos;
1382	}
1383
1384	inc_syscr(current);
1385	inc_syscw(current);
1386	if (pos > max)
1387		retval = -EOVERFLOW;
1388	return retval;
1389}
1390
1391SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1392{
1393	loff_t pos;
1394	off_t off;
1395	ssize_t ret;
1396
1397	if (offset) {
1398		if (unlikely(get_user(off, offset)))
1399			return -EFAULT;
1400		pos = off;
1401		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1402		if (unlikely(put_user(pos, offset)))
1403			return -EFAULT;
1404		return ret;
1405	}
1406
1407	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1408}
1409
1410SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1411{
1412	loff_t pos;
1413	ssize_t ret;
1414
1415	if (offset) {
1416		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1417			return -EFAULT;
1418		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1419		if (unlikely(put_user(pos, offset)))
1420			return -EFAULT;
1421		return ret;
1422	}
1423
1424	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1425}
1426
1427#ifdef CONFIG_COMPAT
1428COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1429		compat_off_t __user *, offset, compat_size_t, count)
1430{
1431	loff_t pos;
1432	off_t off;
1433	ssize_t ret;
1434
1435	if (offset) {
1436		if (unlikely(get_user(off, offset)))
1437			return -EFAULT;
1438		pos = off;
1439		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1440		if (unlikely(put_user(pos, offset)))
1441			return -EFAULT;
1442		return ret;
1443	}
1444
1445	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1446}
1447
1448COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1449		compat_loff_t __user *, offset, compat_size_t, count)
1450{
1451	loff_t pos;
1452	ssize_t ret;
1453
1454	if (offset) {
1455		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1456			return -EFAULT;
1457		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1458		if (unlikely(put_user(pos, offset)))
1459			return -EFAULT;
1460		return ret;
1461	}
1462
1463	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1464}
1465#endif
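/*
 * Example (illustrative sketch, not part of read_write.c): the sendfile(2)
 * call serviced by do_sendfile() above, copying one regular file to another.
 * Paths come from the caller and error handling is abbreviated.
 */
#include <fcntl.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <unistd.h>

int example_sendfile_copy(const char *from, const char *to)
{
	int in = open(from, O_RDONLY);
	int out = open(to, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;
	off_t off = 0;

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return -1;

	/* passing &off means the source file's own f_pos is left untouched */
	while (off < st.st_size)
		if (sendfile(out, in, &off, st.st_size - off) <= 0)
			break;

	close(in);
	close(out);
	return off == st.st_size ? 0 : -1;
}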
1466
1467/*
1468 * Performs necessary checks before doing a file copy
1469 *
1470 * Can adjust amount of bytes to copy via @req_count argument.
1471 * Returns appropriate error code that caller should return or
1472 * zero in case the copy should be allowed.
1473 */
1474static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1475				    struct file *file_out, loff_t pos_out,
1476				    size_t *req_count, unsigned int flags)
1477{
1478	struct inode *inode_in = file_inode(file_in);
1479	struct inode *inode_out = file_inode(file_out);
1480	uint64_t count = *req_count;
1481	loff_t size_in;
1482	int ret;
1483
1484	ret = generic_file_rw_checks(file_in, file_out);
1485	if (ret)
1486		return ret;
1487
1488	/*
1489	 * We allow some filesystems to handle cross sb copy, but passing
1490 * a file of the wrong filesystem type to a filesystem driver can result
1491	 * in an attempt to dereference the wrong type of ->private_data, so
1492	 * avoid doing that until we really have a good reason.
1493	 *
1494	 * nfs and cifs define several different file_system_type structures
1495	 * and several different sets of file_operations, but they all end up
1496	 * using the same ->copy_file_range() function pointer.
1497	 */
1498	if (flags & COPY_FILE_SPLICE) {
1499		/* cross sb splice is allowed */
1500	} else if (file_out->f_op->copy_file_range) {
1501		if (file_in->f_op->copy_file_range !=
1502		    file_out->f_op->copy_file_range)
1503			return -EXDEV;
1504	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1505		return -EXDEV;
1506	}
1507
1508	/* Don't touch certain kinds of inodes */
1509	if (IS_IMMUTABLE(inode_out))
1510		return -EPERM;
1511
1512	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1513		return -ETXTBSY;
1514
1515	/* Ensure offsets don't wrap. */
1516	if (pos_in + count < pos_in || pos_out + count < pos_out)
1517		return -EOVERFLOW;
1518
1519	/* Shorten the copy to EOF */
1520	size_in = i_size_read(inode_in);
1521	if (pos_in >= size_in)
1522		count = 0;
1523	else
1524		count = min(count, size_in - (uint64_t)pos_in);
1525
1526	ret = generic_write_check_limits(file_out, pos_out, &count);
1527	if (ret)
1528		return ret;
1529
1530	/* Don't allow overlapped copying within the same file. */
1531	if (inode_in == inode_out &&
1532	    pos_out + count > pos_in &&
1533	    pos_out < pos_in + count)
1534		return -EINVAL;
1535
1536	*req_count = count;
1537	return 0;
1538}
1539
1540/*
1541 * copy_file_range() differs from regular file read and write in that it
1542 * specifically allows returning partial success.  When it does so is up to
1543 * the copy_file_range method.
1544 */
1545ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1546			    struct file *file_out, loff_t pos_out,
1547			    size_t len, unsigned int flags)
1548{
1549	ssize_t ret;
1550	bool splice = flags & COPY_FILE_SPLICE;
1551	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
1552
1553	if (flags & ~COPY_FILE_SPLICE)
1554		return -EINVAL;
1555
1556	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1557				       flags);
1558	if (unlikely(ret))
1559		return ret;
1560
1561	ret = rw_verify_area(READ, file_in, &pos_in, len);
1562	if (unlikely(ret))
1563		return ret;
1564
1565	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1566	if (unlikely(ret))
1567		return ret;
1568
1569	if (len == 0)
1570		return 0;
1571
1572	file_start_write(file_out);
1573
1574	/*
1575	 * Cloning is supported by more file systems, so we implement copy on
1576	 * same sb using clone, but for filesystems where both clone and copy
1577	 * are supported (e.g. nfs,cifs), we only call the copy method.
1578	 */
1579	if (!splice && file_out->f_op->copy_file_range) {
1580		ret = file_out->f_op->copy_file_range(file_in, pos_in,
1581						      file_out, pos_out,
1582						      len, flags);
1583	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
1584		ret = file_in->f_op->remap_file_range(file_in, pos_in,
1585				file_out, pos_out,
1586				min_t(loff_t, MAX_RW_COUNT, len),
1587				REMAP_FILE_CAN_SHORTEN);
1588		/* fallback to splice */
1589		if (ret <= 0)
1590			splice = true;
1591	} else if (samesb) {
1592		/* Fallback to splice for same sb copy for backward compat */
1593		splice = true;
1594	}
1595
1596	file_end_write(file_out);
1597
1598	if (!splice)
1599		goto done;
1600
1601	/*
1602	 * We can get here for same sb copy of filesystems that do not implement
1603	 * ->copy_file_range() in case filesystem does not support clone or in
1604	 * case filesystem supports clone but rejected the clone request (e.g.
1605	 * because it was not block aligned).
1606	 *
1607	 * In both cases, fall back to kernel copy so we are able to maintain a
1608	 * consistent story about which filesystems support copy_file_range()
1609	 * and which filesystems do not, that will allow userspace tools to
1610 * make consistent decisions w.r.t. using copy_file_range().
1611	 *
1612	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
1613	 * for server-side-copy between any two sb.
1614	 *
1615	 * In any case, we call do_splice_direct() and not splice_file_range(),
1616	 * without file_start_write() held, to avoid possible deadlocks related
1617	 * to splicing from input file, while file_start_write() is held on
1618	 * the output file on a different sb.
1619	 */
1620	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1621			       min_t(size_t, len, MAX_RW_COUNT), 0);
1622done:
1623	if (ret > 0) {
1624		fsnotify_access(file_in);
1625		add_rchar(current, ret);
1626		fsnotify_modify(file_out);
1627		add_wchar(current, ret);
1628	}
1629
1630	inc_syscr(current);
1631	inc_syscw(current);
1632
1633	return ret;
1634}
1635EXPORT_SYMBOL(vfs_copy_file_range);
1636
1637SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1638		int, fd_out, loff_t __user *, off_out,
1639		size_t, len, unsigned int, flags)
1640{
1641	loff_t pos_in;
1642	loff_t pos_out;
1643	ssize_t ret = -EBADF;
1644
1645	CLASS(fd, f_in)(fd_in);
1646	if (fd_empty(f_in))
1647		return -EBADF;
1648
1649	CLASS(fd, f_out)(fd_out);
1650	if (fd_empty(f_out))
1651		return -EBADF;
1652
1653	if (off_in) {
1654		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1655			return -EFAULT;
1656	} else {
1657		pos_in = fd_file(f_in)->f_pos;
1658	}
1659
1660	if (off_out) {
1661		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1662			return -EFAULT;
1663	} else {
1664		pos_out = fd_file(f_out)->f_pos;
1665	}
1666
1667	if (flags != 0)
1668		return -EINVAL;
1669
1670	ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
1671				  flags);
1672	if (ret > 0) {
1673		pos_in += ret;
1674		pos_out += ret;
1675
1676		if (off_in) {
1677			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1678				ret = -EFAULT;
1679		} else {
1680			fd_file(f_in)->f_pos = pos_in;
1681		}
1682
1683		if (off_out) {
1684			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1685				ret = -EFAULT;
1686		} else {
1687			fd_file(f_out)->f_pos = pos_out;
1688		}
1689	}
1690	return ret;
1691}
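/*
 * Example (illustrative sketch, not part of read_write.c): driving the
 * copy_file_range(2) syscall defined above from userspace.  NULL offsets let
 * the kernel use and update both file positions; short copies are normal and
 * simply retried.  Requires glibc 2.27+ for the wrapper.
 */
#define _GNU_SOURCE
#include <unistd.h>

int example_copy_range(int fd_in, int fd_out, size_t len)
{
	while (len > 0) {
		ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);

		if (n < 0)
			return -1;
		if (n == 0)		/* hit EOF on the source */
			break;
		len -= n;
	}
	return 0;
}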
1692
1693/*
1694 * Don't operate on ranges the page cache doesn't support, and don't exceed the
1695 * LFS limits.  If pos is under the limit it becomes a short access.  If it
1696 * exceeds the limit we return -EFBIG.
1697 */
1698int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1699{
1700	struct inode *inode = file->f_mapping->host;
1701	loff_t max_size = inode->i_sb->s_maxbytes;
1702	loff_t limit = rlimit(RLIMIT_FSIZE);
1703
1704	if (limit != RLIM_INFINITY) {
1705		if (pos >= limit) {
1706			send_sig(SIGXFSZ, current, 0);
1707			return -EFBIG;
1708		}
1709		*count = min(*count, limit - pos);
1710	}
1711
1712	if (!(file->f_flags & O_LARGEFILE))
1713		max_size = MAX_NON_LFS;
1714
1715	if (unlikely(pos >= max_size))
1716		return -EFBIG;
1717
1718	*count = min(*count, max_size - pos);
1719
1720	return 0;
1721}
1722EXPORT_SYMBOL_GPL(generic_write_check_limits);
1723
1724/* Like generic_write_checks(), but takes size of write instead of iter. */
1725int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
1726{
1727	struct file *file = iocb->ki_filp;
1728	struct inode *inode = file->f_mapping->host;
1729
1730	if (IS_SWAPFILE(inode))
1731		return -ETXTBSY;
1732
1733	if (!*count)
1734		return 0;
1735
1736	if (iocb->ki_flags & IOCB_APPEND)
1737		iocb->ki_pos = i_size_read(inode);
1738
1739	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1740	    !((iocb->ki_flags & IOCB_DIRECT) ||
1741	      (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
1742		return -EINVAL;
1743
1744	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
1745}
1746EXPORT_SYMBOL(generic_write_checks_count);
1747
1748/*
1749 * Performs necessary checks before doing a write
1750 *
1751 * Can adjust writing position or amount of bytes to write.
1752 * Returns appropriate error code that caller should return or
1753 * zero in case that write should be allowed.
1754 */
1755ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1756{
1757	loff_t count = iov_iter_count(from);
1758	int ret;
1759
1760	ret = generic_write_checks_count(iocb, &count);
1761	if (ret)
1762		return ret;
1763
1764	iov_iter_truncate(from, count);
1765	return iov_iter_count(from);
1766}
1767EXPORT_SYMBOL(generic_write_checks);
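/*
 * Example (illustrative sketch, not part of read_write.c): a simple buffered
 * ->write_iter() runs generic_write_checks() under the inode lock and only
 * writes the (possibly truncated) amount it returns.  examplefs_write_iter is
 * hypothetical and omits the timestamp, permission-stripping and syncing
 * steps a real filesystem performs.
 */
static ssize_t examplefs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);	/* 0 means nothing left to write */
	if (ret > 0)
		ret = generic_perform_write(iocb, from);
	inode_unlock(inode);

	return ret;
}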
1768
1769/*
1770 * Performs common checks before doing a file copy/clone
1771 * from @file_in to @file_out.
1772 */
1773int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1774{
1775	struct inode *inode_in = file_inode(file_in);
1776	struct inode *inode_out = file_inode(file_out);
1777
1778	/* Don't copy dirs, pipes, sockets... */
1779	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1780		return -EISDIR;
1781	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1782		return -EINVAL;
1783
1784	if (!(file_in->f_mode & FMODE_READ) ||
1785	    !(file_out->f_mode & FMODE_WRITE) ||
1786	    (file_out->f_flags & O_APPEND))
1787		return -EBADF;
1788
1789	return 0;
1790}
1791
1792int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
1793{
1794	size_t len = iov_iter_count(iter);
1795
1796	if (!iter_is_ubuf(iter))
1797		return -EINVAL;
1798
1799	if (!is_power_of_2(len))
1800		return -EINVAL;
1801
1802	if (!IS_ALIGNED(iocb->ki_pos, len))
1803		return -EINVAL;
1804
1805	if (!(iocb->ki_flags & IOCB_DIRECT))
1806		return -EOPNOTSUPP;
1807
1808	return 0;
1809}
1810EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
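/*
 * Example (illustrative sketch, not part of read_write.c): a userspace write
 * intended to pass the checks in generic_atomic_write_valid() above - O_DIRECT,
 * a single iovec (imported as a ubuf iterator), a power-of-two length and an
 * offset aligned to that length.  RWF_ATOMIC additionally needs a kernel,
 * filesystem and block device that support untorn writes; the path and 4 KiB
 * size are arbitrary, and error handling is abbreviated.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int example_atomic_write(const char *path)
{
	const size_t len = 4096;		/* power of two */
	struct iovec iov;
	void *buf;
	ssize_t n;
	int fd;

	if (posix_memalign(&buf, len, len))
		return -1;
	memset(buf, 0xab, len);

	fd = open(path, O_WRONLY | O_DIRECT);
	if (fd < 0)
		return -1;

	iov.iov_base = buf;
	iov.iov_len = len;
	/* offset 0 is trivially aligned to len */
	n = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);

	close(fd);
	free(buf);
	return n == (ssize_t)len ? 0 : -1;
}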