   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/fs.h>
  17#include <linux/idr.h>
  18#include <linux/iommu.h>
  19#if IS_ENABLED(CONFIG_KVM)
  20#include <linux/kvm_host.h>
  21#endif
  22#include <linux/list.h>
  23#include <linux/miscdevice.h>
  24#include <linux/module.h>
  25#include <linux/mount.h>
  26#include <linux/mutex.h>
  27#include <linux/pci.h>
  28#include <linux/pseudo_fs.h>
  29#include <linux/rwsem.h>
  30#include <linux/sched.h>
  31#include <linux/slab.h>
  32#include <linux/stat.h>
  33#include <linux/string.h>
  34#include <linux/uaccess.h>
  35#include <linux/vfio.h>
  36#include <linux/wait.h>
  37#include <linux/sched/signal.h>
  38#include <linux/pm_runtime.h>
  39#include <linux/interval_tree.h>
  40#include <linux/iova_bitmap.h>
  41#include <linux/iommufd.h>
  42#include "vfio.h"
  43
  44#define DRIVER_VERSION	"0.3"
  45#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  46#define DRIVER_DESC	"VFIO - User Level meta-driver"
  47
  48#define VFIO_MAGIC 0x5646494f /* "VFIO" */
  49
  50static struct vfio {
  51	struct class			*device_class;
  52	struct ida			device_ida;
  53	struct vfsmount			*vfs_mount;
  54	int				fs_count;
  55} vfio;
  56
  57#ifdef CONFIG_VFIO_NOIOMMU
  58bool vfio_noiommu __read_mostly;
  59module_param_named(enable_unsafe_noiommu_mode,
  60		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
  61MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  62#endif
  63
  64static DEFINE_XARRAY(vfio_device_set_xa);
  65
  66int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  67{
  68	unsigned long idx = (unsigned long)set_id;
  69	struct vfio_device_set *new_dev_set;
  70	struct vfio_device_set *dev_set;
  71
  72	if (WARN_ON(!set_id))
  73		return -EINVAL;
  74
  75	/*
  76	 * Atomically acquire a singleton object in the xarray for this set_id
  77	 */
  78	xa_lock(&vfio_device_set_xa);
  79	dev_set = xa_load(&vfio_device_set_xa, idx);
  80	if (dev_set)
  81		goto found_get_ref;
  82	xa_unlock(&vfio_device_set_xa);
  83
  84	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  85	if (!new_dev_set)
  86		return -ENOMEM;
  87	mutex_init(&new_dev_set->lock);
  88	INIT_LIST_HEAD(&new_dev_set->device_list);
  89	new_dev_set->set_id = set_id;
  90
  91	xa_lock(&vfio_device_set_xa);
  92	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  93			       GFP_KERNEL);
  94	if (!dev_set) {
  95		dev_set = new_dev_set;
  96		goto found_get_ref;
  97	}
  98
  99	kfree(new_dev_set);
 100	if (xa_is_err(dev_set)) {
 101		xa_unlock(&vfio_device_set_xa);
 102		return xa_err(dev_set);
 103	}
 104
 105found_get_ref:
 106	dev_set->device_count++;
 107	xa_unlock(&vfio_device_set_xa);
 108	mutex_lock(&dev_set->lock);
 109	device->dev_set = dev_set;
 110	list_add_tail(&device->dev_set_list, &dev_set->device_list);
 111	mutex_unlock(&dev_set->lock);
 112	return 0;
 113}
 114EXPORT_SYMBOL_GPL(vfio_assign_device_set);
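/*
 * Example (a minimal sketch, not taken from any in-tree driver): a driver
 * whose functions reset together can share one vfio_device_set by passing
 * the same @set_id pointer for each of them.  "my_vdev" and
 * "shared_reset_handle" are hypothetical names; any pointer that is stable
 * and identical for all co-resetting devices works as the key.
 *
 *	ret = vfio_assign_device_set(&my_vdev->vdev,
 *				     my_vdev->shared_reset_handle);
 *	if (ret)
 *		return ret;
 *
 * All devices registered with the same key end up on one
 * vfio_device_set::device_list, and dev_set->lock then serializes their
 * open_device()/close_device() paths.
 */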
 115
 116static void vfio_release_device_set(struct vfio_device *device)
 117{
 118	struct vfio_device_set *dev_set = device->dev_set;
 119
 120	if (!dev_set)
 121		return;
 122
 123	mutex_lock(&dev_set->lock);
 124	list_del(&device->dev_set_list);
 125	mutex_unlock(&dev_set->lock);
 126
 127	xa_lock(&vfio_device_set_xa);
 128	if (!--dev_set->device_count) {
 129		__xa_erase(&vfio_device_set_xa,
 130			   (unsigned long)dev_set->set_id);
 131		mutex_destroy(&dev_set->lock);
 132		kfree(dev_set);
 133	}
 134	xa_unlock(&vfio_device_set_xa);
 135}
 136
 137unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
 138{
 139	struct vfio_device *cur;
 140	unsigned int open_count = 0;
 141
 142	lockdep_assert_held(&dev_set->lock);
 143
 144	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 145		open_count += cur->open_count;
 146	return open_count;
 147}
 148EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
 149
 150struct vfio_device *
 151vfio_find_device_in_devset(struct vfio_device_set *dev_set,
 152			   struct device *dev)
 153{
 154	struct vfio_device *cur;
 155
 156	lockdep_assert_held(&dev_set->lock);
 157
 158	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 159		if (cur->dev == dev)
 160			return cur;
 161	return NULL;
 162}
 163EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
 164
 165/*
 166 * Device objects - create, release, get, put, search
 167 */
 168/* Device reference always implies a group reference */
 169void vfio_device_put_registration(struct vfio_device *device)
 170{
 171	if (refcount_dec_and_test(&device->refcount))
 172		complete(&device->comp);
 173}
 174
 175bool vfio_device_try_get_registration(struct vfio_device *device)
 176{
 177	return refcount_inc_not_zero(&device->refcount);
 178}
 179
 180/*
 181 * VFIO driver API
 182 */
 183/* Release helper called by vfio_put_device() */
 184static void vfio_device_release(struct device *dev)
 185{
 186	struct vfio_device *device =
 187			container_of(dev, struct vfio_device, device);
 188
 189	vfio_release_device_set(device);
 190	ida_free(&vfio.device_ida, device->index);
 191
 192	if (device->ops->release)
 193		device->ops->release(device);
 194
 195	iput(device->inode);
 196	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 197	kvfree(device);
 198}
 199
 200static int vfio_init_device(struct vfio_device *device, struct device *dev,
 201			    const struct vfio_device_ops *ops);
 202
 203/*
 204 * Allocate and initialize vfio_device so it can be registered to vfio
 205 * core.
 206 *
 207 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 208 * @size is the size of the structure to be allocated, including any
 209 * private data used by the driver.
 210 *
  211 * Drivers may provide an @init callback to initialize device private data.
  212 *
  213 * Use vfio_put_device() to release the structure after a successful return.
 214 */
 215struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
 216				       const struct vfio_device_ops *ops)
 217{
 218	struct vfio_device *device;
 219	int ret;
 220
 221	if (WARN_ON(size < sizeof(struct vfio_device)))
 222		return ERR_PTR(-EINVAL);
 223
 224	device = kvzalloc(size, GFP_KERNEL);
 225	if (!device)
 226		return ERR_PTR(-ENOMEM);
 227
 228	ret = vfio_init_device(device, dev, ops);
 229	if (ret)
 230		goto out_free;
 231	return device;
 232
 233out_free:
 234	kvfree(device);
 235	return ERR_PTR(ret);
 236}
 237EXPORT_SYMBOL_GPL(_vfio_alloc_device);
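/*
 * Example (sketch): drivers normally allocate through the vfio_alloc_device()
 * wrapper in <linux/vfio.h>, embedding struct vfio_device as the first member
 * of their private structure.  "struct my_vfio_dev" and "my_ops" are
 * hypothetical names used only for illustration.
 *
 *	struct my_vfio_dev {
 *		struct vfio_device vdev;	// embedded core device, first member
 *		void __iomem *regs;		// driver-private state follows
 *	};
 *
 *	static const struct vfio_device_ops my_ops = {
 *		.name = "my-vfio-dev",
 *		.init = my_init,		// fill in driver-private state
 *		.release = my_release,
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_dev, vdev, &pdev->dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 * On error paths, and after the final unregister, the structure is dropped
 * with vfio_put_device(&my->vdev).
 */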
 238
 239static int vfio_fs_init_fs_context(struct fs_context *fc)
 240{
 241	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
 242}
 243
 244static struct file_system_type vfio_fs_type = {
 245	.name = "vfio",
 246	.owner = THIS_MODULE,
 247	.init_fs_context = vfio_fs_init_fs_context,
 248	.kill_sb = kill_anon_super,
 249};
 250
 251static struct inode *vfio_fs_inode_new(void)
 252{
 253	struct inode *inode;
 254	int ret;
 255
 256	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
 257	if (ret)
 258		return ERR_PTR(ret);
 259
 260	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
 261	if (IS_ERR(inode))
 262		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 263
 264	return inode;
 265}
 266
 267/*
 268 * Initialize a vfio_device so it can be registered to vfio core.
 269 */
 270static int vfio_init_device(struct vfio_device *device, struct device *dev,
 271			    const struct vfio_device_ops *ops)
 272{
 273	int ret;
 274
 275	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 276	if (ret < 0) {
  277		dev_dbg(dev, "Failed to allocate device index\n");
 278		return ret;
 279	}
 280
 281	device->index = ret;
 282	init_completion(&device->comp);
 283	device->dev = dev;
 284	device->ops = ops;
 285	device->inode = vfio_fs_inode_new();
 286	if (IS_ERR(device->inode)) {
 287		ret = PTR_ERR(device->inode);
 288		goto out_inode;
 289	}
 290
 291	if (ops->init) {
 292		ret = ops->init(device);
 293		if (ret)
 294			goto out_uninit;
 295	}
 296
 297	device_initialize(&device->device);
 298	device->device.release = vfio_device_release;
 299	device->device.class = vfio.device_class;
 300	device->device.parent = device->dev;
 301	return 0;
 302
 303out_uninit:
 304	iput(device->inode);
 305	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 306out_inode:
 307	vfio_release_device_set(device);
 308	ida_free(&vfio.device_ida, device->index);
 309	return ret;
 310}
 311
 312static int __vfio_register_dev(struct vfio_device *device,
 313			       enum vfio_group_type type)
 314{
 315	int ret;
 316
 317	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
 318		    (!device->ops->bind_iommufd ||
 319		     !device->ops->unbind_iommufd ||
 320		     !device->ops->attach_ioas ||
 321		     !device->ops->detach_ioas)))
 322		return -EINVAL;
 323
 324	/*
 325	 * If the driver doesn't specify a set then the device is added to a
 326	 * singleton set just for itself.
 327	 */
 328	if (!device->dev_set)
 329		vfio_assign_device_set(device, device);
 330
 331	ret = dev_set_name(&device->device, "vfio%d", device->index);
 332	if (ret)
 333		return ret;
 334
 335	ret = vfio_device_set_group(device, type);
 336	if (ret)
 337		return ret;
 338
 339	/*
 340	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
 341	 * restore cache coherency. It has to be checked here because it is only
 342	 * valid for cases where we are using iommu groups.
 343	 */
 344	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
 345	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
 346		ret = -EINVAL;
 347		goto err_out;
 348	}
 349
 350	ret = vfio_device_add(device);
 351	if (ret)
 352		goto err_out;
 353
 354	/* Refcounting can't start until the driver calls register */
 355	refcount_set(&device->refcount, 1);
 356
 357	vfio_device_group_register(device);
 358	vfio_device_debugfs_init(device);
 359
 360	return 0;
 361err_out:
 362	vfio_device_remove_group(device);
 363	return ret;
 364}
 365
 366int vfio_register_group_dev(struct vfio_device *device)
 367{
 368	return __vfio_register_dev(device, VFIO_IOMMU);
 369}
 370EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 371
 372/*
 373 * Register a virtual device without IOMMU backing.  The user of this
 374 * device must not be able to directly trigger unmediated DMA.
 375 */
 376int vfio_register_emulated_iommu_dev(struct vfio_device *device)
 377{
 378	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
 379}
 380EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
 381
 382/*
 383 * Decrement the device reference count and wait for the device to be
 384 * removed.  Open file descriptors for the device... */
 385void vfio_unregister_group_dev(struct vfio_device *device)
 386{
 387	unsigned int i = 0;
 388	bool interrupted = false;
 389	long rc;
 390
 391	/*
  392	 * Prevent the device from being newly opened by userspace via
  393	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
 394	 */
 395	vfio_device_group_unregister(device);
 396
 397	/*
  398	 * Balances vfio_device_add() in the register path and also prevents
  399	 * the device from being newly opened by userspace via the cdev path.
 400	 */
 401	vfio_device_del(device);
 402
 403	vfio_device_put_registration(device);
 404	rc = try_wait_for_completion(&device->comp);
 405	while (rc <= 0) {
 406		if (device->ops->request)
 407			device->ops->request(device, i++);
 408
 409		if (interrupted) {
 410			rc = wait_for_completion_timeout(&device->comp,
 411							 HZ * 10);
 412		} else {
 413			rc = wait_for_completion_interruptible_timeout(
 414				&device->comp, HZ * 10);
 415			if (rc < 0) {
 416				interrupted = true;
 417				dev_warn(device->dev,
 418					 "Device is currently in use, task"
 419					 " \"%s\" (%d) "
 420					 "blocked until device is released",
 421					 current->comm, task_pid_nr(current));
 422			}
 423		}
 424	}
 425
 426	vfio_device_debugfs_exit(device);
 427	/* Balances vfio_device_set_group in register path */
 428	vfio_device_remove_group(device);
 429}
 430EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
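/*
 * Example (a hedged sketch of the typical registration lifecycle; the "my_*"
 * names are hypothetical): a PCI driver pairs vfio_register_group_dev() in
 * probe with vfio_unregister_group_dev() plus vfio_put_device() in remove.
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_vfio_dev *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_vfio_dev, vdev, &pdev->dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *		pci_set_drvdata(pdev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_vfio_dev *my = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&my->vdev);	// may wait for open FDs
 *		vfio_put_device(&my->vdev);		// drop the allocation reference
 *	}
 */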
 431
 432#if IS_ENABLED(CONFIG_KVM)
 433void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
 434{
 435	void (*pfn)(struct kvm *kvm);
 436	bool (*fn)(struct kvm *kvm);
 437	bool ret;
 438
 439	lockdep_assert_held(&device->dev_set->lock);
 440
 441	if (!kvm)
 442		return;
 443
 444	pfn = symbol_get(kvm_put_kvm);
 445	if (WARN_ON(!pfn))
 446		return;
 447
 448	fn = symbol_get(kvm_get_kvm_safe);
 449	if (WARN_ON(!fn)) {
 450		symbol_put(kvm_put_kvm);
 451		return;
 452	}
 453
 454	ret = fn(kvm);
 455	symbol_put(kvm_get_kvm_safe);
 456	if (!ret) {
 457		symbol_put(kvm_put_kvm);
 458		return;
 459	}
 460
 461	device->put_kvm = pfn;
 462	device->kvm = kvm;
 463}
 464
 465void vfio_device_put_kvm(struct vfio_device *device)
 466{
 467	lockdep_assert_held(&device->dev_set->lock);
 468
 469	if (!device->kvm)
 470		return;
 471
 472	if (WARN_ON(!device->put_kvm))
 473		goto clear;
 474
 475	device->put_kvm(device->kvm);
 476	device->put_kvm = NULL;
 477	symbol_put(kvm_put_kvm);
 478
 479clear:
 480	device->kvm = NULL;
 481}
 482#endif
 483
 484/* true if the vfio_device has open_device() called but not close_device() */
 485static bool vfio_assert_device_open(struct vfio_device *device)
 486{
 487	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 488}
 489
 490struct vfio_device_file *
 491vfio_allocate_device_file(struct vfio_device *device)
 492{
 493	struct vfio_device_file *df;
 494
 495	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
 496	if (!df)
 497		return ERR_PTR(-ENOMEM);
 498
 499	df->device = device;
 500	spin_lock_init(&df->kvm_ref_lock);
 501
 502	return df;
 503}
 504
 505static int vfio_df_device_first_open(struct vfio_device_file *df)
 506{
 507	struct vfio_device *device = df->device;
 508	struct iommufd_ctx *iommufd = df->iommufd;
 509	int ret;
 510
 511	lockdep_assert_held(&device->dev_set->lock);
 512
 513	if (!try_module_get(device->dev->driver->owner))
 514		return -ENODEV;
 515
 516	if (iommufd)
 517		ret = vfio_df_iommufd_bind(df);
 518	else
 519		ret = vfio_device_group_use_iommu(device);
 520	if (ret)
 521		goto err_module_put;
 522
 523	if (device->ops->open_device) {
 524		ret = device->ops->open_device(device);
 525		if (ret)
 526			goto err_unuse_iommu;
 527	}
 528	return 0;
 529
 530err_unuse_iommu:
 531	if (iommufd)
 532		vfio_df_iommufd_unbind(df);
 533	else
 534		vfio_device_group_unuse_iommu(device);
 535err_module_put:
 536	module_put(device->dev->driver->owner);
 537	return ret;
 538}
 539
 540static void vfio_df_device_last_close(struct vfio_device_file *df)
 541{
 542	struct vfio_device *device = df->device;
 543	struct iommufd_ctx *iommufd = df->iommufd;
 544
 545	lockdep_assert_held(&device->dev_set->lock);
 546
 547	if (device->ops->close_device)
 548		device->ops->close_device(device);
 549	if (iommufd)
 550		vfio_df_iommufd_unbind(df);
 551	else
 552		vfio_device_group_unuse_iommu(device);
 553	module_put(device->dev->driver->owner);
 554}
 555
 556int vfio_df_open(struct vfio_device_file *df)
 557{
 558	struct vfio_device *device = df->device;
 559	int ret = 0;
 560
 561	lockdep_assert_held(&device->dev_set->lock);
 562
 563	/*
  564	 * Only the group path allows the device to be opened multiple
  565	 * times.  The device cdev path has no secure way to support it.
 566	 */
 567	if (device->open_count != 0 && !df->group)
 568		return -EINVAL;
 569
 570	device->open_count++;
 571	if (device->open_count == 1) {
 572		ret = vfio_df_device_first_open(df);
 573		if (ret)
 574			device->open_count--;
 575	}
 576
 577	return ret;
 578}
 579
 580void vfio_df_close(struct vfio_device_file *df)
 581{
 582	struct vfio_device *device = df->device;
 583
 584	lockdep_assert_held(&device->dev_set->lock);
 585
 586	vfio_assert_device_open(device);
 587	if (device->open_count == 1)
 588		vfio_df_device_last_close(df);
 589	device->open_count--;
 590}
 591
 592/*
 593 * Wrapper around pm_runtime_resume_and_get().
 594 * Return error code on failure or 0 on success.
 595 */
 596static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
 597{
 598	struct device *dev = device->dev;
 599
 600	if (dev->driver && dev->driver->pm) {
 601		int ret;
 602
 603		ret = pm_runtime_resume_and_get(dev);
 604		if (ret) {
 605			dev_info_ratelimited(dev,
 606				"vfio: runtime resume failed %d\n", ret);
 607			return -EIO;
 608		}
 609	}
 610
 611	return 0;
 612}
 613
 614/*
 615 * Wrapper around pm_runtime_put().
 616 */
 617static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 618{
 619	struct device *dev = device->dev;
 620
 621	if (dev->driver && dev->driver->pm)
 622		pm_runtime_put(dev);
 623}
 624
 625/*
 626 * VFIO Device fd
 627 */
 628static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 629{
 630	struct vfio_device_file *df = filep->private_data;
 631	struct vfio_device *device = df->device;
 632
 633	if (df->group)
 634		vfio_df_group_close(df);
 635	else
 636		vfio_df_unbind_iommufd(df);
 637
 638	vfio_device_put_registration(device);
 639
 640	kfree(df);
 641
 642	return 0;
 643}
 644
 645/*
 646 * vfio_mig_get_next_state - Compute the next step in the FSM
 647 * @cur_fsm - The current state the device is in
 648 * @new_fsm - The target state to reach
 649 * @next_fsm - Pointer to the next step to get to new_fsm
 650 *
 651 * Return 0 upon success, otherwise -errno
 652 * Upon success the next step in the state progression between cur_fsm and
 653 * new_fsm will be set in next_fsm.
 654 *
 655 * This breaks down requests for combination transitions into smaller steps and
 656 * returns the next step to get to new_fsm. The function may need to be called
 657 * multiple times before reaching new_fsm.
 658 *
 659 */
 660int vfio_mig_get_next_state(struct vfio_device *device,
 661			    enum vfio_device_mig_state cur_fsm,
 662			    enum vfio_device_mig_state new_fsm,
 663			    enum vfio_device_mig_state *next_fsm)
 664{
 665	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
 666	/*
 667	 * The coding in this table requires the driver to implement the
 668	 * following FSM arcs:
 669	 *         RESUMING -> STOP
 670	 *         STOP -> RESUMING
 671	 *         STOP -> STOP_COPY
 672	 *         STOP_COPY -> STOP
 673	 *
 674	 * If P2P is supported then the driver must also implement these FSM
 675	 * arcs:
 676	 *         RUNNING -> RUNNING_P2P
 677	 *         RUNNING_P2P -> RUNNING
 678	 *         RUNNING_P2P -> STOP
 679	 *         STOP -> RUNNING_P2P
 680	 *
 681	 * If precopy is supported then the driver must support these additional
 682	 * FSM arcs:
 683	 *         RUNNING -> PRE_COPY
 684	 *         PRE_COPY -> RUNNING
 685	 *         PRE_COPY -> STOP_COPY
 686	 * However, if precopy and P2P are supported together then the driver
 687	 * must support these additional arcs beyond the P2P arcs above:
 688	 *         PRE_COPY -> RUNNING
 689	 *         PRE_COPY -> PRE_COPY_P2P
 690	 *         PRE_COPY_P2P -> PRE_COPY
 691	 *         PRE_COPY_P2P -> RUNNING_P2P
 692	 *         PRE_COPY_P2P -> STOP_COPY
 693	 *         RUNNING -> PRE_COPY
 694	 *         RUNNING_P2P -> PRE_COPY_P2P
 695	 *
 696	 * Without P2P and precopy the driver must implement:
 697	 *         RUNNING -> STOP
 698	 *         STOP -> RUNNING
 699	 *
 700	 * The coding will step through multiple states for some combination
 701	 * transitions; if all optional features are supported, this means the
 702	 * following ones:
 703	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
 704	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
 705	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
 706	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 707	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
 708	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
 709	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
 710	 *         RESUMING -> STOP -> RUNNING_P2P
 711	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
 712	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
 713	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 714	 *         RESUMING -> STOP -> STOP_COPY
 715	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
 716	 *         RUNNING -> RUNNING_P2P -> STOP
 717	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 718	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
 719	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
 720	 *         RUNNING_P2P -> STOP -> RESUMING
 721	 *         RUNNING_P2P -> STOP -> STOP_COPY
 722	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
 723	 *         STOP -> RUNNING_P2P -> RUNNING
 724	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 725	 *         STOP_COPY -> STOP -> RESUMING
 726	 *         STOP_COPY -> STOP -> RUNNING_P2P
 727	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
 728	 *
 729	 *  The following transitions are blocked:
 730	 *         STOP_COPY -> PRE_COPY
 731	 *         STOP_COPY -> PRE_COPY_P2P
 732	 */
 733	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
 734		[VFIO_DEVICE_STATE_STOP] = {
 735			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 736			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 737			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 738			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 739			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 740			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 741			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 742			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 743		},
 744		[VFIO_DEVICE_STATE_RUNNING] = {
 745			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 746			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 747			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 748			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 749			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 750			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 751			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 752			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 753		},
 754		[VFIO_DEVICE_STATE_PRE_COPY] = {
 755			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
 756			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 757			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 758			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 759			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 760			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
 761			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
 762			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 763		},
 764		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
 765			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 766			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 767			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 768			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 769			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 770			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 771			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 772			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 773		},
 774		[VFIO_DEVICE_STATE_STOP_COPY] = {
 775			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 776			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 777			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 778			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 779			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 780			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 781			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 782			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 783		},
 784		[VFIO_DEVICE_STATE_RESUMING] = {
 785			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 786			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 787			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
 788			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
 789			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 790			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 791			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 792			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 793		},
 794		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
 795			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 796			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 797			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
 798			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 799			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 800			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 801			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 802			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 803		},
 804		[VFIO_DEVICE_STATE_ERROR] = {
 805			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
 806			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
 807			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 808			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 809			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
 810			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
 811			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
 812			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 813		},
 814	};
 815
 816	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
 817		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
 818		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
 819		[VFIO_DEVICE_STATE_PRE_COPY] =
 820			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
 821		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
 822						   VFIO_MIGRATION_P2P |
 823						   VFIO_MIGRATION_PRE_COPY,
 824		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
 825		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
 826		[VFIO_DEVICE_STATE_RUNNING_P2P] =
 827			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
 828		[VFIO_DEVICE_STATE_ERROR] = ~0U,
 829	};
 830
 831	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 832		    (state_flags_table[cur_fsm] & device->migration_flags) !=
 833			state_flags_table[cur_fsm]))
 834		return -EINVAL;
 835
 836	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 837	   (state_flags_table[new_fsm] & device->migration_flags) !=
 838			state_flags_table[new_fsm])
 839		return -EINVAL;
 840
 841	/*
 842	 * Arcs touching optional and unsupported states are skipped over. The
 843	 * driver will instead see an arc from the original state to the next
 844	 * logical state, as per the above comment.
 845	 */
 846	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
 847	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
 848			state_flags_table[*next_fsm])
 849		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
 850
 851	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
 852}
 853EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
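/*
 * Example (sketch of the stepping loop a migration driver typically
 * implements; the "my_*" helpers are hypothetical): migration_set_state()
 * repeatedly asks vfio_mig_get_next_state() for the next supported arc and
 * applies one arc at a time until the requested state is reached.
 *
 *	static struct file *
 *	my_set_device_state(struct vfio_device *vdev,
 *			    enum vfio_device_mig_state new_state)
 *	{
 *		enum vfio_device_mig_state next_state;
 *		struct file *res = NULL;
 *		int ret;
 *
 *		while (my_cur_state(vdev) != new_state) {
 *			ret = vfio_mig_get_next_state(vdev, my_cur_state(vdev),
 *						      new_state, &next_state);
 *			if (ret)
 *				return ERR_PTR(ret);
 *
 *			res = my_step_one_arc(vdev, next_state);  // one FSM arc
 *			if (IS_ERR(res))
 *				return res;
 *		}
 *		return res;	// a data FD for states that need one, else NULL
 *	}
 */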
 854
 855/*
  856 * Convert the driver's struct file into an FD number and return it to userspace
 857 */
 858static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
 859				   struct vfio_device_feature_mig_state *mig)
 860{
 861	int ret;
 862	int fd;
 863
 864	fd = get_unused_fd_flags(O_CLOEXEC);
 865	if (fd < 0) {
 866		ret = fd;
 867		goto out_fput;
 868	}
 869
 870	mig->data_fd = fd;
 871	if (copy_to_user(arg, mig, sizeof(*mig))) {
 872		ret = -EFAULT;
 873		goto out_put_unused;
 874	}
 875	fd_install(fd, filp);
 876	return 0;
 877
 878out_put_unused:
 879	put_unused_fd(fd);
 880out_fput:
 881	fput(filp);
 882	return ret;
 883}
 884
 885static int
 886vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 887					   u32 flags, void __user *arg,
 888					   size_t argsz)
 889{
 890	size_t minsz =
 891		offsetofend(struct vfio_device_feature_mig_state, data_fd);
 892	struct vfio_device_feature_mig_state mig;
 893	struct file *filp = NULL;
 894	int ret;
 895
 896	if (!device->mig_ops)
 897		return -ENOTTY;
 898
 899	ret = vfio_check_feature(flags, argsz,
 900				 VFIO_DEVICE_FEATURE_SET |
 901				 VFIO_DEVICE_FEATURE_GET,
 902				 sizeof(mig));
 903	if (ret != 1)
 904		return ret;
 905
 906	if (copy_from_user(&mig, arg, minsz))
 907		return -EFAULT;
 908
 909	if (flags & VFIO_DEVICE_FEATURE_GET) {
 910		enum vfio_device_mig_state curr_state;
 911
 912		ret = device->mig_ops->migration_get_state(device,
 913							   &curr_state);
 914		if (ret)
 915			return ret;
 916		mig.device_state = curr_state;
 917		goto out_copy;
 918	}
 919
 920	/* Handle the VFIO_DEVICE_FEATURE_SET */
 921	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 922	if (IS_ERR(filp) || !filp)
 923		goto out_copy;
 924
 925	return vfio_ioct_mig_return_fd(filp, arg, &mig);
 926out_copy:
 927	mig.data_fd = -1;
 928	if (copy_to_user(arg, &mig, sizeof(mig)))
 929		return -EFAULT;
 930	if (IS_ERR(filp))
 931		return PTR_ERR(filp);
 932	return 0;
 933}
 934
 935static int
 936vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
 937					      u32 flags, void __user *arg,
 938					      size_t argsz)
 939{
 940	struct vfio_device_feature_mig_data_size data_size = {};
 941	unsigned long stop_copy_length;
 942	int ret;
 943
 944	if (!device->mig_ops)
 945		return -ENOTTY;
 946
 947	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 948				 sizeof(data_size));
 949	if (ret != 1)
 950		return ret;
 951
 952	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
 953	if (ret)
 954		return ret;
 955
 956	data_size.stop_copy_length = stop_copy_length;
 957	if (copy_to_user(arg, &data_size, sizeof(data_size)))
 958		return -EFAULT;
 959
 960	return 0;
 961}
 962
 963static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 964					       u32 flags, void __user *arg,
 965					       size_t argsz)
 966{
 967	struct vfio_device_feature_migration mig = {
 968		.flags = device->migration_flags,
 969	};
 970	int ret;
 971
 972	if (!device->mig_ops)
 973		return -ENOTTY;
 974
 975	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 976				 sizeof(mig));
 977	if (ret != 1)
 978		return ret;
 979	if (copy_to_user(arg, &mig, sizeof(mig)))
 980		return -EFAULT;
 981	return 0;
 982}
 983
 984void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 985			      u32 req_nodes)
 986{
 987	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
 988	unsigned long min_gap, curr_gap;
 989
 990	/* Special shortcut when a single range is required */
 991	if (req_nodes == 1) {
 992		unsigned long last;
 993
 994		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 995
 996		/* Empty list */
 997		if (WARN_ON_ONCE(!comb_start))
 998			return;
 999
1000		curr = comb_start;
1001		while (curr) {
1002			last = curr->last;
1003			prev = curr;
1004			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1005			if (prev != comb_start)
1006				interval_tree_remove(prev, root);
1007		}
1008		comb_start->last = last;
1009		return;
1010	}
1011
1012	/* Combine ranges which have the smallest gap */
1013	while (cur_nodes > req_nodes) {
1014		prev = NULL;
1015		min_gap = ULONG_MAX;
1016		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1017		while (curr) {
1018			if (prev) {
1019				curr_gap = curr->start - prev->last;
1020				if (curr_gap < min_gap) {
1021					min_gap = curr_gap;
1022					comb_start = prev;
1023					comb_end = curr;
1024				}
1025			}
1026			prev = curr;
1027			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1028		}
1029
1030		/* Empty list or no nodes to combine */
1031		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1032			break;
1033
1034		comb_start->last = comb_end->last;
1035		interval_tree_remove(comb_end, root);
1036		cur_nodes--;
1037	}
1038}
1039EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
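/*
 * Example (sketch; "MY_HW_MAX_RANGES" and "my_*" are hypothetical): a driver
 * whose dirty-tracking hardware supports only a limited number of ranges can
 * call this from its log_start op to merge the interval tree down to what it
 * can program, at the cost of also tracking the gaps in between.
 *
 *	static int my_log_start(struct vfio_device *vdev,
 *				struct rb_root_cached *ranges, u32 nnodes,
 *				u64 *page_size)
 *	{
 *		if (nnodes > MY_HW_MAX_RANGES)
 *			vfio_combine_iova_ranges(ranges, nnodes,
 *						 MY_HW_MAX_RANGES);
 *
 *		// program the tracker from the nodes remaining in @ranges
 *		return my_hw_start_tracking(vdev, ranges, page_size);
 *	}
 */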
1040
1041/* Ranges should fit into a single kernel page */
1042#define LOG_MAX_RANGES \
1043	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1044
1045static int
1046vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1047					u32 flags, void __user *arg,
1048					size_t argsz)
1049{
1050	size_t minsz =
1051		offsetofend(struct vfio_device_feature_dma_logging_control,
1052			    ranges);
1053	struct vfio_device_feature_dma_logging_range __user *ranges;
1054	struct vfio_device_feature_dma_logging_control control;
1055	struct vfio_device_feature_dma_logging_range range;
1056	struct rb_root_cached root = RB_ROOT_CACHED;
1057	struct interval_tree_node *nodes;
1058	u64 iova_end;
1059	u32 nnodes;
1060	int i, ret;
1061
1062	if (!device->log_ops)
1063		return -ENOTTY;
1064
1065	ret = vfio_check_feature(flags, argsz,
1066				 VFIO_DEVICE_FEATURE_SET,
1067				 sizeof(control));
1068	if (ret != 1)
1069		return ret;
1070
1071	if (copy_from_user(&control, arg, minsz))
1072		return -EFAULT;
1073
1074	nnodes = control.num_ranges;
1075	if (!nnodes)
1076		return -EINVAL;
1077
1078	if (nnodes > LOG_MAX_RANGES)
1079		return -E2BIG;
1080
1081	ranges = u64_to_user_ptr(control.ranges);
1082	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1083			      GFP_KERNEL);
1084	if (!nodes)
1085		return -ENOMEM;
1086
1087	for (i = 0; i < nnodes; i++) {
1088		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1089			ret = -EFAULT;
1090			goto end;
1091		}
1092		if (!IS_ALIGNED(range.iova, control.page_size) ||
1093		    !IS_ALIGNED(range.length, control.page_size)) {
1094			ret = -EINVAL;
1095			goto end;
1096		}
1097
1098		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1099		    iova_end > ULONG_MAX) {
1100			ret = -EOVERFLOW;
1101			goto end;
1102		}
1103
1104		nodes[i].start = range.iova;
1105		nodes[i].last = range.iova + range.length - 1;
1106		if (interval_tree_iter_first(&root, nodes[i].start,
1107					     nodes[i].last)) {
1108			/* Range overlapping */
1109			ret = -EINVAL;
1110			goto end;
1111		}
1112		interval_tree_insert(nodes + i, &root);
1113	}
1114
1115	ret = device->log_ops->log_start(device, &root, nnodes,
1116					 &control.page_size);
1117	if (ret)
1118		goto end;
1119
1120	if (copy_to_user(arg, &control, sizeof(control))) {
1121		ret = -EFAULT;
1122		device->log_ops->log_stop(device);
1123	}
1124
1125end:
1126	kfree(nodes);
1127	return ret;
1128}
1129
1130static int
1131vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1132				       u32 flags, void __user *arg,
1133				       size_t argsz)
1134{
1135	int ret;
1136
1137	if (!device->log_ops)
1138		return -ENOTTY;
1139
1140	ret = vfio_check_feature(flags, argsz,
1141				 VFIO_DEVICE_FEATURE_SET, 0);
1142	if (ret != 1)
1143		return ret;
1144
1145	return device->log_ops->log_stop(device);
1146}
1147
1148static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1149					  unsigned long iova, size_t length,
1150					  void *opaque)
1151{
1152	struct vfio_device *device = opaque;
1153
1154	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1155}
1156
1157static int
1158vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1159					 u32 flags, void __user *arg,
1160					 size_t argsz)
1161{
1162	size_t minsz =
1163		offsetofend(struct vfio_device_feature_dma_logging_report,
1164			    bitmap);
1165	struct vfio_device_feature_dma_logging_report report;
1166	struct iova_bitmap *iter;
1167	u64 iova_end;
1168	int ret;
1169
1170	if (!device->log_ops)
1171		return -ENOTTY;
1172
1173	ret = vfio_check_feature(flags, argsz,
1174				 VFIO_DEVICE_FEATURE_GET,
1175				 sizeof(report));
1176	if (ret != 1)
1177		return ret;
1178
1179	if (copy_from_user(&report, arg, minsz))
1180		return -EFAULT;
1181
1182	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1183		return -EINVAL;
1184
1185	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1186	    iova_end > ULONG_MAX)
1187		return -EOVERFLOW;
1188
1189	iter = iova_bitmap_alloc(report.iova, report.length,
1190				 report.page_size,
1191				 u64_to_user_ptr(report.bitmap));
1192	if (IS_ERR(iter))
1193		return PTR_ERR(iter);
1194
1195	ret = iova_bitmap_for_each(iter, device,
1196				   vfio_device_log_read_and_clear);
1197
1198	iova_bitmap_free(iter);
1199	return ret;
1200}
1201
1202static int vfio_ioctl_device_feature(struct vfio_device *device,
1203				     struct vfio_device_feature __user *arg)
1204{
1205	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1206	struct vfio_device_feature feature;
1207
1208	if (copy_from_user(&feature, arg, minsz))
1209		return -EFAULT;
1210
1211	if (feature.argsz < minsz)
1212		return -EINVAL;
1213
1214	/* Check unknown flags */
1215	if (feature.flags &
1216	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1217	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1218		return -EINVAL;
1219
1220	/* GET & SET are mutually exclusive except with PROBE */
1221	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1222	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1223	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1224		return -EINVAL;
1225
1226	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1227	case VFIO_DEVICE_FEATURE_MIGRATION:
1228		return vfio_ioctl_device_feature_migration(
1229			device, feature.flags, arg->data,
1230			feature.argsz - minsz);
1231	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1232		return vfio_ioctl_device_feature_mig_device_state(
1233			device, feature.flags, arg->data,
1234			feature.argsz - minsz);
1235	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1236		return vfio_ioctl_device_feature_logging_start(
1237			device, feature.flags, arg->data,
1238			feature.argsz - minsz);
1239	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1240		return vfio_ioctl_device_feature_logging_stop(
1241			device, feature.flags, arg->data,
1242			feature.argsz - minsz);
1243	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1244		return vfio_ioctl_device_feature_logging_report(
1245			device, feature.flags, arg->data,
1246			feature.argsz - minsz);
1247	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1248		return vfio_ioctl_device_feature_migration_data_size(
1249			device, feature.flags, arg->data,
1250			feature.argsz - minsz);
1251	default:
1252		if (unlikely(!device->ops->device_feature))
1253			return -EINVAL;
1254		return device->ops->device_feature(device, feature.flags,
1255						   arg->data,
1256						   feature.argsz - minsz);
1257	}
1258}
1259
1260static long vfio_device_fops_unl_ioctl(struct file *filep,
1261				       unsigned int cmd, unsigned long arg)
1262{
1263	struct vfio_device_file *df = filep->private_data;
1264	struct vfio_device *device = df->device;
1265	void __user *uptr = (void __user *)arg;
1266	int ret;
1267
1268	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1269		return vfio_df_ioctl_bind_iommufd(df, uptr);
1270
1271	/* Paired with smp_store_release() following vfio_df_open() */
1272	if (!smp_load_acquire(&df->access_granted))
1273		return -EINVAL;
1274
1275	ret = vfio_device_pm_runtime_get(device);
1276	if (ret)
1277		return ret;
1278
1279	/* cdev only ioctls */
1280	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1281		switch (cmd) {
1282		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1283			ret = vfio_df_ioctl_attach_pt(df, uptr);
1284			goto out;
1285
1286		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1287			ret = vfio_df_ioctl_detach_pt(df, uptr);
1288			goto out;
1289		}
1290	}
1291
1292	switch (cmd) {
1293	case VFIO_DEVICE_FEATURE:
1294		ret = vfio_ioctl_device_feature(device, uptr);
1295		break;
1296
1297	default:
1298		if (unlikely(!device->ops->ioctl))
1299			ret = -EINVAL;
1300		else
1301			ret = device->ops->ioctl(device, cmd, arg);
1302		break;
1303	}
1304out:
1305	vfio_device_pm_runtime_put(device);
1306	return ret;
1307}
1308
1309static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1310				     size_t count, loff_t *ppos)
1311{
1312	struct vfio_device_file *df = filep->private_data;
1313	struct vfio_device *device = df->device;
1314
1315	/* Paired with smp_store_release() following vfio_df_open() */
1316	if (!smp_load_acquire(&df->access_granted))
1317		return -EINVAL;
1318
1319	if (unlikely(!device->ops->read))
1320		return -EINVAL;
1321
1322	return device->ops->read(device, buf, count, ppos);
1323}
1324
1325static ssize_t vfio_device_fops_write(struct file *filep,
1326				      const char __user *buf,
1327				      size_t count, loff_t *ppos)
1328{
1329	struct vfio_device_file *df = filep->private_data;
1330	struct vfio_device *device = df->device;
1331
1332	/* Paired with smp_store_release() following vfio_df_open() */
1333	if (!smp_load_acquire(&df->access_granted))
1334		return -EINVAL;
1335
1336	if (unlikely(!device->ops->write))
1337		return -EINVAL;
1338
1339	return device->ops->write(device, buf, count, ppos);
1340}
1341
1342static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1343{
1344	struct vfio_device_file *df = filep->private_data;
1345	struct vfio_device *device = df->device;
1346
1347	/* Paired with smp_store_release() following vfio_df_open() */
1348	if (!smp_load_acquire(&df->access_granted))
1349		return -EINVAL;
1350
1351	if (unlikely(!device->ops->mmap))
1352		return -EINVAL;
1353
1354	return device->ops->mmap(device, vma);
1355}
1356
1357const struct file_operations vfio_device_fops = {
1358	.owner		= THIS_MODULE,
1359	.open		= vfio_device_fops_cdev_open,
1360	.release	= vfio_device_fops_release,
1361	.read		= vfio_device_fops_read,
1362	.write		= vfio_device_fops_write,
1363	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1364	.compat_ioctl	= compat_ptr_ioctl,
1365	.mmap		= vfio_device_fops_mmap,
1366};
1367
1368static struct vfio_device *vfio_device_from_file(struct file *file)
1369{
1370	struct vfio_device_file *df = file->private_data;
1371
1372	if (file->f_op != &vfio_device_fops)
1373		return NULL;
1374	return df->device;
1375}
1376
1377/**
1378 * vfio_file_is_valid - True if the file is valid vfio file
1379 * @file: VFIO group file or VFIO device file
1380 */
1381bool vfio_file_is_valid(struct file *file)
1382{
1383	return vfio_group_from_file(file) ||
1384	       vfio_device_from_file(file);
1385}
1386EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1387
1388/**
1389 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1390 *        is always CPU cache coherent
1391 * @file: VFIO group file or VFIO device file
1392 *
1393 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1394 * bit in DMA transactions. A return of false indicates that the user has
1395 * rights to access additional instructions such as wbinvd on x86.
1396 */
1397bool vfio_file_enforced_coherent(struct file *file)
1398{
1399	struct vfio_device *device;
1400	struct vfio_group *group;
1401
1402	group = vfio_group_from_file(file);
1403	if (group)
1404		return vfio_group_enforced_coherent(group);
1405
1406	device = vfio_device_from_file(file);
1407	if (device)
1408		return device_iommu_capable(device->dev,
1409					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1410
1411	return true;
1412}
1413EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1414
1415static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1416{
1417	struct vfio_device_file *df = file->private_data;
1418
1419	/*
1420	 * The kvm is first recorded in the vfio_device_file, and will
1421	 * be propagated to vfio_device::kvm when the file is bound to
1422	 * iommufd successfully in the vfio device cdev path.
1423	 */
1424	spin_lock(&df->kvm_ref_lock);
1425	df->kvm = kvm;
1426	spin_unlock(&df->kvm_ref_lock);
1427}
1428
1429/**
1430 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1431 * @file: VFIO group file or VFIO device file
1432 * @kvm: KVM to link
1433 *
1434 * When a VFIO device is first opened the KVM will be available in
1435 * device->kvm if one was associated with the file.
1436 */
1437void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1438{
1439	struct vfio_group *group;
1440
1441	group = vfio_group_from_file(file);
1442	if (group)
1443		vfio_group_set_kvm(group, kvm);
1444
1445	if (vfio_device_from_file(file))
1446		vfio_device_file_set_kvm(file, kvm);
1447}
1448EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1449
1450/*
1451 * Sub-module support
1452 */
1453/*
1454 * Helper for managing a buffer of info chain capabilities, allocate or
1455 * reallocate a buffer with additional @size, filling in @id and @version
1456 * of the capability.  A pointer to the new capability is returned.
1457 *
1458 * NB. The chain is based at the head of the buffer, so new entries are
1459 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1460 * next offsets prior to copying to the user buffer.
1461 */
1462struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1463					       size_t size, u16 id, u16 version)
1464{
1465	void *buf;
1466	struct vfio_info_cap_header *header, *tmp;
1467
1468	/* Ensure that the next capability struct will be aligned */
1469	size = ALIGN(size, sizeof(u64));
1470
1471	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1472	if (!buf) {
1473		kfree(caps->buf);
1474		caps->buf = NULL;
1475		caps->size = 0;
1476		return ERR_PTR(-ENOMEM);
1477	}
1478
1479	caps->buf = buf;
1480	header = buf + caps->size;
1481
1482	/* Eventually copied to user buffer, zero */
1483	memset(header, 0, size);
1484
1485	header->id = id;
1486	header->version = version;
1487
1488	/* Add to the end of the capability chain */
1489	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1490		; /* nothing */
1491
1492	tmp->next = caps->size;
1493	caps->size += size;
1494
1495	return header;
1496}
1497EXPORT_SYMBOL_GPL(vfio_info_cap_add);
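/*
 * Example (a sketch loosely modeled on region-info callers; "MY_CAP_ID" and
 * "struct my_cap" are hypothetical, and error handling is trimmed): build the
 * chain with vfio_info_cap_add(), shift the next offsets to be relative to
 * the user buffer, copy it out, then free the buffer.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(struct my_cap), MY_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	// fill in the capability payload following *header here
 *
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));	// fix chain offsets
 *		if (copy_to_user(uarg + sizeof(info), caps.buf, caps.size))
 *			ret = -EFAULT;
 *	}
 *	kfree(caps.buf);
 */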
1498
1499void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1500{
1501	struct vfio_info_cap_header *tmp;
1502	void *buf = (void *)caps->buf;
1503
1504	/* Capability structs should start with proper alignment */
1505	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1506
1507	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1508		tmp->next += offset;
1509}
1510EXPORT_SYMBOL(vfio_info_cap_shift);
1511
1512int vfio_info_add_capability(struct vfio_info_cap *caps,
1513			     struct vfio_info_cap_header *cap, size_t size)
1514{
1515	struct vfio_info_cap_header *header;
1516
1517	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1518	if (IS_ERR(header))
1519		return PTR_ERR(header);
1520
1521	memcpy(header + 1, cap + 1, size - sizeof(*header));
1522
1523	return 0;
1524}
1525EXPORT_SYMBOL(vfio_info_add_capability);
1526
1527int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1528				       int max_irq_type, size_t *data_size)
1529{
1530	unsigned long minsz;
1531	size_t size;
1532
1533	minsz = offsetofend(struct vfio_irq_set, count);
1534
1535	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1536	    (hdr->count >= (U32_MAX - hdr->start)) ||
1537	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1538				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1539		return -EINVAL;
1540
1541	if (data_size)
1542		*data_size = 0;
1543
1544	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1545		return -EINVAL;
1546
1547	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1548	case VFIO_IRQ_SET_DATA_NONE:
1549		size = 0;
1550		break;
1551	case VFIO_IRQ_SET_DATA_BOOL:
1552		size = sizeof(uint8_t);
1553		break;
1554	case VFIO_IRQ_SET_DATA_EVENTFD:
1555		size = sizeof(int32_t);
1556		break;
1557	default:
1558		return -EINVAL;
1559	}
1560
1561	if (size) {
1562		if (hdr->argsz - minsz < hdr->count * size)
1563			return -EINVAL;
1564
1565		if (!data_size)
1566			return -EINVAL;
1567
1568		*data_size = hdr->count * size;
1569	}
1570
1571	return 0;
1572}
1573EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
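/*
 * Example (sketch of a driver's VFIO_DEVICE_SET_IRQS handler; "my_*" and
 * "MY_NUM_IRQ_INDEXES" are hypothetical): validate the header first, then
 * pull in exactly *data_size bytes of payload that follow it.
 *
 *	struct vfio_irq_set hdr;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, uarg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr,
 *						 my_irq_count(vdev, hdr.index),
 *						 MY_NUM_IRQ_INDEXES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user(uarg + minsz, data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *	// apply hdr.index/start/count/flags using "data", then kfree(data)
 */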
1574
1575/*
1576 * Pin contiguous user pages and return their associated host pages for local
1577 * domain only.
1578 * @device [in]  : device
1579 * @iova [in]    : starting IOVA of user pages to be pinned.
1580 * @npage [in]   : count of pages to be pinned.  This count should not
1581 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1582 * @prot [in]    : protection flags
1583 * @pages[out]   : array of host pages
1584 * Return error or number of pages pinned.
1585 *
1586 * A driver may only call this function if the vfio_device was created
1587 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1588 */
1589int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1590		   int npage, int prot, struct page **pages)
1591{
1592	/* group->container cannot change while a vfio device is open */
1593	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1594		return -EINVAL;
1595	if (!device->ops->dma_unmap)
1596		return -EINVAL;
1597	if (vfio_device_has_container(device))
1598		return vfio_device_container_pin_pages(device, iova,
1599						       npage, prot, pages);
1600	if (device->iommufd_access) {
1601		int ret;
1602
1603		if (iova > ULONG_MAX)
1604			return -EINVAL;
1605		/*
1606		 * VFIO ignores the sub page offset, npages is from the start of
1607		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1608		 * the sub page offset by doing:
1609		 *     pages[0] + (iova % PAGE_SIZE)
1610		 */
1611		ret = iommufd_access_pin_pages(
1612			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1613			npage * PAGE_SIZE, pages,
1614			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1615		if (ret)
1616			return ret;
1617		return npage;
1618	}
1619	return -EINVAL;
1620}
1621EXPORT_SYMBOL(vfio_pin_pages);
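/*
 * Example (sketch for an emulated-IOMMU/mdev style driver; the "my" names are
 * hypothetical): pin a single page and re-apply the sub-page offset as the
 * comment above describes.
 *
 *	struct page *pg;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(pg) + offset_in_page(iova);
 *	// ... access the guest page through "va" ...
 *	kunmap_local(va);
 *
 *	vfio_unpin_pages(&my->vdev, iova, 1);
 */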
1622
1623/*
1624 * Unpin contiguous host pages for local domain only.
1625 * @device [in]  : device
1626 * @iova [in]    : starting address of user pages to be unpinned.
1627 * @npage [in]   : count of pages to be unpinned.  This count should not
1628 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1629 */
1630void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1631{
1632	if (WARN_ON(!vfio_assert_device_open(device)))
1633		return;
1634	if (WARN_ON(!device->ops->dma_unmap))
1635		return;
1636
1637	if (vfio_device_has_container(device)) {
1638		vfio_device_container_unpin_pages(device, iova, npage);
1639		return;
1640	}
1641	if (device->iommufd_access) {
1642		if (WARN_ON(iova > ULONG_MAX))
1643			return;
1644		iommufd_access_unpin_pages(device->iommufd_access,
1645					   ALIGN_DOWN(iova, PAGE_SIZE),
1646					   npage * PAGE_SIZE);
1647		return;
1648	}
1649}
1650EXPORT_SYMBOL(vfio_unpin_pages);
1651
1652/*
  653 * This interface allows the CPUs to perform a form of virtual DMA on
  654 * behalf of the device.
  655 *
  656 * The CPUs read from or write to a range of IOVAs pointing to user space
  657 * memory, using a kernel buffer as the source or destination.
1658 *
1659 * As the read/write of user space memory is conducted via the CPUs and is
1660 * not a real device DMA, it is not necessary to pin the user space memory.
1661 *
1662 * @device [in]		: VFIO device
1663 * @iova [in]		: base IOVA of a user space buffer
1664 * @data [in]		: pointer to kernel buffer
1665 * @len [in]		: kernel buffer length
1666 * @write		: indicate read or write
1667 * Return error code on failure or 0 on success.
1668 */
1669int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1670		size_t len, bool write)
1671{
1672	if (!data || len <= 0 || !vfio_assert_device_open(device))
1673		return -EINVAL;
1674
1675	if (vfio_device_has_container(device))
1676		return vfio_device_container_dma_rw(device, iova,
1677						    data, len, write);
1678
1679	if (device->iommufd_access) {
1680		unsigned int flags = 0;
1681
1682		if (iova > ULONG_MAX)
1683			return -EINVAL;
1684
1685		/* VFIO historically tries to auto-detect a kthread */
1686		if (!current->mm)
1687			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1688		if (write)
1689			flags |= IOMMUFD_ACCESS_RW_WRITE;
1690		return iommufd_access_rw(device->iommufd_access, iova, data,
1691					 len, flags);
1692	}
1693	return -EINVAL;
1694}
1695EXPORT_SYMBOL(vfio_dma_rw);
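/*
 * Example (sketch; "my", "my_handle_desc" and "MY_STATUS_OFF" are
 * hypothetical): read a small descriptor from guest memory at @iova and write
 * a status back, letting the CPU do the transfer instead of the device.
 *
 *	__le32 desc, status;
 *	int ret;
 *
 *	ret = vfio_dma_rw(&my->vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 *	status = cpu_to_le32(my_handle_desc(my, le32_to_cpu(desc)));
 *	return vfio_dma_rw(&my->vdev, iova + MY_STATUS_OFF, &status,
 *			   sizeof(status), true);
 */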
1696
1697/*
1698 * Module/class support
1699 */
1700static int __init vfio_init(void)
1701{
1702	int ret;
1703
1704	ida_init(&vfio.device_ida);
1705
1706	ret = vfio_group_init();
1707	if (ret)
1708		return ret;
1709
1710	ret = vfio_virqfd_init();
1711	if (ret)
1712		goto err_virqfd;
1713
1714	/* /sys/class/vfio-dev/vfioX */
1715	vfio.device_class = class_create("vfio-dev");
1716	if (IS_ERR(vfio.device_class)) {
1717		ret = PTR_ERR(vfio.device_class);
1718		goto err_dev_class;
1719	}
1720
1721	ret = vfio_cdev_init(vfio.device_class);
1722	if (ret)
1723		goto err_alloc_dev_chrdev;
1724
1725	vfio_debugfs_create_root();
1726	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1727	return 0;
1728
1729err_alloc_dev_chrdev:
1730	class_destroy(vfio.device_class);
1731	vfio.device_class = NULL;
1732err_dev_class:
1733	vfio_virqfd_exit();
1734err_virqfd:
1735	vfio_group_cleanup();
1736	return ret;
1737}
1738
1739static void __exit vfio_cleanup(void)
1740{
1741	vfio_debugfs_remove_root();
1742	ida_destroy(&vfio.device_ida);
1743	vfio_cdev_cleanup();
1744	class_destroy(vfio.device_class);
1745	vfio.device_class = NULL;
1746	vfio_virqfd_exit();
1747	vfio_group_cleanup();
1748	xa_destroy(&vfio_device_set_xa);
1749}
1750
1751module_init(vfio_init);
1752module_exit(vfio_cleanup);
1753
1754MODULE_IMPORT_NS("IOMMUFD");
1755MODULE_VERSION(DRIVER_VERSION);
1756MODULE_LICENSE("GPL v2");
1757MODULE_AUTHOR(DRIVER_AUTHOR);
1758MODULE_DESCRIPTION(DRIVER_DESC);
1759MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");