v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/fs.h>
  17#include <linux/idr.h>
  18#include <linux/iommu.h>
  19#if IS_ENABLED(CONFIG_KVM)
  20#include <linux/kvm_host.h>
  21#endif
  22#include <linux/list.h>
  23#include <linux/miscdevice.h>
  24#include <linux/module.h>
  25#include <linux/mount.h>
  26#include <linux/mutex.h>
  27#include <linux/pci.h>
  28#include <linux/pseudo_fs.h>
  29#include <linux/rwsem.h>
  30#include <linux/sched.h>
  31#include <linux/slab.h>
  32#include <linux/stat.h>
  33#include <linux/string.h>
  34#include <linux/uaccess.h>
  35#include <linux/vfio.h>
  36#include <linux/wait.h>
  37#include <linux/sched/signal.h>
  38#include <linux/pm_runtime.h>
  39#include <linux/interval_tree.h>
  40#include <linux/iova_bitmap.h>
  41#include <linux/iommufd.h>
  42#include "vfio.h"
  43
  44#define DRIVER_VERSION	"0.3"
  45#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  46#define DRIVER_DESC	"VFIO - User Level meta-driver"
  47
  48#define VFIO_MAGIC 0x5646494f /* "VFIO" */
  49
  50static struct vfio {
  51	struct class			*device_class;
  52	struct ida			device_ida;
  53	struct vfsmount			*vfs_mount;
  54	int				fs_count;
  55} vfio;
  56
  57#ifdef CONFIG_VFIO_NOIOMMU
  58bool vfio_noiommu __read_mostly;
  59module_param_named(enable_unsafe_noiommu_mode,
  60		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
  61MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  62#endif
  63
  64static DEFINE_XARRAY(vfio_device_set_xa);
  65
  66int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  67{
  68	unsigned long idx = (unsigned long)set_id;
  69	struct vfio_device_set *new_dev_set;
  70	struct vfio_device_set *dev_set;
  71
  72	if (WARN_ON(!set_id))
  73		return -EINVAL;
  74
  75	/*
  76	 * Atomically acquire a singleton object in the xarray for this set_id
  77	 */
  78	xa_lock(&vfio_device_set_xa);
  79	dev_set = xa_load(&vfio_device_set_xa, idx);
  80	if (dev_set)
  81		goto found_get_ref;
  82	xa_unlock(&vfio_device_set_xa);
  83
  84	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  85	if (!new_dev_set)
  86		return -ENOMEM;
  87	mutex_init(&new_dev_set->lock);
  88	INIT_LIST_HEAD(&new_dev_set->device_list);
  89	new_dev_set->set_id = set_id;
  90
  91	xa_lock(&vfio_device_set_xa);
  92	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  93			       GFP_KERNEL);
  94	if (!dev_set) {
  95		dev_set = new_dev_set;
  96		goto found_get_ref;
  97	}
  98
  99	kfree(new_dev_set);
 100	if (xa_is_err(dev_set)) {
 101		xa_unlock(&vfio_device_set_xa);
 102		return xa_err(dev_set);
 103	}
 104
 105found_get_ref:
 106	dev_set->device_count++;
 107	xa_unlock(&vfio_device_set_xa);
 108	mutex_lock(&dev_set->lock);
 109	device->dev_set = dev_set;
 110	list_add_tail(&device->dev_set_list, &dev_set->device_list);
 111	mutex_unlock(&dev_set->lock);
 112	return 0;
 113}
 114EXPORT_SYMBOL_GPL(vfio_assign_device_set);
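
An illustrative sketch (editorial addition, not part of this file): a driver typically picks a set_id before registration so that devices which must be handled together share the same pointer. vfio-pci style drivers key the set on the PCI slot or bus so that functions that can only be reset as a group land in one vfio_device_set. The helper name my_assign_reset_set() is hypothetical and assumes the usual <linux/pci.h>/<linux/vfio.h> driver context.

static int my_assign_reset_set(struct vfio_device *vdev, struct pci_dev *pdev)
{
	/*
	 * Every device passing the same set_id pointer joins the same
	 * vfio_device_set, so all functions behind one slot are grouped.
	 */
	if (pdev->slot)
		return vfio_assign_device_set(vdev, pdev->slot);

	/* No slot: fall back to a singleton set keyed by the device itself */
	return vfio_assign_device_set(vdev, vdev);
}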
 115
 116static void vfio_release_device_set(struct vfio_device *device)
 117{
 118	struct vfio_device_set *dev_set = device->dev_set;
 119
 120	if (!dev_set)
 121		return;
 122
 123	mutex_lock(&dev_set->lock);
 124	list_del(&device->dev_set_list);
 125	mutex_unlock(&dev_set->lock);
 126
 127	xa_lock(&vfio_device_set_xa);
 128	if (!--dev_set->device_count) {
 129		__xa_erase(&vfio_device_set_xa,
 130			   (unsigned long)dev_set->set_id);
 131		mutex_destroy(&dev_set->lock);
 132		kfree(dev_set);
 133	}
 134	xa_unlock(&vfio_device_set_xa);
 135}
 136
 137unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
 138{
 139	struct vfio_device *cur;
 140	unsigned int open_count = 0;
 141
 142	lockdep_assert_held(&dev_set->lock);
 143
 144	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 145		open_count += cur->open_count;
 146	return open_count;
 147}
 148EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
 149
 150struct vfio_device *
 151vfio_find_device_in_devset(struct vfio_device_set *dev_set,
 152			   struct device *dev)
 153{
 154	struct vfio_device *cur;
 155
 156	lockdep_assert_held(&dev_set->lock);
 157
 158	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 159		if (cur->dev == dev)
 160			return cur;
 161	return NULL;
 162}
 163EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
 164
 165/*
 166 * Device objects - create, release, get, put, search
 167 */
 168/* Device reference always implies a group reference */
 169void vfio_device_put_registration(struct vfio_device *device)
 170{
 171	if (refcount_dec_and_test(&device->refcount))
 172		complete(&device->comp);
 173}
 174
 175bool vfio_device_try_get_registration(struct vfio_device *device)
 176{
 177	return refcount_inc_not_zero(&device->refcount);
 178}
 179
 180/*
 181 * VFIO driver API
 182 */
 183/* Release helper called by vfio_put_device() */
 184static void vfio_device_release(struct device *dev)
 185{
 186	struct vfio_device *device =
 187			container_of(dev, struct vfio_device, device);
 188
 189	vfio_release_device_set(device);
 190	ida_free(&vfio.device_ida, device->index);
 191
 192	if (device->ops->release)
 193		device->ops->release(device);
 194
 195	iput(device->inode);
 196	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 197	kvfree(device);
 198}
 199
 200static int vfio_init_device(struct vfio_device *device, struct device *dev,
 201			    const struct vfio_device_ops *ops);
 202
 203/*
 204 * Allocate and initialize vfio_device so it can be registered to vfio
 205 * core.
 206 *
 207 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 208 * @size is the size of the structure to be allocated, including any
 209 * private data used by the driver.
 210 *
 211 * Driver may provide an @init callback to cover device private data.
 212 *
 213 * Use vfio_put_device() to release the structure after success return.
 214 */
 215struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
 216				       const struct vfio_device_ops *ops)
 217{
 218	struct vfio_device *device;
 219	int ret;
 220
 221	if (WARN_ON(size < sizeof(struct vfio_device)))
 222		return ERR_PTR(-EINVAL);
 223
 224	device = kvzalloc(size, GFP_KERNEL);
 225	if (!device)
 226		return ERR_PTR(-ENOMEM);
 227
 228	ret = vfio_init_device(device, dev, ops);
 229	if (ret)
 230		goto out_free;
 231	return device;
 232
 233out_free:
 234	kvfree(device);
 235	return ERR_PTR(ret);
 236}
 237EXPORT_SYMBOL_GPL(_vfio_alloc_device);
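
A hedged sketch (editorial addition, not kernel code) of the driver-side allocation described in the comment above, using the vfio_alloc_device() wrapper from <linux/vfio.h>. struct my_vfio_device, my_ops and my_alloc() are hypothetical; the embedded struct vfio_device must be the first member so the wrapper's container_of() resolves at offset zero.

struct my_vfio_device {
	struct vfio_device vdev;	/* must be the first member */
	void __iomem *regs;		/* driver-private state follows */
};

static const struct vfio_device_ops my_ops = {
	.name = "my-vfio-driver",
	/* .init, .release, .open_device, ... filled in by a real driver */
};

static struct my_vfio_device *my_alloc(struct device *dev)
{
	struct my_vfio_device *my;

	/* Allocates sizeof(struct my_vfio_device) and calls my_ops.init() */
	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
	if (IS_ERR(my))
		return my;

	/* On any later error path, vfio_put_device(&my->vdev) frees this */
	return my;
}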
 238
 239static int vfio_fs_init_fs_context(struct fs_context *fc)
 240{
 241	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
 242}
 243
 244static struct file_system_type vfio_fs_type = {
 245	.name = "vfio",
 246	.owner = THIS_MODULE,
 247	.init_fs_context = vfio_fs_init_fs_context,
 248	.kill_sb = kill_anon_super,
 249};
 250
 251static struct inode *vfio_fs_inode_new(void)
 252{
 253	struct inode *inode;
 254	int ret;
 255
 256	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
 257	if (ret)
 258		return ERR_PTR(ret);
 259
 260	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
 261	if (IS_ERR(inode))
 262		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 263
 264	return inode;
 265}
 266
 267/*
 268 * Initialize a vfio_device so it can be registered to vfio core.
 269 */
 270static int vfio_init_device(struct vfio_device *device, struct device *dev,
 271			    const struct vfio_device_ops *ops)
 272{
 273	int ret;
 274
 275	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 276	if (ret < 0) {
 277		dev_dbg(dev, "Error to alloc index\n");
 278		return ret;
 279	}
 280
 281	device->index = ret;
 282	init_completion(&device->comp);
 283	device->dev = dev;
 284	device->ops = ops;
 285	device->inode = vfio_fs_inode_new();
 286	if (IS_ERR(device->inode)) {
 287		ret = PTR_ERR(device->inode);
 288		goto out_inode;
 289	}
 290
 291	if (ops->init) {
 292		ret = ops->init(device);
 293		if (ret)
 294			goto out_uninit;
 295	}
 296
 297	device_initialize(&device->device);
 298	device->device.release = vfio_device_release;
 299	device->device.class = vfio.device_class;
 300	device->device.parent = device->dev;
 301	return 0;
 302
 303out_uninit:
 304	iput(device->inode);
 305	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
 306out_inode:
 307	vfio_release_device_set(device);
 308	ida_free(&vfio.device_ida, device->index);
 309	return ret;
 310}
 311
 312static int __vfio_register_dev(struct vfio_device *device,
 313			       enum vfio_group_type type)
 314{
 315	int ret;
 316
 317	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
 318		    (!device->ops->bind_iommufd ||
 319		     !device->ops->unbind_iommufd ||
 320		     !device->ops->attach_ioas ||
 321		     !device->ops->detach_ioas)))
 322		return -EINVAL;
 323
 324	/*
 325	 * If the driver doesn't specify a set then the device is added to a
 326	 * singleton set just for itself.
 327	 */
 328	if (!device->dev_set)
 329		vfio_assign_device_set(device, device);
 330
 331	ret = dev_set_name(&device->device, "vfio%d", device->index);
 332	if (ret)
 333		return ret;
 334
 335	ret = vfio_device_set_group(device, type);
 336	if (ret)
 337		return ret;
 338
 339	/*
 340	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
 341	 * restore cache coherency. It has to be checked here because it is only
 342	 * valid for cases where we are using iommu groups.
 343	 */
 344	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
 345	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
 346		ret = -EINVAL;
 347		goto err_out;
 348	}
 349
 350	ret = vfio_device_add(device);
 351	if (ret)
 352		goto err_out;
 353
 354	/* Refcounting can't start until the driver calls register */
 355	refcount_set(&device->refcount, 1);
 356
 357	vfio_device_group_register(device);
 358	vfio_device_debugfs_init(device);
 359
 360	return 0;
 361err_out:
 362	vfio_device_remove_group(device);
 363	return ret;
 364}
 365
 366int vfio_register_group_dev(struct vfio_device *device)
 367{
 368	return __vfio_register_dev(device, VFIO_IOMMU);
 369}
 370EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 371
 372/*
 373 * Register a virtual device without IOMMU backing.  The user of this
 374 * device must not be able to directly trigger unmediated DMA.
 375 */
 376int vfio_register_emulated_iommu_dev(struct vfio_device *device)
 377{
 378	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
 379}
 380EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
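
Sketch of how the registration entry points above pair with tear-down in a driver's probe/remove path (editorial addition; names are hypothetical and build on the allocation sketch earlier). A physical device driver calls vfio_register_group_dev(); an emulated/mdev-style driver would call vfio_register_emulated_iommu_dev() instead.

static int my_probe(struct device *dev)
{
	struct my_vfio_device *my;
	int ret;

	my = my_alloc(dev);		/* see the allocation sketch above */
	if (IS_ERR(my))
		return PTR_ERR(my);

	ret = vfio_register_group_dev(&my->vdev);
	if (ret) {
		vfio_put_device(&my->vdev);
		return ret;
	}
	dev_set_drvdata(dev, my);
	return 0;
}

static void my_remove(struct device *dev)
{
	struct my_vfio_device *my = dev_get_drvdata(dev);

	/* Blocks until all users release the device, see the comment below */
	vfio_unregister_group_dev(&my->vdev);
	vfio_put_device(&my->vdev);
}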
 381
 382/*
 383 * Decrement the device reference count and wait for the device to be
 384 * removed.  Open file descriptors for the device... */
 385void vfio_unregister_group_dev(struct vfio_device *device)
 386{
 387	unsigned int i = 0;
 388	bool interrupted = false;
 389	long rc;
 390
 391	/*
  392	 * Prevent a new device from being opened by userspace via the
 393	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
 394	 */
 395	vfio_device_group_unregister(device);
 396
 397	/*
  398	 * Balances vfio_device_add() in the register path, and also prevents
  399	 * a new device from being opened by userspace in the cdev path.
 400	 */
 401	vfio_device_del(device);
 402
 403	vfio_device_put_registration(device);
 404	rc = try_wait_for_completion(&device->comp);
 405	while (rc <= 0) {
 406		if (device->ops->request)
 407			device->ops->request(device, i++);
 408
 409		if (interrupted) {
 410			rc = wait_for_completion_timeout(&device->comp,
 411							 HZ * 10);
 412		} else {
 413			rc = wait_for_completion_interruptible_timeout(
 414				&device->comp, HZ * 10);
 415			if (rc < 0) {
 416				interrupted = true;
 417				dev_warn(device->dev,
 418					 "Device is currently in use, task"
 419					 " \"%s\" (%d) "
 420					 "blocked until device is released",
 421					 current->comm, task_pid_nr(current));
 422			}
 423		}
 424	}
 425
 426	vfio_device_debugfs_exit(device);
 427	/* Balances vfio_device_set_group in register path */
 428	vfio_device_remove_group(device);
 429}
 430EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 431
 432#if IS_ENABLED(CONFIG_KVM)
 433void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
 434{
 435	void (*pfn)(struct kvm *kvm);
 436	bool (*fn)(struct kvm *kvm);
 437	bool ret;
 438
 439	lockdep_assert_held(&device->dev_set->lock);
 440
 441	if (!kvm)
 442		return;
 443
 444	pfn = symbol_get(kvm_put_kvm);
 445	if (WARN_ON(!pfn))
 446		return;
 447
 448	fn = symbol_get(kvm_get_kvm_safe);
 449	if (WARN_ON(!fn)) {
 450		symbol_put(kvm_put_kvm);
 451		return;
 452	}
 453
 454	ret = fn(kvm);
 455	symbol_put(kvm_get_kvm_safe);
 456	if (!ret) {
 457		symbol_put(kvm_put_kvm);
 458		return;
 459	}
 460
 461	device->put_kvm = pfn;
 462	device->kvm = kvm;
 463}
 464
 465void vfio_device_put_kvm(struct vfio_device *device)
 466{
 467	lockdep_assert_held(&device->dev_set->lock);
 468
 469	if (!device->kvm)
 470		return;
 471
 472	if (WARN_ON(!device->put_kvm))
 473		goto clear;
 474
 475	device->put_kvm(device->kvm);
 476	device->put_kvm = NULL;
 477	symbol_put(kvm_put_kvm);
 478
 479clear:
 480	device->kvm = NULL;
 481}
 482#endif
 483
 484/* true if the vfio_device has open_device() called but not close_device() */
 485static bool vfio_assert_device_open(struct vfio_device *device)
 486{
 487	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 488}
 489
 490struct vfio_device_file *
 491vfio_allocate_device_file(struct vfio_device *device)
 492{
 493	struct vfio_device_file *df;
 494
 495	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
 496	if (!df)
 497		return ERR_PTR(-ENOMEM);
 498
 499	df->device = device;
 500	spin_lock_init(&df->kvm_ref_lock);
 501
 502	return df;
 503}
 504
 505static int vfio_df_device_first_open(struct vfio_device_file *df)
 506{
 507	struct vfio_device *device = df->device;
 508	struct iommufd_ctx *iommufd = df->iommufd;
 509	int ret;
 510
 511	lockdep_assert_held(&device->dev_set->lock);
 512
 513	if (!try_module_get(device->dev->driver->owner))
 514		return -ENODEV;
 515
 516	if (iommufd)
 517		ret = vfio_df_iommufd_bind(df);
 518	else
 519		ret = vfio_device_group_use_iommu(device);
 520	if (ret)
 521		goto err_module_put;
 522
 523	if (device->ops->open_device) {
 524		ret = device->ops->open_device(device);
 525		if (ret)
 526			goto err_unuse_iommu;
 527	}
 528	return 0;
 529
 530err_unuse_iommu:
 531	if (iommufd)
 532		vfio_df_iommufd_unbind(df);
 533	else
 534		vfio_device_group_unuse_iommu(device);
 535err_module_put:
 536	module_put(device->dev->driver->owner);
 537	return ret;
 538}
 539
 540static void vfio_df_device_last_close(struct vfio_device_file *df)
 541{
 542	struct vfio_device *device = df->device;
 543	struct iommufd_ctx *iommufd = df->iommufd;
 544
 545	lockdep_assert_held(&device->dev_set->lock);
 546
 547	if (device->ops->close_device)
 548		device->ops->close_device(device);
 549	if (iommufd)
 550		vfio_df_iommufd_unbind(df);
 551	else
 552		vfio_device_group_unuse_iommu(device);
 553	module_put(device->dev->driver->owner);
 554}
 555
 556int vfio_df_open(struct vfio_device_file *df)
 557{
 558	struct vfio_device *device = df->device;
 559	int ret = 0;
 560
 561	lockdep_assert_held(&device->dev_set->lock);
 562
 563	/*
 564	 * Only the group path allows the device to be opened multiple
  565	 * times.  The device cdev path has no secure way to support it.
 566	 */
 567	if (device->open_count != 0 && !df->group)
 568		return -EINVAL;
 569
 570	device->open_count++;
 571	if (device->open_count == 1) {
 572		ret = vfio_df_device_first_open(df);
 573		if (ret)
 574			device->open_count--;
 575	}
 576
 577	return ret;
 578}
 579
 580void vfio_df_close(struct vfio_device_file *df)
 581{
 582	struct vfio_device *device = df->device;
 583
 584	lockdep_assert_held(&device->dev_set->lock);
 585
 586	vfio_assert_device_open(device);
 587	if (device->open_count == 1)
 588		vfio_df_device_last_close(df);
 589	device->open_count--;
 590}
 591
 592/*
 593 * Wrapper around pm_runtime_resume_and_get().
 594 * Return error code on failure or 0 on success.
 595 */
 596static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
 597{
 598	struct device *dev = device->dev;
 599
 600	if (dev->driver && dev->driver->pm) {
 601		int ret;
 602
 603		ret = pm_runtime_resume_and_get(dev);
 604		if (ret) {
 605			dev_info_ratelimited(dev,
 606				"vfio: runtime resume failed %d\n", ret);
 607			return -EIO;
 608		}
 609	}
 610
 611	return 0;
 612}
 613
 614/*
 615 * Wrapper around pm_runtime_put().
 616 */
 617static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 618{
 619	struct device *dev = device->dev;
 620
 621	if (dev->driver && dev->driver->pm)
 622		pm_runtime_put(dev);
 623}
 624
 625/*
 626 * VFIO Device fd
 627 */
 628static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 629{
 630	struct vfio_device_file *df = filep->private_data;
 631	struct vfio_device *device = df->device;
 632
 633	if (df->group)
 634		vfio_df_group_close(df);
 635	else
 636		vfio_df_unbind_iommufd(df);
 637
 638	vfio_device_put_registration(device);
 639
 640	kfree(df);
 641
 642	return 0;
 643}
 644
 645/*
 646 * vfio_mig_get_next_state - Compute the next step in the FSM
 647 * @cur_fsm - The current state the device is in
 648 * @new_fsm - The target state to reach
 649 * @next_fsm - Pointer to the next step to get to new_fsm
 650 *
 651 * Return 0 upon success, otherwise -errno
 652 * Upon success the next step in the state progression between cur_fsm and
 653 * new_fsm will be set in next_fsm.
 654 *
 655 * This breaks down requests for combination transitions into smaller steps and
 656 * returns the next step to get to new_fsm. The function may need to be called
 657 * multiple times before reaching new_fsm.
 658 *
 659 */
 660int vfio_mig_get_next_state(struct vfio_device *device,
 661			    enum vfio_device_mig_state cur_fsm,
 662			    enum vfio_device_mig_state new_fsm,
 663			    enum vfio_device_mig_state *next_fsm)
 664{
 665	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
 666	/*
 667	 * The coding in this table requires the driver to implement the
 668	 * following FSM arcs:
 669	 *         RESUMING -> STOP
 670	 *         STOP -> RESUMING
 671	 *         STOP -> STOP_COPY
 672	 *         STOP_COPY -> STOP
 673	 *
 674	 * If P2P is supported then the driver must also implement these FSM
 675	 * arcs:
 676	 *         RUNNING -> RUNNING_P2P
 677	 *         RUNNING_P2P -> RUNNING
 678	 *         RUNNING_P2P -> STOP
 679	 *         STOP -> RUNNING_P2P
 680	 *
 681	 * If precopy is supported then the driver must support these additional
 682	 * FSM arcs:
 683	 *         RUNNING -> PRE_COPY
 684	 *         PRE_COPY -> RUNNING
 685	 *         PRE_COPY -> STOP_COPY
 686	 * However, if precopy and P2P are supported together then the driver
 687	 * must support these additional arcs beyond the P2P arcs above:
 688	 *         PRE_COPY -> RUNNING
 689	 *         PRE_COPY -> PRE_COPY_P2P
 690	 *         PRE_COPY_P2P -> PRE_COPY
 691	 *         PRE_COPY_P2P -> RUNNING_P2P
 692	 *         PRE_COPY_P2P -> STOP_COPY
 693	 *         RUNNING -> PRE_COPY
 694	 *         RUNNING_P2P -> PRE_COPY_P2P
 695	 *
 696	 * Without P2P and precopy the driver must implement:
 697	 *         RUNNING -> STOP
 698	 *         STOP -> RUNNING
 699	 *
 700	 * The coding will step through multiple states for some combination
 701	 * transitions; if all optional features are supported, this means the
 702	 * following ones:
 703	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
 704	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
 705	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
 706	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 707	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
 708	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
 709	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
 710	 *         RESUMING -> STOP -> RUNNING_P2P
 711	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
 712	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
 713	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 714	 *         RESUMING -> STOP -> STOP_COPY
 715	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
 716	 *         RUNNING -> RUNNING_P2P -> STOP
 717	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 718	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
 719	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
 720	 *         RUNNING_P2P -> STOP -> RESUMING
 721	 *         RUNNING_P2P -> STOP -> STOP_COPY
 722	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
 723	 *         STOP -> RUNNING_P2P -> RUNNING
 724	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 725	 *         STOP_COPY -> STOP -> RESUMING
 726	 *         STOP_COPY -> STOP -> RUNNING_P2P
 727	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
 728	 *
 729	 *  The following transitions are blocked:
 730	 *         STOP_COPY -> PRE_COPY
 731	 *         STOP_COPY -> PRE_COPY_P2P
 732	 */
 733	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
 734		[VFIO_DEVICE_STATE_STOP] = {
 735			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 736			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 737			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 738			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 739			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 740			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 741			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 742			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 743		},
 744		[VFIO_DEVICE_STATE_RUNNING] = {
 745			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 746			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 747			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 748			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 749			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 750			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 751			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 752			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 753		},
 754		[VFIO_DEVICE_STATE_PRE_COPY] = {
 755			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
 756			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 757			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 758			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 759			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 760			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
 761			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
 762			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 763		},
 764		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
 765			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 766			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 767			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 768			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 769			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 770			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 771			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 772			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 773		},
 774		[VFIO_DEVICE_STATE_STOP_COPY] = {
 775			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 776			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 777			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 778			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 779			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 780			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 781			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 782			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 783		},
 784		[VFIO_DEVICE_STATE_RESUMING] = {
 785			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 786			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 787			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
 788			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
 789			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 790			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 791			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 792			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 793		},
 794		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
 795			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 796			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 797			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
 798			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 799			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 800			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 801			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 802			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 803		},
 804		[VFIO_DEVICE_STATE_ERROR] = {
 805			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
 806			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
 807			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 808			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 809			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
 810			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
 811			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
 812			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 813		},
 814	};
 815
 816	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
 817		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
 818		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
 819		[VFIO_DEVICE_STATE_PRE_COPY] =
 820			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
 821		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
 822						   VFIO_MIGRATION_P2P |
 823						   VFIO_MIGRATION_PRE_COPY,
 824		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
 825		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
 826		[VFIO_DEVICE_STATE_RUNNING_P2P] =
 827			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
 828		[VFIO_DEVICE_STATE_ERROR] = ~0U,
 829	};
 830
 831	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 832		    (state_flags_table[cur_fsm] & device->migration_flags) !=
 833			state_flags_table[cur_fsm]))
 834		return -EINVAL;
 835
 836	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 837	   (state_flags_table[new_fsm] & device->migration_flags) !=
 838			state_flags_table[new_fsm])
 839		return -EINVAL;
 840
 841	/*
 842	 * Arcs touching optional and unsupported states are skipped over. The
 843	 * driver will instead see an arc from the original state to the next
 844	 * logical state, as per the above comment.
 845	 */
 846	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
 847	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
 848			state_flags_table[*next_fsm])
 849		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
 850
 851	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
 852}
 853EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
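
A hedged sketch (editorial addition) of the calling convention described in the kernel-doc above: a driver's migration_set_state() handler steps through intermediate states one arc at a time until the requested state is reached. my_step_to_state() and my_do_one_arc() are hypothetical; real drivers keep the current state in their private structure.

static struct file *
my_step_to_state(struct vfio_device *vdev,
		 enum vfio_device_mig_state *cur,
		 enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next;
	struct file *filp = NULL;
	int ret;

	while (*cur != new_state) {
		ret = vfio_mig_get_next_state(vdev, *cur, new_state, &next);
		if (ret)
			return ERR_PTR(ret);

		/* Device-specific handler for one FSM arc from the list above */
		filp = my_do_one_arc(vdev, next);
		if (IS_ERR(filp))
			return filp;

		*cur = next;
	}
	/* NULL, or the data-transfer file produced by the final arc */
	return filp;
}

Returning a struct file from the STOP_COPY or RESUMING arc is what lets the core hand a data-transfer fd back to userspace via vfio_ioct_mig_return_fd() below.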
 854
 855/*
  856 * Convert the driver's struct file into an FD number and return it to userspace
 857 */
 858static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
 859				   struct vfio_device_feature_mig_state *mig)
 860{
 861	int ret;
 862	int fd;
 863
 864	fd = get_unused_fd_flags(O_CLOEXEC);
 865	if (fd < 0) {
 866		ret = fd;
 867		goto out_fput;
 868	}
 869
 870	mig->data_fd = fd;
 871	if (copy_to_user(arg, mig, sizeof(*mig))) {
 872		ret = -EFAULT;
 873		goto out_put_unused;
 874	}
 875	fd_install(fd, filp);
 876	return 0;
 877
 878out_put_unused:
 879	put_unused_fd(fd);
 880out_fput:
 881	fput(filp);
 882	return ret;
 883}
 884
 885static int
 886vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 887					   u32 flags, void __user *arg,
 888					   size_t argsz)
 889{
 890	size_t minsz =
 891		offsetofend(struct vfio_device_feature_mig_state, data_fd);
 892	struct vfio_device_feature_mig_state mig;
 893	struct file *filp = NULL;
 894	int ret;
 895
 896	if (!device->mig_ops)
 897		return -ENOTTY;
 898
 899	ret = vfio_check_feature(flags, argsz,
 900				 VFIO_DEVICE_FEATURE_SET |
 901				 VFIO_DEVICE_FEATURE_GET,
 902				 sizeof(mig));
 903	if (ret != 1)
 904		return ret;
 905
 906	if (copy_from_user(&mig, arg, minsz))
 907		return -EFAULT;
 908
 909	if (flags & VFIO_DEVICE_FEATURE_GET) {
 910		enum vfio_device_mig_state curr_state;
 911
 912		ret = device->mig_ops->migration_get_state(device,
 913							   &curr_state);
 914		if (ret)
 915			return ret;
 916		mig.device_state = curr_state;
 917		goto out_copy;
 918	}
 919
 920	/* Handle the VFIO_DEVICE_FEATURE_SET */
 921	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 922	if (IS_ERR(filp) || !filp)
 923		goto out_copy;
 924
 925	return vfio_ioct_mig_return_fd(filp, arg, &mig);
 926out_copy:
 927	mig.data_fd = -1;
 928	if (copy_to_user(arg, &mig, sizeof(mig)))
 929		return -EFAULT;
 930	if (IS_ERR(filp))
 931		return PTR_ERR(filp);
 932	return 0;
 933}
 934
 935static int
 936vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
 937					      u32 flags, void __user *arg,
 938					      size_t argsz)
 939{
 940	struct vfio_device_feature_mig_data_size data_size = {};
 941	unsigned long stop_copy_length;
 942	int ret;
 943
 944	if (!device->mig_ops)
 945		return -ENOTTY;
 946
 947	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 948				 sizeof(data_size));
 949	if (ret != 1)
 950		return ret;
 951
 952	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
 953	if (ret)
 954		return ret;
 955
 956	data_size.stop_copy_length = stop_copy_length;
 957	if (copy_to_user(arg, &data_size, sizeof(data_size)))
 958		return -EFAULT;
 959
 960	return 0;
 961}
 962
 963static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 964					       u32 flags, void __user *arg,
 965					       size_t argsz)
 966{
 967	struct vfio_device_feature_migration mig = {
 968		.flags = device->migration_flags,
 969	};
 970	int ret;
 971
 972	if (!device->mig_ops)
 973		return -ENOTTY;
 974
 975	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 976				 sizeof(mig));
 977	if (ret != 1)
 978		return ret;
 979	if (copy_to_user(arg, &mig, sizeof(mig)))
 980		return -EFAULT;
 981	return 0;
 982}
 983
 984void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 985			      u32 req_nodes)
 986{
 987	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
 988	unsigned long min_gap, curr_gap;
 989
 990	/* Special shortcut when a single range is required */
 991	if (req_nodes == 1) {
 992		unsigned long last;
 993
 994		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 995
 996		/* Empty list */
 997		if (WARN_ON_ONCE(!comb_start))
 998			return;
 999
1000		curr = comb_start;
1001		while (curr) {
1002			last = curr->last;
1003			prev = curr;
1004			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1005			if (prev != comb_start)
1006				interval_tree_remove(prev, root);
1007		}
1008		comb_start->last = last;
1009		return;
1010	}
1011
1012	/* Combine ranges which have the smallest gap */
1013	while (cur_nodes > req_nodes) {
1014		prev = NULL;
1015		min_gap = ULONG_MAX;
1016		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1017		while (curr) {
1018			if (prev) {
1019				curr_gap = curr->start - prev->last;
1020				if (curr_gap < min_gap) {
1021					min_gap = curr_gap;
1022					comb_start = prev;
1023					comb_end = curr;
1024				}
1025			}
1026			prev = curr;
1027			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1028		}
1029
1030		/* Empty list or no nodes to combine */
1031		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1032			break;
1033
1034		comb_start->last = comb_end->last;
1035		interval_tree_remove(comb_end, root);
1036		cur_nodes--;
1037	}
1038}
1039EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
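
A sketch of the intended use (editorial addition): a driver whose dirty-tracking hardware supports only a limited number of ranges folds the user-supplied interval tree down before programming it. MY_HW_MAX_RANGES and my_hw_start_tracking() are hypothetical; the callback signature follows struct vfio_log_ops::log_start.

#define MY_HW_MAX_RANGES	16	/* hypothetical hardware limit */

static int my_log_start(struct vfio_device *vdev,
			struct rb_root_cached *ranges, u32 nnodes,
			u64 *page_size)
{
	if (nnodes > MY_HW_MAX_RANGES)
		vfio_combine_iova_ranges(ranges, nnodes, MY_HW_MAX_RANGES);

	/* Program the (possibly merged) ranges into the device tracker */
	return my_hw_start_tracking(vdev, ranges, *page_size);
}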
1040
1041/* Ranges should fit into a single kernel page */
1042#define LOG_MAX_RANGES \
1043	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1044
1045static int
1046vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1047					u32 flags, void __user *arg,
1048					size_t argsz)
1049{
1050	size_t minsz =
1051		offsetofend(struct vfio_device_feature_dma_logging_control,
1052			    ranges);
1053	struct vfio_device_feature_dma_logging_range __user *ranges;
1054	struct vfio_device_feature_dma_logging_control control;
1055	struct vfio_device_feature_dma_logging_range range;
1056	struct rb_root_cached root = RB_ROOT_CACHED;
1057	struct interval_tree_node *nodes;
1058	u64 iova_end;
1059	u32 nnodes;
1060	int i, ret;
1061
1062	if (!device->log_ops)
1063		return -ENOTTY;
1064
1065	ret = vfio_check_feature(flags, argsz,
1066				 VFIO_DEVICE_FEATURE_SET,
1067				 sizeof(control));
1068	if (ret != 1)
1069		return ret;
1070
1071	if (copy_from_user(&control, arg, minsz))
1072		return -EFAULT;
1073
1074	nnodes = control.num_ranges;
1075	if (!nnodes)
1076		return -EINVAL;
1077
1078	if (nnodes > LOG_MAX_RANGES)
1079		return -E2BIG;
1080
1081	ranges = u64_to_user_ptr(control.ranges);
1082	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1083			      GFP_KERNEL);
1084	if (!nodes)
1085		return -ENOMEM;
1086
1087	for (i = 0; i < nnodes; i++) {
1088		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1089			ret = -EFAULT;
1090			goto end;
1091		}
1092		if (!IS_ALIGNED(range.iova, control.page_size) ||
1093		    !IS_ALIGNED(range.length, control.page_size)) {
1094			ret = -EINVAL;
1095			goto end;
1096		}
1097
1098		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1099		    iova_end > ULONG_MAX) {
1100			ret = -EOVERFLOW;
1101			goto end;
1102		}
1103
1104		nodes[i].start = range.iova;
1105		nodes[i].last = range.iova + range.length - 1;
1106		if (interval_tree_iter_first(&root, nodes[i].start,
1107					     nodes[i].last)) {
1108			/* Range overlapping */
1109			ret = -EINVAL;
1110			goto end;
1111		}
1112		interval_tree_insert(nodes + i, &root);
1113	}
1114
1115	ret = device->log_ops->log_start(device, &root, nnodes,
1116					 &control.page_size);
1117	if (ret)
1118		goto end;
1119
1120	if (copy_to_user(arg, &control, sizeof(control))) {
1121		ret = -EFAULT;
1122		device->log_ops->log_stop(device);
1123	}
1124
1125end:
1126	kfree(nodes);
1127	return ret;
1128}
1129
1130static int
1131vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1132				       u32 flags, void __user *arg,
1133				       size_t argsz)
1134{
1135	int ret;
1136
1137	if (!device->log_ops)
1138		return -ENOTTY;
1139
1140	ret = vfio_check_feature(flags, argsz,
1141				 VFIO_DEVICE_FEATURE_SET, 0);
1142	if (ret != 1)
1143		return ret;
1144
1145	return device->log_ops->log_stop(device);
1146}
1147
1148static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1149					  unsigned long iova, size_t length,
1150					  void *opaque)
1151{
1152	struct vfio_device *device = opaque;
1153
1154	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1155}
1156
1157static int
1158vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1159					 u32 flags, void __user *arg,
1160					 size_t argsz)
1161{
1162	size_t minsz =
1163		offsetofend(struct vfio_device_feature_dma_logging_report,
1164			    bitmap);
1165	struct vfio_device_feature_dma_logging_report report;
1166	struct iova_bitmap *iter;
1167	u64 iova_end;
1168	int ret;
1169
1170	if (!device->log_ops)
1171		return -ENOTTY;
1172
1173	ret = vfio_check_feature(flags, argsz,
1174				 VFIO_DEVICE_FEATURE_GET,
1175				 sizeof(report));
1176	if (ret != 1)
1177		return ret;
1178
1179	if (copy_from_user(&report, arg, minsz))
1180		return -EFAULT;
1181
1182	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1183		return -EINVAL;
1184
1185	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1186	    iova_end > ULONG_MAX)
1187		return -EOVERFLOW;
1188
1189	iter = iova_bitmap_alloc(report.iova, report.length,
1190				 report.page_size,
1191				 u64_to_user_ptr(report.bitmap));
1192	if (IS_ERR(iter))
1193		return PTR_ERR(iter);
1194
1195	ret = iova_bitmap_for_each(iter, device,
1196				   vfio_device_log_read_and_clear);
1197
1198	iova_bitmap_free(iter);
1199	return ret;
1200}
1201
1202static int vfio_ioctl_device_feature(struct vfio_device *device,
1203				     struct vfio_device_feature __user *arg)
1204{
1205	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1206	struct vfio_device_feature feature;
1207
1208	if (copy_from_user(&feature, arg, minsz))
1209		return -EFAULT;
1210
1211	if (feature.argsz < minsz)
1212		return -EINVAL;
1213
1214	/* Check unknown flags */
1215	if (feature.flags &
1216	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1217	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1218		return -EINVAL;
1219
1220	/* GET & SET are mutually exclusive except with PROBE */
1221	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1222	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1223	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1224		return -EINVAL;
1225
1226	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1227	case VFIO_DEVICE_FEATURE_MIGRATION:
1228		return vfio_ioctl_device_feature_migration(
1229			device, feature.flags, arg->data,
1230			feature.argsz - minsz);
1231	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1232		return vfio_ioctl_device_feature_mig_device_state(
1233			device, feature.flags, arg->data,
1234			feature.argsz - minsz);
1235	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1236		return vfio_ioctl_device_feature_logging_start(
1237			device, feature.flags, arg->data,
1238			feature.argsz - minsz);
1239	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1240		return vfio_ioctl_device_feature_logging_stop(
1241			device, feature.flags, arg->data,
1242			feature.argsz - minsz);
1243	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1244		return vfio_ioctl_device_feature_logging_report(
1245			device, feature.flags, arg->data,
1246			feature.argsz - minsz);
1247	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1248		return vfio_ioctl_device_feature_migration_data_size(
1249			device, feature.flags, arg->data,
1250			feature.argsz - minsz);
1251	default:
1252		if (unlikely(!device->ops->device_feature))
1253			return -EINVAL;
1254		return device->ops->device_feature(device, feature.flags,
1255						   arg->data,
1256						   feature.argsz - minsz);
1257	}
1258}
1259
1260static long vfio_device_fops_unl_ioctl(struct file *filep,
1261				       unsigned int cmd, unsigned long arg)
1262{
1263	struct vfio_device_file *df = filep->private_data;
1264	struct vfio_device *device = df->device;
1265	void __user *uptr = (void __user *)arg;
1266	int ret;
1267
1268	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1269		return vfio_df_ioctl_bind_iommufd(df, uptr);
1270
1271	/* Paired with smp_store_release() following vfio_df_open() */
1272	if (!smp_load_acquire(&df->access_granted))
1273		return -EINVAL;
1274
1275	ret = vfio_device_pm_runtime_get(device);
1276	if (ret)
1277		return ret;
1278
1279	/* cdev only ioctls */
1280	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1281		switch (cmd) {
1282		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1283			ret = vfio_df_ioctl_attach_pt(df, uptr);
1284			goto out;
1285
1286		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1287			ret = vfio_df_ioctl_detach_pt(df, uptr);
1288			goto out;
1289		}
1290	}
1291
1292	switch (cmd) {
1293	case VFIO_DEVICE_FEATURE:
1294		ret = vfio_ioctl_device_feature(device, uptr);
1295		break;
1296
1297	default:
1298		if (unlikely(!device->ops->ioctl))
1299			ret = -EINVAL;
1300		else
1301			ret = device->ops->ioctl(device, cmd, arg);
1302		break;
1303	}
1304out:
1305	vfio_device_pm_runtime_put(device);
1306	return ret;
1307}
1308
1309static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1310				     size_t count, loff_t *ppos)
1311{
1312	struct vfio_device_file *df = filep->private_data;
1313	struct vfio_device *device = df->device;
1314
1315	/* Paired with smp_store_release() following vfio_df_open() */
1316	if (!smp_load_acquire(&df->access_granted))
1317		return -EINVAL;
1318
1319	if (unlikely(!device->ops->read))
1320		return -EINVAL;
1321
1322	return device->ops->read(device, buf, count, ppos);
1323}
1324
1325static ssize_t vfio_device_fops_write(struct file *filep,
1326				      const char __user *buf,
1327				      size_t count, loff_t *ppos)
1328{
1329	struct vfio_device_file *df = filep->private_data;
1330	struct vfio_device *device = df->device;
1331
1332	/* Paired with smp_store_release() following vfio_df_open() */
1333	if (!smp_load_acquire(&df->access_granted))
1334		return -EINVAL;
1335
1336	if (unlikely(!device->ops->write))
1337		return -EINVAL;
1338
1339	return device->ops->write(device, buf, count, ppos);
1340}
1341
1342static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1343{
1344	struct vfio_device_file *df = filep->private_data;
1345	struct vfio_device *device = df->device;
1346
1347	/* Paired with smp_store_release() following vfio_df_open() */
1348	if (!smp_load_acquire(&df->access_granted))
1349		return -EINVAL;
1350
1351	if (unlikely(!device->ops->mmap))
1352		return -EINVAL;
1353
1354	return device->ops->mmap(device, vma);
1355}
1356
1357const struct file_operations vfio_device_fops = {
1358	.owner		= THIS_MODULE,
1359	.open		= vfio_device_fops_cdev_open,
1360	.release	= vfio_device_fops_release,
1361	.read		= vfio_device_fops_read,
1362	.write		= vfio_device_fops_write,
1363	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1364	.compat_ioctl	= compat_ptr_ioctl,
1365	.mmap		= vfio_device_fops_mmap,
1366};
1367
1368static struct vfio_device *vfio_device_from_file(struct file *file)
1369{
1370	struct vfio_device_file *df = file->private_data;
1371
1372	if (file->f_op != &vfio_device_fops)
1373		return NULL;
1374	return df->device;
1375}
1376
1377/**
1378 * vfio_file_is_valid - True if the file is valid vfio file
1379 * @file: VFIO group file or VFIO device file
1380 */
1381bool vfio_file_is_valid(struct file *file)
1382{
1383	return vfio_group_from_file(file) ||
1384	       vfio_device_from_file(file);
1385}
1386EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1387
1388/**
1389 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1390 *        is always CPU cache coherent
1391 * @file: VFIO group file or VFIO device file
1392 *
1393 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1394 * bit in DMA transactions. A return of false indicates that the user has
1395 * rights to access additional instructions such as wbinvd on x86.
1396 */
1397bool vfio_file_enforced_coherent(struct file *file)
1398{
1399	struct vfio_device *device;
1400	struct vfio_group *group;
1401
1402	group = vfio_group_from_file(file);
1403	if (group)
1404		return vfio_group_enforced_coherent(group);
1405
1406	device = vfio_device_from_file(file);
1407	if (device)
1408		return device_iommu_capable(device->dev,
1409					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1410
1411	return true;
1412}
1413EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1414
1415static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1416{
1417	struct vfio_device_file *df = file->private_data;
1418
1419	/*
1420	 * The kvm is first recorded in the vfio_device_file, and will
1421	 * be propagated to vfio_device::kvm when the file is bound to
1422	 * iommufd successfully in the vfio device cdev path.
1423	 */
1424	spin_lock(&df->kvm_ref_lock);
1425	df->kvm = kvm;
1426	spin_unlock(&df->kvm_ref_lock);
1427}
1428
1429/**
1430 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1431 * @file: VFIO group file or VFIO device file
1432 * @kvm: KVM to link
1433 *
1434 * When a VFIO device is first opened the KVM will be available in
1435 * device->kvm if one was associated with the file.
1436 */
1437void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1438{
1439	struct vfio_group *group;
1440
1441	group = vfio_group_from_file(file);
1442	if (group)
1443		vfio_group_set_kvm(group, kvm);
1444
1445	if (vfio_device_from_file(file))
1446		vfio_device_file_set_kvm(file, kvm);
1447}
1448EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1449
1450/*
1451 * Sub-module support
1452 */
1453/*
1454 * Helper for managing a buffer of info chain capabilities, allocate or
1455 * reallocate a buffer with additional @size, filling in @id and @version
1456 * of the capability.  A pointer to the new capability is returned.
1457 *
1458 * NB. The chain is based at the head of the buffer, so new entries are
1459 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1460 * next offsets prior to copying to the user buffer.
1461 */
1462struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1463					       size_t size, u16 id, u16 version)
1464{
1465	void *buf;
1466	struct vfio_info_cap_header *header, *tmp;
1467
1468	/* Ensure that the next capability struct will be aligned */
1469	size = ALIGN(size, sizeof(u64));
1470
1471	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1472	if (!buf) {
1473		kfree(caps->buf);
1474		caps->buf = NULL;
1475		caps->size = 0;
1476		return ERR_PTR(-ENOMEM);
1477	}
1478
1479	caps->buf = buf;
1480	header = buf + caps->size;
1481
1482	/* Eventually copied to user buffer, zero */
1483	memset(header, 0, size);
1484
1485	header->id = id;
1486	header->version = version;
1487
1488	/* Add to the end of the capability chain */
1489	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1490		; /* nothing */
1491
1492	tmp->next = caps->size;
1493	caps->size += size;
1494
1495	return header;
1496}
1497EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1498
1499void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1500{
1501	struct vfio_info_cap_header *tmp;
1502	void *buf = (void *)caps->buf;
1503
1504	/* Capability structs should start with proper alignment */
1505	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1506
1507	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1508		tmp->next += offset;
1509}
1510EXPORT_SYMBOL(vfio_info_cap_shift);
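
A hedged sketch (editorial addition) of the buffer-and-shift pattern described above, loosely modeled on how region-info capabilities are reported. The ioctl plumbing around it is hypothetical and simplified (no argsz re-negotiation), and the capability type/subtype values are placeholders.

static int my_report_region_caps(struct vfio_region_info *info,
				 void __user *arg)
{
	struct vfio_info_cap caps = {};
	struct vfio_region_info_cap_type cap_type = {
		.header.id = VFIO_REGION_INFO_CAP_TYPE,
		.header.version = 1,
		.type = 1,		/* placeholder values */
		.subtype = 1,
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap_type.header,
				       sizeof(cap_type));
	if (ret)
		return ret;

	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	info->cap_offset = sizeof(*info);
	/* Chain offsets were buffer-relative; make them user-struct relative */
	vfio_info_cap_shift(&caps, sizeof(*info));
	if (copy_to_user(arg + sizeof(*info), caps.buf, caps.size))
		ret = -EFAULT;

	kfree(caps.buf);	/* chain buffer came from vfio_info_cap_add() */
	return ret;
}

Because vfio_info_cap_add() may reallocate caps.buf each time an entry is appended, the chain is shifted and copied out only once everything has been added.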
1511
1512int vfio_info_add_capability(struct vfio_info_cap *caps,
1513			     struct vfio_info_cap_header *cap, size_t size)
1514{
1515	struct vfio_info_cap_header *header;
1516
1517	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1518	if (IS_ERR(header))
1519		return PTR_ERR(header);
1520
1521	memcpy(header + 1, cap + 1, size - sizeof(*header));
1522
1523	return 0;
1524}
1525EXPORT_SYMBOL(vfio_info_add_capability);
1526
1527int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1528				       int max_irq_type, size_t *data_size)
1529{
1530	unsigned long minsz;
1531	size_t size;
1532
1533	minsz = offsetofend(struct vfio_irq_set, count);
1534
1535	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1536	    (hdr->count >= (U32_MAX - hdr->start)) ||
1537	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1538				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1539		return -EINVAL;
1540
1541	if (data_size)
1542		*data_size = 0;
1543
1544	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1545		return -EINVAL;
1546
1547	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1548	case VFIO_IRQ_SET_DATA_NONE:
1549		size = 0;
1550		break;
1551	case VFIO_IRQ_SET_DATA_BOOL:
1552		size = sizeof(uint8_t);
1553		break;
1554	case VFIO_IRQ_SET_DATA_EVENTFD:
1555		size = sizeof(int32_t);
1556		break;
1557	default:
1558		return -EINVAL;
1559	}
1560
1561	if (size) {
1562		if (hdr->argsz - minsz < hdr->count * size)
1563			return -EINVAL;
1564
1565		if (!data_size)
1566			return -EINVAL;
1567
1568		*data_size = hdr->count * size;
1569	}
1570
1571	return 0;
1572}
1573EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
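
A sketch (editorial addition) of a driver's VFIO_DEVICE_SET_IRQS handler built on the validator above. MY_NUM_IRQS, MY_NUM_IRQ_TYPES and my_program_irqs() are hypothetical; a real driver would size the IRQ count per hdr.index.

#define MY_NUM_IRQS		1	/* hypothetical: one interrupt */
#define MY_NUM_IRQ_TYPES	1	/* hypothetical: only index 0 is valid */

static long my_ioctl_set_irqs(struct vfio_device *vdev, unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	long ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, MY_NUM_IRQS,
						 MY_NUM_IRQ_TYPES, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_program_irqs(vdev, &hdr, data);	/* device-specific */
	kfree(data);
	return ret;
}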
1574
1575/*
1576 * Pin contiguous user pages and return their associated host pages for local
1577 * domain only.
1578 * @device [in]  : device
1579 * @iova [in]    : starting IOVA of user pages to be pinned.
1580 * @npage [in]   : count of pages to be pinned.  This count should not
1581 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1582 * @prot [in]    : protection flags
1583 * @pages[out]   : array of host pages
1584 * Return error or number of pages pinned.
1585 *
1586 * A driver may only call this function if the vfio_device was created
1587 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1588 */
1589int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1590		   int npage, int prot, struct page **pages)
1591{
1592	/* group->container cannot change while a vfio device is open */
1593	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1594		return -EINVAL;
1595	if (!device->ops->dma_unmap)
1596		return -EINVAL;
1597	if (vfio_device_has_container(device))
1598		return vfio_device_container_pin_pages(device, iova,
1599						       npage, prot, pages);
1600	if (device->iommufd_access) {
1601		int ret;
1602
1603		if (iova > ULONG_MAX)
1604			return -EINVAL;
1605		/*
1606		 * VFIO ignores the sub page offset, npages is from the start of
1607		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1608		 * the sub page offset by doing:
1609		 *     pages[0] + (iova % PAGE_SIZE)
1610		 */
1611		ret = iommufd_access_pin_pages(
1612			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1613			npage * PAGE_SIZE, pages,
1614			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1615		if (ret)
1616			return ret;
1617		return npage;
1618	}
1619	return -EINVAL;
1620}
1621EXPORT_SYMBOL(vfio_pin_pages);
1622
1623/*
1624 * Unpin contiguous host pages for local domain only.
1625 * @device [in]  : device
1626 * @iova [in]    : starting address of user pages to be unpinned.
1627 * @npage [in]   : count of pages to be unpinned.  This count should not
1628 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1629 */
1630void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1631{
1632	if (WARN_ON(!vfio_assert_device_open(device)))
1633		return;
1634	if (WARN_ON(!device->ops->dma_unmap))
1635		return;
1636
1637	if (vfio_device_has_container(device)) {
1638		vfio_device_container_unpin_pages(device, iova, npage);
1639		return;
1640	}
1641	if (device->iommufd_access) {
1642		if (WARN_ON(iova > ULONG_MAX))
1643			return;
1644		iommufd_access_unpin_pages(device->iommufd_access,
1645					   ALIGN_DOWN(iova, PAGE_SIZE),
1646					   npage * PAGE_SIZE);
1647		return;
1648	}
1649}
1650EXPORT_SYMBOL(vfio_unpin_pages);
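
A hedged sketch (editorial addition) tying the pin/unpin pair together: an emulated-IOMMU driver that needs CPU access to a single guest page pins it, maps it, and releases it. my_copy_from_guest() is hypothetical; the sub-page offset handling follows the comment inside vfio_pin_pages() above.

static int my_copy_from_guest(struct vfio_device *vdev, dma_addr_t iova,
			      void *dst, size_t len)
{
	struct page *page;
	void *va;
	int ret;

	/* Keep the access within the single page being pinned */
	if (len > PAGE_SIZE - offset_in_page(iova))
		return -EINVAL;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	va = kmap_local_page(page);
	memcpy(dst, va + offset_in_page(iova), len);
	kunmap_local(va);

	vfio_unpin_pages(vdev, iova, 1);
	return 0;
}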
1651
1652/*
1653 * This interface allows the CPUs to perform some sort of virtual DMA on
1654 * behalf of the device.
1655 *
1656 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1657 * into/from a kernel buffer.
1658 *
1659 * As the read/write of user space memory is conducted via the CPUs and is
1660 * not a real device DMA, it is not necessary to pin the user space memory.
1661 *
1662 * @device [in]		: VFIO device
1663 * @iova [in]		: base IOVA of a user space buffer
1664 * @data [in]		: pointer to kernel buffer
1665 * @len [in]		: kernel buffer length
1666 * @write		: indicate read or write
1667 * Return error code on failure or 0 on success.
1668 */
1669int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1670		size_t len, bool write)
1671{
1672	if (!data || len <= 0 || !vfio_assert_device_open(device))
1673		return -EINVAL;
1674
1675	if (vfio_device_has_container(device))
1676		return vfio_device_container_dma_rw(device, iova,
1677						    data, len, write);
1678
1679	if (device->iommufd_access) {
1680		unsigned int flags = 0;
1681
1682		if (iova > ULONG_MAX)
1683			return -EINVAL;
1684
1685		/* VFIO historically tries to auto-detect a kthread */
1686		if (!current->mm)
1687			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1688		if (write)
1689			flags |= IOMMUFD_ACCESS_RW_WRITE;
1690		return iommufd_access_rw(device->iommufd_access, iova, data,
1691					 len, flags);
1692	}
1693	return -EINVAL;
1694}
1695EXPORT_SYMBOL(vfio_dma_rw);
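
A sketch (editorial addition) of the CPU-mediated access described above: read a value from guest IOVA space, modify it, and write it back without pinning anything. my_mark_desc_done() and the "done" bit are hypothetical; a real device would also respect the descriptor's endianness.

static int my_mark_desc_done(struct vfio_device *vdev, dma_addr_t desc_iova)
{
	u64 status;
	int ret;

	/* Read 8 bytes of guest memory at desc_iova into a kernel buffer */
	ret = vfio_dma_rw(vdev, desc_iova, &status, sizeof(status), false);
	if (ret)
		return ret;

	status |= 1;	/* hypothetical "done" flag */

	/* Write the updated value back to the same IOVA */
	return vfio_dma_rw(vdev, desc_iova, &status, sizeof(status), true);
}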
1696
1697/*
1698 * Module/class support
1699 */
1700static int __init vfio_init(void)
1701{
1702	int ret;
1703
1704	ida_init(&vfio.device_ida);
1705
1706	ret = vfio_group_init();
1707	if (ret)
1708		return ret;
1709
1710	ret = vfio_virqfd_init();
1711	if (ret)
1712		goto err_virqfd;
1713
1714	/* /sys/class/vfio-dev/vfioX */
1715	vfio.device_class = class_create("vfio-dev");
1716	if (IS_ERR(vfio.device_class)) {
1717		ret = PTR_ERR(vfio.device_class);
1718		goto err_dev_class;
1719	}
1720
1721	ret = vfio_cdev_init(vfio.device_class);
1722	if (ret)
1723		goto err_alloc_dev_chrdev;
1724
1725	vfio_debugfs_create_root();
1726	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1727	return 0;
1728
1729err_alloc_dev_chrdev:
1730	class_destroy(vfio.device_class);
1731	vfio.device_class = NULL;
1732err_dev_class:
1733	vfio_virqfd_exit();
1734err_virqfd:
1735	vfio_group_cleanup();
1736	return ret;
1737}
1738
1739static void __exit vfio_cleanup(void)
1740{
1741	vfio_debugfs_remove_root();
1742	ida_destroy(&vfio.device_ida);
1743	vfio_cdev_cleanup();
1744	class_destroy(vfio.device_class);
1745	vfio.device_class = NULL;
1746	vfio_virqfd_exit();
1747	vfio_group_cleanup();
1748	xa_destroy(&vfio_device_set_xa);
1749}
1750
1751module_init(vfio_init);
1752module_exit(vfio_cleanup);
1753
1754MODULE_IMPORT_NS("IOMMUFD");
1755MODULE_VERSION(DRIVER_VERSION);
1756MODULE_LICENSE("GPL v2");
1757MODULE_AUTHOR(DRIVER_AUTHOR);
1758MODULE_DESCRIPTION(DRIVER_DESC);
1759MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/fs.h>
  17#include <linux/idr.h>
  18#include <linux/iommu.h>
  19#ifdef CONFIG_HAVE_KVM
  20#include <linux/kvm_host.h>
  21#endif
  22#include <linux/list.h>
  23#include <linux/miscdevice.h>
  24#include <linux/module.h>
 
  25#include <linux/mutex.h>
  26#include <linux/pci.h>
 
  27#include <linux/rwsem.h>
  28#include <linux/sched.h>
  29#include <linux/slab.h>
  30#include <linux/stat.h>
  31#include <linux/string.h>
  32#include <linux/uaccess.h>
  33#include <linux/vfio.h>
  34#include <linux/wait.h>
  35#include <linux/sched/signal.h>
  36#include <linux/pm_runtime.h>
  37#include <linux/interval_tree.h>
  38#include <linux/iova_bitmap.h>
  39#include <linux/iommufd.h>
  40#include "vfio.h"
  41
  42#define DRIVER_VERSION	"0.3"
  43#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  44#define DRIVER_DESC	"VFIO - User Level meta-driver"
  45
 
 
  46static struct vfio {
  47	struct class			*device_class;
  48	struct ida			device_ida;
 
 
  49} vfio;
  50
  51#ifdef CONFIG_VFIO_NOIOMMU
  52bool vfio_noiommu __read_mostly;
  53module_param_named(enable_unsafe_noiommu_mode,
  54		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
  55MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  56#endif
  57
  58static DEFINE_XARRAY(vfio_device_set_xa);
  59
  60int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  61{
  62	unsigned long idx = (unsigned long)set_id;
  63	struct vfio_device_set *new_dev_set;
  64	struct vfio_device_set *dev_set;
  65
  66	if (WARN_ON(!set_id))
  67		return -EINVAL;
  68
  69	/*
  70	 * Atomically acquire a singleton object in the xarray for this set_id
  71	 */
  72	xa_lock(&vfio_device_set_xa);
  73	dev_set = xa_load(&vfio_device_set_xa, idx);
  74	if (dev_set)
  75		goto found_get_ref;
  76	xa_unlock(&vfio_device_set_xa);
  77
  78	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  79	if (!new_dev_set)
  80		return -ENOMEM;
  81	mutex_init(&new_dev_set->lock);
  82	INIT_LIST_HEAD(&new_dev_set->device_list);
  83	new_dev_set->set_id = set_id;
  84
  85	xa_lock(&vfio_device_set_xa);
  86	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  87			       GFP_KERNEL);
  88	if (!dev_set) {
  89		dev_set = new_dev_set;
  90		goto found_get_ref;
  91	}
  92
  93	kfree(new_dev_set);
  94	if (xa_is_err(dev_set)) {
  95		xa_unlock(&vfio_device_set_xa);
  96		return xa_err(dev_set);
  97	}
  98
  99found_get_ref:
 100	dev_set->device_count++;
 101	xa_unlock(&vfio_device_set_xa);
 102	mutex_lock(&dev_set->lock);
 103	device->dev_set = dev_set;
 104	list_add_tail(&device->dev_set_list, &dev_set->device_list);
 105	mutex_unlock(&dev_set->lock);
 106	return 0;
 107}
 108EXPORT_SYMBOL_GPL(vfio_assign_device_set);
 109
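/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical PCI-based
 * driver keying its device set on the reset scope, so that functions which
 * can only be reset together end up in the same vfio_device_set.
 * struct my_pci_vdev and my_assign_set() are made-up names.
 */
struct my_pci_vdev {
	struct vfio_device vdev;
};

static int my_assign_set(struct my_pci_vdev *mvdev, struct pci_dev *pdev)
{
	/* Functions sharing a slot (or bus) reset share one set_id. */
	if (pdev->slot)
		return vfio_assign_device_set(&mvdev->vdev, pdev->slot);
	return vfio_assign_device_set(&mvdev->vdev, pdev->bus);
}
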
 110static void vfio_release_device_set(struct vfio_device *device)
 111{
 112	struct vfio_device_set *dev_set = device->dev_set;
 113
 114	if (!dev_set)
 115		return;
 116
 117	mutex_lock(&dev_set->lock);
 118	list_del(&device->dev_set_list);
 119	mutex_unlock(&dev_set->lock);
 120
 121	xa_lock(&vfio_device_set_xa);
 122	if (!--dev_set->device_count) {
 123		__xa_erase(&vfio_device_set_xa,
 124			   (unsigned long)dev_set->set_id);
 125		mutex_destroy(&dev_set->lock);
 126		kfree(dev_set);
 127	}
 128	xa_unlock(&vfio_device_set_xa);
 129}
 130
 131unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
 132{
 133	struct vfio_device *cur;
 134	unsigned int open_count = 0;
 135
 136	lockdep_assert_held(&dev_set->lock);
 137
 138	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 139		open_count += cur->open_count;
 140	return open_count;
 141}
 142EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
 143
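/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical helper a
 * driver might use to decide whether a set-wide reset is safe.  It must run
 * under dev_set->lock, the same requirement the assertion above enforces
 * for vfio_device_set_open_count().  my_dev_set_idle() is a made-up name.
 */
static bool my_dev_set_idle(struct vfio_device_set *dev_set)
{
	lockdep_assert_held(&dev_set->lock);

	/* No open file descriptors anywhere in the set. */
	return vfio_device_set_open_count(dev_set) == 0;
}
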
 144struct vfio_device *
 145vfio_find_device_in_devset(struct vfio_device_set *dev_set,
 146			   struct device *dev)
 147{
 148	struct vfio_device *cur;
 149
 150	lockdep_assert_held(&dev_set->lock);
 151
 152	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 153		if (cur->dev == dev)
 154			return cur;
 155	return NULL;
 156}
 157EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
 158
 159/*
 160 * Device objects - create, release, get, put, search
 161 */
 162/* Device reference always implies a group reference */
 163void vfio_device_put_registration(struct vfio_device *device)
 164{
 165	if (refcount_dec_and_test(&device->refcount))
 166		complete(&device->comp);
 167}
 168
 169bool vfio_device_try_get_registration(struct vfio_device *device)
 170{
 171	return refcount_inc_not_zero(&device->refcount);
 172}
 173
 174/*
 175 * VFIO driver API
 176 */
 177/* Release helper called by vfio_put_device() */
 178static void vfio_device_release(struct device *dev)
 179{
 180	struct vfio_device *device =
 181			container_of(dev, struct vfio_device, device);
 182
 183	vfio_release_device_set(device);
 184	ida_free(&vfio.device_ida, device->index);
 185
 186	if (device->ops->release)
 187		device->ops->release(device);
 188
 189	kvfree(device);
 190}
 191
 192static int vfio_init_device(struct vfio_device *device, struct device *dev,
 193			    const struct vfio_device_ops *ops);
 194
 195/*
 196 * Allocate and initialize vfio_device so it can be registered to vfio
 197 * core.
 198 *
 199 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 200 * @size is the size of the structure to be allocated, including any
 201 * private data used by the driver.
 202 *
 203 * A driver may provide an @init callback to initialize device private data.
 204 *
 205 * Use vfio_put_device() to release the structure after a successful return.
 206 */
 207struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
 208				       const struct vfio_device_ops *ops)
 209{
 210	struct vfio_device *device;
 211	int ret;
 212
 213	if (WARN_ON(size < sizeof(struct vfio_device)))
 214		return ERR_PTR(-EINVAL);
 215
 216	device = kvzalloc(size, GFP_KERNEL);
 217	if (!device)
 218		return ERR_PTR(-ENOMEM);
 219
 220	ret = vfio_init_device(device, dev, ops);
 221	if (ret)
 222		goto out_free;
 223	return device;
 224
 225out_free:
 226	kvfree(device);
 227	return ERR_PTR(ret);
 228}
 229EXPORT_SYMBOL_GPL(_vfio_alloc_device);
 230
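/*
 * Illustrative sketch, not part of vfio_main.c: drivers normally call the
 * vfio_alloc_device() wrapper from <linux/vfio.h> rather than
 * _vfio_alloc_device() directly, so the embedding structure is sized and
 * type-checked for them.  struct my_vfio_dev, my_probe() and my_ops are
 * made-up names.
 */
struct my_vfio_dev {
	struct vfio_device vdev;
	void __iomem *regs;		/* driver-private state */
};

static int my_probe(struct device *dev, const struct vfio_device_ops *my_ops)
{
	struct my_vfio_dev *mdev;
	int ret;

	mdev = vfio_alloc_device(my_vfio_dev, vdev, dev, my_ops);
	if (IS_ERR(mdev))
		return PTR_ERR(mdev);

	ret = vfio_register_group_dev(&mdev->vdev);
	if (ret)
		vfio_put_device(&mdev->vdev);	/* balances the allocation */
	return ret;
}
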
 231/*
 232 * Initialize a vfio_device so it can be registered to vfio core.
 233 */
 234static int vfio_init_device(struct vfio_device *device, struct device *dev,
 235			    const struct vfio_device_ops *ops)
 236{
 237	int ret;
 238
 239	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 240	if (ret < 0) {
 241		dev_dbg(dev, "Failed to allocate device index\n");
 242		return ret;
 243	}
 244
 245	device->index = ret;
 246	init_completion(&device->comp);
 247	device->dev = dev;
 248	device->ops = ops;
 249
 250	if (ops->init) {
 251		ret = ops->init(device);
 252		if (ret)
 253			goto out_uninit;
 254	}
 255
 256	device_initialize(&device->device);
 257	device->device.release = vfio_device_release;
 258	device->device.class = vfio.device_class;
 259	device->device.parent = device->dev;
 260	return 0;
 261
 262out_uninit:
 263	vfio_release_device_set(device);
 264	ida_free(&vfio.device_ida, device->index);
 265	return ret;
 266}
 267
 268static int __vfio_register_dev(struct vfio_device *device,
 269			       enum vfio_group_type type)
 270{
 271	int ret;
 272
 273	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
 274		    (!device->ops->bind_iommufd ||
 275		     !device->ops->unbind_iommufd ||
 276		     !device->ops->attach_ioas ||
 277		     !device->ops->detach_ioas)))
 278		return -EINVAL;
 279
 280	/*
 281	 * If the driver doesn't specify a set then the device is added to a
 282	 * singleton set just for itself.
 283	 */
 284	if (!device->dev_set)
 285		vfio_assign_device_set(device, device);
 286
 287	ret = dev_set_name(&device->device, "vfio%d", device->index);
 288	if (ret)
 289		return ret;
 290
 291	ret = vfio_device_set_group(device, type);
 292	if (ret)
 293		return ret;
 294
 295	/*
 296	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
 297	 * restore cache coherency. It has to be checked here because it is only
 298	 * valid for cases where we are using iommu groups.
 299	 */
 300	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
 301	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
 302		ret = -EINVAL;
 303		goto err_out;
 304	}
 305
 306	ret = vfio_device_add(device);
 307	if (ret)
 308		goto err_out;
 309
 310	/* Refcounting can't start until the driver calls register */
 311	refcount_set(&device->refcount, 1);
 312
 313	vfio_device_group_register(device);
 314	vfio_device_debugfs_init(device);
 315
 316	return 0;
 317err_out:
 318	vfio_device_remove_group(device);
 319	return ret;
 320}
 321
 322int vfio_register_group_dev(struct vfio_device *device)
 323{
 324	return __vfio_register_dev(device, VFIO_IOMMU);
 325}
 326EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 327
 328/*
 329 * Register a virtual device without IOMMU backing.  The user of this
 330 * device must not be able to directly trigger unmediated DMA.
 331 */
 332int vfio_register_emulated_iommu_dev(struct vfio_device *device)
 333{
 334	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
 335}
 336EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
 337
 338/*
 339 * Decrement the device reference count and wait for the device to be
 340 * removed.  Open file descriptors for the device... */
 341void vfio_unregister_group_dev(struct vfio_device *device)
 342{
 343	unsigned int i = 0;
 344	bool interrupted = false;
 345	long rc;
 346
 347	/*
 348	 * Prevent the device from being newly opened by userspace via
 349	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
 350	 */
 351	vfio_device_group_unregister(device);
 352
 353	/*
 354	 * Balances vfio_device_add() in the register path, and also prevents
 355	 * the device from being newly opened by userspace in the cdev path.
 356	 */
 357	vfio_device_del(device);
 358
 359	vfio_device_put_registration(device);
 360	rc = try_wait_for_completion(&device->comp);
 361	while (rc <= 0) {
 362		if (device->ops->request)
 363			device->ops->request(device, i++);
 364
 365		if (interrupted) {
 366			rc = wait_for_completion_timeout(&device->comp,
 367							 HZ * 10);
 368		} else {
 369			rc = wait_for_completion_interruptible_timeout(
 370				&device->comp, HZ * 10);
 371			if (rc < 0) {
 372				interrupted = true;
 373				dev_warn(device->dev,
 374					 "Device is currently in use, task"
 375					 " \"%s\" (%d) "
 376					 "blocked until device is released",
 377					 current->comm, task_pid_nr(current));
 378			}
 379		}
 380	}
 381
 382	vfio_device_debugfs_exit(device);
 383	/* Balances vfio_device_set_group in register path */
 384	vfio_device_remove_group(device);
 385}
 386EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 387
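/*
 * Illustrative sketch, not part of vfio_main.c: the matching remove path for
 * the probe sketch above.  vfio_unregister_group_dev() blocks until every
 * open file descriptor is released, after which the driver drops its own
 * reference.  my_remove() and struct my_vfio_dev are made-up names.
 */
static void my_remove(struct my_vfio_dev *mdev)
{
	vfio_unregister_group_dev(&mdev->vdev);
	vfio_put_device(&mdev->vdev);
}
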
 388#ifdef CONFIG_HAVE_KVM
 389void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
 390{
 391	void (*pfn)(struct kvm *kvm);
 392	bool (*fn)(struct kvm *kvm);
 393	bool ret;
 394
 395	lockdep_assert_held(&device->dev_set->lock);
 396
 397	if (!kvm)
 398		return;
 399
 400	pfn = symbol_get(kvm_put_kvm);
 401	if (WARN_ON(!pfn))
 402		return;
 403
 404	fn = symbol_get(kvm_get_kvm_safe);
 405	if (WARN_ON(!fn)) {
 406		symbol_put(kvm_put_kvm);
 407		return;
 408	}
 409
 410	ret = fn(kvm);
 411	symbol_put(kvm_get_kvm_safe);
 412	if (!ret) {
 413		symbol_put(kvm_put_kvm);
 414		return;
 415	}
 416
 417	device->put_kvm = pfn;
 418	device->kvm = kvm;
 419}
 420
 421void vfio_device_put_kvm(struct vfio_device *device)
 422{
 423	lockdep_assert_held(&device->dev_set->lock);
 424
 425	if (!device->kvm)
 426		return;
 427
 428	if (WARN_ON(!device->put_kvm))
 429		goto clear;
 430
 431	device->put_kvm(device->kvm);
 432	device->put_kvm = NULL;
 433	symbol_put(kvm_put_kvm);
 434
 435clear:
 436	device->kvm = NULL;
 437}
 438#endif
 439
 440/* true if the vfio_device has open_device() called but not close_device() */
 441static bool vfio_assert_device_open(struct vfio_device *device)
 442{
 443	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 444}
 445
 446struct vfio_device_file *
 447vfio_allocate_device_file(struct vfio_device *device)
 448{
 449	struct vfio_device_file *df;
 450
 451	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
 452	if (!df)
 453		return ERR_PTR(-ENOMEM);
 454
 455	df->device = device;
 456	spin_lock_init(&df->kvm_ref_lock);
 457
 458	return df;
 459}
 460
 461static int vfio_df_device_first_open(struct vfio_device_file *df)
 462{
 463	struct vfio_device *device = df->device;
 464	struct iommufd_ctx *iommufd = df->iommufd;
 465	int ret;
 466
 467	lockdep_assert_held(&device->dev_set->lock);
 468
 469	if (!try_module_get(device->dev->driver->owner))
 470		return -ENODEV;
 471
 472	if (iommufd)
 473		ret = vfio_df_iommufd_bind(df);
 474	else
 475		ret = vfio_device_group_use_iommu(device);
 476	if (ret)
 477		goto err_module_put;
 478
 479	if (device->ops->open_device) {
 480		ret = device->ops->open_device(device);
 481		if (ret)
 482			goto err_unuse_iommu;
 483	}
 484	return 0;
 485
 486err_unuse_iommu:
 487	if (iommufd)
 488		vfio_df_iommufd_unbind(df);
 489	else
 490		vfio_device_group_unuse_iommu(device);
 491err_module_put:
 492	module_put(device->dev->driver->owner);
 493	return ret;
 494}
 495
 496static void vfio_df_device_last_close(struct vfio_device_file *df)
 497{
 498	struct vfio_device *device = df->device;
 499	struct iommufd_ctx *iommufd = df->iommufd;
 500
 501	lockdep_assert_held(&device->dev_set->lock);
 502
 503	if (device->ops->close_device)
 504		device->ops->close_device(device);
 505	if (iommufd)
 506		vfio_df_iommufd_unbind(df);
 507	else
 508		vfio_device_group_unuse_iommu(device);
 509	module_put(device->dev->driver->owner);
 510}
 511
 512int vfio_df_open(struct vfio_device_file *df)
 513{
 514	struct vfio_device *device = df->device;
 515	int ret = 0;
 516
 517	lockdep_assert_held(&device->dev_set->lock);
 518
 519	/*
 520	 * Only the group path allows the device to be opened multiple
 521	 * times.  The device cdev path doesn't have a secure way to support it.
 522	 */
 523	if (device->open_count != 0 && !df->group)
 524		return -EINVAL;
 525
 526	device->open_count++;
 527	if (device->open_count == 1) {
 528		ret = vfio_df_device_first_open(df);
 529		if (ret)
 530			device->open_count--;
 531	}
 532
 533	return ret;
 534}
 535
 536void vfio_df_close(struct vfio_device_file *df)
 537{
 538	struct vfio_device *device = df->device;
 539
 540	lockdep_assert_held(&device->dev_set->lock);
 541
 542	vfio_assert_device_open(device);
 543	if (device->open_count == 1)
 544		vfio_df_device_last_close(df);
 545	device->open_count--;
 546}
 547
 548/*
 549 * Wrapper around pm_runtime_resume_and_get().
 550 * Return error code on failure or 0 on success.
 551 */
 552static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
 553{
 554	struct device *dev = device->dev;
 555
 556	if (dev->driver && dev->driver->pm) {
 557		int ret;
 558
 559		ret = pm_runtime_resume_and_get(dev);
 560		if (ret) {
 561			dev_info_ratelimited(dev,
 562				"vfio: runtime resume failed %d\n", ret);
 563			return -EIO;
 564		}
 565	}
 566
 567	return 0;
 568}
 569
 570/*
 571 * Wrapper around pm_runtime_put().
 572 */
 573static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 574{
 575	struct device *dev = device->dev;
 576
 577	if (dev->driver && dev->driver->pm)
 578		pm_runtime_put(dev);
 579}
 580
 581/*
 582 * VFIO Device fd
 583 */
 584static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 585{
 586	struct vfio_device_file *df = filep->private_data;
 587	struct vfio_device *device = df->device;
 588
 589	if (df->group)
 590		vfio_df_group_close(df);
 591	else
 592		vfio_df_unbind_iommufd(df);
 593
 594	vfio_device_put_registration(device);
 595
 596	kfree(df);
 597
 598	return 0;
 599}
 600
 601/*
 602 * vfio_mig_get_next_state - Compute the next step in the FSM
 603 * @cur_fsm - The current state the device is in
 604 * @new_fsm - The target state to reach
 605 * @next_fsm - Pointer to the next step to get to new_fsm
 606 *
 607 * Return 0 upon success, otherwise -errno
 608 * Upon success the next step in the state progression between cur_fsm and
 609 * new_fsm will be set in next_fsm.
 610 *
 611 * This breaks down requests for combination transitions into smaller steps and
 612 * returns the next step to get to new_fsm. The function may need to be called
 613 * multiple times before reaching new_fsm.
 614 *
 615 */
 616int vfio_mig_get_next_state(struct vfio_device *device,
 617			    enum vfio_device_mig_state cur_fsm,
 618			    enum vfio_device_mig_state new_fsm,
 619			    enum vfio_device_mig_state *next_fsm)
 620{
 621	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
 622	/*
 623	 * The coding in this table requires the driver to implement the
 624	 * following FSM arcs:
 625	 *         RESUMING -> STOP
 626	 *         STOP -> RESUMING
 627	 *         STOP -> STOP_COPY
 628	 *         STOP_COPY -> STOP
 629	 *
 630	 * If P2P is supported then the driver must also implement these FSM
 631	 * arcs:
 632	 *         RUNNING -> RUNNING_P2P
 633	 *         RUNNING_P2P -> RUNNING
 634	 *         RUNNING_P2P -> STOP
 635	 *         STOP -> RUNNING_P2P
 636	 *
 637	 * If precopy is supported then the driver must support these additional
 638	 * FSM arcs:
 639	 *         RUNNING -> PRE_COPY
 640	 *         PRE_COPY -> RUNNING
 641	 *         PRE_COPY -> STOP_COPY
 642	 * However, if precopy and P2P are supported together then the driver
 643	 * must support these additional arcs beyond the P2P arcs above:
 644	 *         PRE_COPY -> RUNNING
 645	 *         PRE_COPY -> PRE_COPY_P2P
 646	 *         PRE_COPY_P2P -> PRE_COPY
 647	 *         PRE_COPY_P2P -> RUNNING_P2P
 648	 *         PRE_COPY_P2P -> STOP_COPY
 649	 *         RUNNING -> PRE_COPY
 650	 *         RUNNING_P2P -> PRE_COPY_P2P
 651	 *
 652	 * Without P2P and precopy the driver must implement:
 653	 *         RUNNING -> STOP
 654	 *         STOP -> RUNNING
 655	 *
 656	 * The coding will step through multiple states for some combination
 657	 * transitions; if all optional features are supported, this means the
 658	 * following ones:
 659	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
 660	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
 661	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
 662	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 663	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
 664	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
 665	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
 666	 *         RESUMING -> STOP -> RUNNING_P2P
 667	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
 668	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
 669	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 670	 *         RESUMING -> STOP -> STOP_COPY
 671	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
 672	 *         RUNNING -> RUNNING_P2P -> STOP
 673	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 674	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
 675	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
 676	 *         RUNNING_P2P -> STOP -> RESUMING
 677	 *         RUNNING_P2P -> STOP -> STOP_COPY
 678	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
 679	 *         STOP -> RUNNING_P2P -> RUNNING
 680	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 681	 *         STOP_COPY -> STOP -> RESUMING
 682	 *         STOP_COPY -> STOP -> RUNNING_P2P
 683	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
 684	 *
 685	 *  The following transitions are blocked:
 686	 *         STOP_COPY -> PRE_COPY
 687	 *         STOP_COPY -> PRE_COPY_P2P
 688	 */
 689	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
 690		[VFIO_DEVICE_STATE_STOP] = {
 691			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 692			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 693			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 694			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 695			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 696			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 697			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 698			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 699		},
 700		[VFIO_DEVICE_STATE_RUNNING] = {
 701			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 702			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 703			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 704			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 705			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 706			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 707			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 708			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 709		},
 710		[VFIO_DEVICE_STATE_PRE_COPY] = {
 711			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
 712			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 713			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 714			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 715			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 716			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
 717			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
 718			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 719		},
 720		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
 721			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 722			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 723			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 724			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 725			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 726			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 727			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 728			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 729		},
 730		[VFIO_DEVICE_STATE_STOP_COPY] = {
 731			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 732			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 733			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 734			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 735			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 736			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 737			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 738			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 739		},
 740		[VFIO_DEVICE_STATE_RESUMING] = {
 741			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 742			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 743			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
 744			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
 745			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 746			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 747			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 748			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 749		},
 750		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
 751			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 752			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 753			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
 754			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 755			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 756			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 757			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 758			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 759		},
 760		[VFIO_DEVICE_STATE_ERROR] = {
 761			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
 762			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
 763			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 764			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 765			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
 766			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
 767			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
 768			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 769		},
 770	};
 771
 772	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
 773		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
 774		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
 775		[VFIO_DEVICE_STATE_PRE_COPY] =
 776			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
 777		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
 778						   VFIO_MIGRATION_P2P |
 779						   VFIO_MIGRATION_PRE_COPY,
 780		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
 781		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
 782		[VFIO_DEVICE_STATE_RUNNING_P2P] =
 783			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
 784		[VFIO_DEVICE_STATE_ERROR] = ~0U,
 785	};
 786
 787	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 788		    (state_flags_table[cur_fsm] & device->migration_flags) !=
 789			state_flags_table[cur_fsm]))
 790		return -EINVAL;
 791
 792	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 793	   (state_flags_table[new_fsm] & device->migration_flags) !=
 794			state_flags_table[new_fsm])
 795		return -EINVAL;
 796
 797	/*
 798	 * Arcs touching optional and unsupported states are skipped over. The
 799	 * driver will instead see an arc from the original state to the next
 800	 * logical state, as per the above comment.
 801	 */
 802	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
 803	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
 804			state_flags_table[*next_fsm])
 805		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
 806
 807	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
 808}
 809EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
 810
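/*
 * Illustrative sketch, not part of vfio_main.c: how a migration driver might
 * step through the FSM one arc at a time from its .migration_set_state
 * callback.  Real drivers also hand back a data file for STOP_COPY/RESUMING;
 * that is omitted here.  struct my_mig_dev, its mig_state field and
 * my_do_one_arc() are made-up names.
 */
struct my_mig_dev {
	struct vfio_device vdev;
	enum vfio_device_mig_state mig_state;	/* driver-tracked state */
};

static int my_do_one_arc(struct my_mig_dev *mdev,
			 enum vfio_device_mig_state next);

static int my_step_to_state(struct my_mig_dev *mdev,
			    enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next_state;
	int ret;

	while (mdev->mig_state != new_state) {
		ret = vfio_mig_get_next_state(&mdev->vdev, mdev->mig_state,
					      new_state, &next_state);
		if (ret)
			return ret;

		ret = my_do_one_arc(mdev, next_state);
		if (ret)
			return ret;

		mdev->mig_state = next_state;
	}
	return 0;
}
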
 811/*
 812 * Convert the driver's struct file into an FD number and return it to userspace
 813 */
 814static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
 815				   struct vfio_device_feature_mig_state *mig)
 816{
 817	int ret;
 818	int fd;
 819
 820	fd = get_unused_fd_flags(O_CLOEXEC);
 821	if (fd < 0) {
 822		ret = fd;
 823		goto out_fput;
 824	}
 825
 826	mig->data_fd = fd;
 827	if (copy_to_user(arg, mig, sizeof(*mig))) {
 828		ret = -EFAULT;
 829		goto out_put_unused;
 830	}
 831	fd_install(fd, filp);
 832	return 0;
 833
 834out_put_unused:
 835	put_unused_fd(fd);
 836out_fput:
 837	fput(filp);
 838	return ret;
 839}
 840
 841static int
 842vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 843					   u32 flags, void __user *arg,
 844					   size_t argsz)
 845{
 846	size_t minsz =
 847		offsetofend(struct vfio_device_feature_mig_state, data_fd);
 848	struct vfio_device_feature_mig_state mig;
 849	struct file *filp = NULL;
 850	int ret;
 851
 852	if (!device->mig_ops)
 853		return -ENOTTY;
 854
 855	ret = vfio_check_feature(flags, argsz,
 856				 VFIO_DEVICE_FEATURE_SET |
 857				 VFIO_DEVICE_FEATURE_GET,
 858				 sizeof(mig));
 859	if (ret != 1)
 860		return ret;
 861
 862	if (copy_from_user(&mig, arg, minsz))
 863		return -EFAULT;
 864
 865	if (flags & VFIO_DEVICE_FEATURE_GET) {
 866		enum vfio_device_mig_state curr_state;
 867
 868		ret = device->mig_ops->migration_get_state(device,
 869							   &curr_state);
 870		if (ret)
 871			return ret;
 872		mig.device_state = curr_state;
 873		goto out_copy;
 874	}
 875
 876	/* Handle the VFIO_DEVICE_FEATURE_SET */
 877	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 878	if (IS_ERR(filp) || !filp)
 879		goto out_copy;
 880
 881	return vfio_ioct_mig_return_fd(filp, arg, &mig);
 882out_copy:
 883	mig.data_fd = -1;
 884	if (copy_to_user(arg, &mig, sizeof(mig)))
 885		return -EFAULT;
 886	if (IS_ERR(filp))
 887		return PTR_ERR(filp);
 888	return 0;
 889}
 890
 891static int
 892vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
 893					      u32 flags, void __user *arg,
 894					      size_t argsz)
 895{
 896	struct vfio_device_feature_mig_data_size data_size = {};
 897	unsigned long stop_copy_length;
 898	int ret;
 899
 900	if (!device->mig_ops)
 901		return -ENOTTY;
 902
 903	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 904				 sizeof(data_size));
 905	if (ret != 1)
 906		return ret;
 907
 908	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
 909	if (ret)
 910		return ret;
 911
 912	data_size.stop_copy_length = stop_copy_length;
 913	if (copy_to_user(arg, &data_size, sizeof(data_size)))
 914		return -EFAULT;
 915
 916	return 0;
 917}
 918
 919static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 920					       u32 flags, void __user *arg,
 921					       size_t argsz)
 922{
 923	struct vfio_device_feature_migration mig = {
 924		.flags = device->migration_flags,
 925	};
 926	int ret;
 927
 928	if (!device->mig_ops)
 929		return -ENOTTY;
 930
 931	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 932				 sizeof(mig));
 933	if (ret != 1)
 934		return ret;
 935	if (copy_to_user(arg, &mig, sizeof(mig)))
 936		return -EFAULT;
 937	return 0;
 938}
 939
 940void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 941			      u32 req_nodes)
 942{
 943	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
 944	unsigned long min_gap, curr_gap;
 945
 946	/* Special shortcut when a single range is required */
 947	if (req_nodes == 1) {
 948		unsigned long last;
 949
 950		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 951
 952		/* Empty list */
 953		if (WARN_ON_ONCE(!comb_start))
 954			return;
 955
 956		curr = comb_start;
 957		while (curr) {
 958			last = curr->last;
 959			prev = curr;
 960			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
 961			if (prev != comb_start)
 962				interval_tree_remove(prev, root);
 963		}
 964		comb_start->last = last;
 965		return;
 966	}
 967
 968	/* Combine ranges which have the smallest gap */
 969	while (cur_nodes > req_nodes) {
 970		prev = NULL;
 971		min_gap = ULONG_MAX;
 972		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
 973		while (curr) {
 974			if (prev) {
 975				curr_gap = curr->start - prev->last;
 976				if (curr_gap < min_gap) {
 977					min_gap = curr_gap;
 978					comb_start = prev;
 979					comb_end = curr;
 980				}
 981			}
 982			prev = curr;
 983			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
 984		}
 985
 986		/* Empty list or no nodes to combine */
 987		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
 988			break;
 989
 990		comb_start->last = comb_end->last;
 991		interval_tree_remove(comb_end, root);
 992		cur_nodes--;
 993	}
 994}
 995EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
 996
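/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical .log_start
 * callback that merges the user-supplied ranges down to what the device can
 * track before programming it.  MY_HW_MAX_RANGES and my_hw_program_ranges()
 * are made-up names.
 */
#define MY_HW_MAX_RANGES 8

static int my_hw_program_ranges(struct vfio_device *vdev,
				struct rb_root_cached *ranges,
				u32 nnodes, u64 *page_size);

static int my_log_start(struct vfio_device *vdev,
			struct rb_root_cached *ranges,
			u32 nnodes, u64 *page_size)
{
	if (nnodes > MY_HW_MAX_RANGES) {
		vfio_combine_iova_ranges(ranges, nnodes, MY_HW_MAX_RANGES);
		nnodes = MY_HW_MAX_RANGES;
	}

	return my_hw_program_ranges(vdev, ranges, nnodes, page_size);
}
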
 997/* Ranges should fit into a single kernel page */
 998#define LOG_MAX_RANGES \
 999	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1000
1001static int
1002vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1003					u32 flags, void __user *arg,
1004					size_t argsz)
1005{
1006	size_t minsz =
1007		offsetofend(struct vfio_device_feature_dma_logging_control,
1008			    ranges);
1009	struct vfio_device_feature_dma_logging_range __user *ranges;
1010	struct vfio_device_feature_dma_logging_control control;
1011	struct vfio_device_feature_dma_logging_range range;
1012	struct rb_root_cached root = RB_ROOT_CACHED;
1013	struct interval_tree_node *nodes;
1014	u64 iova_end;
1015	u32 nnodes;
1016	int i, ret;
1017
1018	if (!device->log_ops)
1019		return -ENOTTY;
1020
1021	ret = vfio_check_feature(flags, argsz,
1022				 VFIO_DEVICE_FEATURE_SET,
1023				 sizeof(control));
1024	if (ret != 1)
1025		return ret;
1026
1027	if (copy_from_user(&control, arg, minsz))
1028		return -EFAULT;
1029
1030	nnodes = control.num_ranges;
1031	if (!nnodes)
1032		return -EINVAL;
1033
1034	if (nnodes > LOG_MAX_RANGES)
1035		return -E2BIG;
1036
1037	ranges = u64_to_user_ptr(control.ranges);
1038	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1039			      GFP_KERNEL);
1040	if (!nodes)
1041		return -ENOMEM;
1042
1043	for (i = 0; i < nnodes; i++) {
1044		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1045			ret = -EFAULT;
1046			goto end;
1047		}
1048		if (!IS_ALIGNED(range.iova, control.page_size) ||
1049		    !IS_ALIGNED(range.length, control.page_size)) {
1050			ret = -EINVAL;
1051			goto end;
1052		}
1053
1054		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1055		    iova_end > ULONG_MAX) {
1056			ret = -EOVERFLOW;
1057			goto end;
1058		}
1059
1060		nodes[i].start = range.iova;
1061		nodes[i].last = range.iova + range.length - 1;
1062		if (interval_tree_iter_first(&root, nodes[i].start,
1063					     nodes[i].last)) {
1064			/* Range overlapping */
1065			ret = -EINVAL;
1066			goto end;
1067		}
1068		interval_tree_insert(nodes + i, &root);
1069	}
1070
1071	ret = device->log_ops->log_start(device, &root, nnodes,
1072					 &control.page_size);
1073	if (ret)
1074		goto end;
1075
1076	if (copy_to_user(arg, &control, sizeof(control))) {
1077		ret = -EFAULT;
1078		device->log_ops->log_stop(device);
1079	}
1080
1081end:
1082	kfree(nodes);
1083	return ret;
1084}
1085
1086static int
1087vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088				       u32 flags, void __user *arg,
1089				       size_t argsz)
1090{
1091	int ret;
1092
1093	if (!device->log_ops)
1094		return -ENOTTY;
1095
1096	ret = vfio_check_feature(flags, argsz,
1097				 VFIO_DEVICE_FEATURE_SET, 0);
1098	if (ret != 1)
1099		return ret;
1100
1101	return device->log_ops->log_stop(device);
1102}
1103
1104static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105					  unsigned long iova, size_t length,
1106					  void *opaque)
1107{
1108	struct vfio_device *device = opaque;
1109
1110	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1111}
1112
1113static int
1114vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1115					 u32 flags, void __user *arg,
1116					 size_t argsz)
1117{
1118	size_t minsz =
1119		offsetofend(struct vfio_device_feature_dma_logging_report,
1120			    bitmap);
1121	struct vfio_device_feature_dma_logging_report report;
1122	struct iova_bitmap *iter;
1123	u64 iova_end;
1124	int ret;
1125
1126	if (!device->log_ops)
1127		return -ENOTTY;
1128
1129	ret = vfio_check_feature(flags, argsz,
1130				 VFIO_DEVICE_FEATURE_GET,
1131				 sizeof(report));
1132	if (ret != 1)
1133		return ret;
1134
1135	if (copy_from_user(&report, arg, minsz))
1136		return -EFAULT;
1137
1138	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1139		return -EINVAL;
1140
1141	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1142	    iova_end > ULONG_MAX)
1143		return -EOVERFLOW;
1144
1145	iter = iova_bitmap_alloc(report.iova, report.length,
1146				 report.page_size,
1147				 u64_to_user_ptr(report.bitmap));
1148	if (IS_ERR(iter))
1149		return PTR_ERR(iter);
1150
1151	ret = iova_bitmap_for_each(iter, device,
1152				   vfio_device_log_read_and_clear);
1153
1154	iova_bitmap_free(iter);
1155	return ret;
1156}
1157
1158static int vfio_ioctl_device_feature(struct vfio_device *device,
1159				     struct vfio_device_feature __user *arg)
1160{
1161	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1162	struct vfio_device_feature feature;
1163
1164	if (copy_from_user(&feature, arg, minsz))
1165		return -EFAULT;
1166
1167	if (feature.argsz < minsz)
1168		return -EINVAL;
1169
1170	/* Check unknown flags */
1171	if (feature.flags &
1172	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1173	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1174		return -EINVAL;
1175
1176	/* GET & SET are mutually exclusive except with PROBE */
1177	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1178	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1179	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1180		return -EINVAL;
1181
1182	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1183	case VFIO_DEVICE_FEATURE_MIGRATION:
1184		return vfio_ioctl_device_feature_migration(
1185			device, feature.flags, arg->data,
1186			feature.argsz - minsz);
1187	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1188		return vfio_ioctl_device_feature_mig_device_state(
1189			device, feature.flags, arg->data,
1190			feature.argsz - minsz);
1191	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1192		return vfio_ioctl_device_feature_logging_start(
1193			device, feature.flags, arg->data,
1194			feature.argsz - minsz);
1195	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1196		return vfio_ioctl_device_feature_logging_stop(
1197			device, feature.flags, arg->data,
1198			feature.argsz - minsz);
1199	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1200		return vfio_ioctl_device_feature_logging_report(
1201			device, feature.flags, arg->data,
1202			feature.argsz - minsz);
1203	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1204		return vfio_ioctl_device_feature_migration_data_size(
1205			device, feature.flags, arg->data,
1206			feature.argsz - minsz);
1207	default:
1208		if (unlikely(!device->ops->device_feature))
1209			return -EINVAL;
1210		return device->ops->device_feature(device, feature.flags,
1211						   arg->data,
1212						   feature.argsz - minsz);
1213	}
1214}
1215
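/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical
 * .device_feature callback for the default branch above, handling one
 * vendor feature and rejecting everything else.  MY_VENDOR_FEATURE, the u8
 * payload and my_read_feature() are made up; vfio_check_feature() is the
 * real helper the core handlers above also use.
 */
#define MY_VENDOR_FEATURE 42		/* made-up feature ID */

static u8 my_read_feature(struct vfio_device *vdev);

static int my_device_feature(struct vfio_device *vdev, u32 flags,
			     void __user *arg, size_t argsz)
{
	u8 value;
	int ret;

	if ((flags & VFIO_DEVICE_FEATURE_MASK) != MY_VENDOR_FEATURE)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(value));
	if (ret != 1)
		return ret;

	value = my_read_feature(vdev);
	if (copy_to_user(arg, &value, sizeof(value)))
		return -EFAULT;
	return 0;
}
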
1216static long vfio_device_fops_unl_ioctl(struct file *filep,
1217				       unsigned int cmd, unsigned long arg)
1218{
1219	struct vfio_device_file *df = filep->private_data;
1220	struct vfio_device *device = df->device;
1221	void __user *uptr = (void __user *)arg;
1222	int ret;
1223
1224	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1225		return vfio_df_ioctl_bind_iommufd(df, uptr);
1226
1227	/* Paired with smp_store_release() following vfio_df_open() */
1228	if (!smp_load_acquire(&df->access_granted))
1229		return -EINVAL;
1230
1231	ret = vfio_device_pm_runtime_get(device);
1232	if (ret)
1233		return ret;
1234
1235	/* cdev only ioctls */
1236	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1237		switch (cmd) {
1238		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1239			ret = vfio_df_ioctl_attach_pt(df, uptr);
1240			goto out;
1241
1242		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1243			ret = vfio_df_ioctl_detach_pt(df, uptr);
1244			goto out;
1245		}
1246	}
1247
1248	switch (cmd) {
1249	case VFIO_DEVICE_FEATURE:
1250		ret = vfio_ioctl_device_feature(device, uptr);
1251		break;
1252
1253	default:
1254		if (unlikely(!device->ops->ioctl))
1255			ret = -EINVAL;
1256		else
1257			ret = device->ops->ioctl(device, cmd, arg);
1258		break;
1259	}
1260out:
1261	vfio_device_pm_runtime_put(device);
1262	return ret;
1263}
1264
1265static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266				     size_t count, loff_t *ppos)
1267{
1268	struct vfio_device_file *df = filep->private_data;
1269	struct vfio_device *device = df->device;
1270
1271	/* Paired with smp_store_release() following vfio_df_open() */
1272	if (!smp_load_acquire(&df->access_granted))
1273		return -EINVAL;
1274
1275	if (unlikely(!device->ops->read))
1276		return -EINVAL;
1277
1278	return device->ops->read(device, buf, count, ppos);
1279}
1280
1281static ssize_t vfio_device_fops_write(struct file *filep,
1282				      const char __user *buf,
1283				      size_t count, loff_t *ppos)
1284{
1285	struct vfio_device_file *df = filep->private_data;
1286	struct vfio_device *device = df->device;
1287
1288	/* Paired with smp_store_release() following vfio_df_open() */
1289	if (!smp_load_acquire(&df->access_granted))
1290		return -EINVAL;
1291
1292	if (unlikely(!device->ops->write))
1293		return -EINVAL;
1294
1295	return device->ops->write(device, buf, count, ppos);
1296}
1297
1298static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1299{
1300	struct vfio_device_file *df = filep->private_data;
1301	struct vfio_device *device = df->device;
1302
1303	/* Paired with smp_store_release() following vfio_df_open() */
1304	if (!smp_load_acquire(&df->access_granted))
1305		return -EINVAL;
1306
1307	if (unlikely(!device->ops->mmap))
1308		return -EINVAL;
1309
1310	return device->ops->mmap(device, vma);
1311}
1312
1313const struct file_operations vfio_device_fops = {
1314	.owner		= THIS_MODULE,
1315	.open		= vfio_device_fops_cdev_open,
1316	.release	= vfio_device_fops_release,
1317	.read		= vfio_device_fops_read,
1318	.write		= vfio_device_fops_write,
1319	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1320	.compat_ioctl	= compat_ptr_ioctl,
1321	.mmap		= vfio_device_fops_mmap,
1322};
1323
1324static struct vfio_device *vfio_device_from_file(struct file *file)
1325{
1326	struct vfio_device_file *df = file->private_data;
1327
1328	if (file->f_op != &vfio_device_fops)
1329		return NULL;
1330	return df->device;
1331}
1332
1333/**
1334 * vfio_file_is_valid - True if the file is valid vfio file
1335 * @file: VFIO group file or VFIO device file
1336 */
1337bool vfio_file_is_valid(struct file *file)
1338{
1339	return vfio_group_from_file(file) ||
1340	       vfio_device_from_file(file);
1341}
1342EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1343
1344/**
1345 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346 *        is always CPU cache coherent
1347 * @file: VFIO group file or VFIO device file
1348 *
1349 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350 * bit in DMA transactions. A return of false indicates that the user has
1351 * rights to access additional instructions such as wbinvd on x86.
1352 */
1353bool vfio_file_enforced_coherent(struct file *file)
1354{
1355	struct vfio_device *device;
1356	struct vfio_group *group;
1357
1358	group = vfio_group_from_file(file);
1359	if (group)
1360		return vfio_group_enforced_coherent(group);
1361
1362	device = vfio_device_from_file(file);
1363	if (device)
1364		return device_iommu_capable(device->dev,
1365					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1366
1367	return true;
1368}
1369EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1370
1371static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1372{
1373	struct vfio_device_file *df = file->private_data;
1374
1375	/*
1376	 * The kvm is first recorded in the vfio_device_file, and will
1377	 * be propagated to vfio_device::kvm when the file is bound to
1378	 * iommufd successfully in the vfio device cdev path.
1379	 */
1380	spin_lock(&df->kvm_ref_lock);
1381	df->kvm = kvm;
1382	spin_unlock(&df->kvm_ref_lock);
1383}
1384
1385/**
1386 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387 * @file: VFIO group file or VFIO device file
1388 * @kvm: KVM to link
1389 *
1390 * When a VFIO device is first opened the KVM will be available in
1391 * device->kvm if one was associated with the file.
1392 */
1393void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1394{
1395	struct vfio_group *group;
1396
1397	group = vfio_group_from_file(file);
1398	if (group)
1399		vfio_group_set_kvm(group, kvm);
1400
1401	if (vfio_device_from_file(file))
1402		vfio_device_file_set_kvm(file, kvm);
1403}
1404EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1405
1406/*
1407 * Sub-module support
1408 */
1409/*
1410 * Helper for managing a buffer of info chain capabilities, allocate or
1411 * reallocate a buffer with additional @size, filling in @id and @version
1412 * of the capability.  A pointer to the new capability is returned.
1413 *
1414 * NB. The chain is based at the head of the buffer, so new entries are
1415 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1416 * next offsets prior to copying to the user buffer.
1417 */
1418struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1419					       size_t size, u16 id, u16 version)
1420{
1421	void *buf;
1422	struct vfio_info_cap_header *header, *tmp;
1423
1424	/* Ensure that the next capability struct will be aligned */
1425	size = ALIGN(size, sizeof(u64));
1426
1427	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1428	if (!buf) {
1429		kfree(caps->buf);
1430		caps->buf = NULL;
1431		caps->size = 0;
1432		return ERR_PTR(-ENOMEM);
1433	}
1434
1435	caps->buf = buf;
1436	header = buf + caps->size;
1437
1438	/* Eventually copied to user buffer, zero */
1439	memset(header, 0, size);
1440
1441	header->id = id;
1442	header->version = version;
1443
1444	/* Add to the end of the capability chain */
1445	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1446		; /* nothing */
1447
1448	tmp->next = caps->size;
1449	caps->size += size;
1450
1451	return header;
1452}
1453EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1454
1455void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1456{
1457	struct vfio_info_cap_header *tmp;
1458	void *buf = (void *)caps->buf;
1459
1460	/* Capability structs should start with proper alignment */
1461	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1462
1463	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1464		tmp->next += offset;
1465}
1466EXPORT_SYMBOL(vfio_info_cap_shift);
1467
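/*
 * Illustrative sketch, not part of vfio_main.c: the usual pattern for the
 * two helpers above in a REGION_INFO-style ioctl.  The chain is built
 * relative to the temporary buffer and then shifted by the size of the
 * fixed struct before being copied out behind it.  my_fill_region_caps()
 * and the type/subtype values are made up.
 */
static int my_fill_region_caps(struct vfio_region_info *info,
			       void __user *uarg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_type cap_type = {
		.header.id = VFIO_REGION_INFO_CAP_TYPE,
		.header.version = 1,
		.type = 0x1,
		.subtype = 0x1,
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap_type.header,
				       sizeof(cap_type));
	if (ret)
		return ret;

	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	if (info->argsz >= sizeof(*info) + caps.size) {
		info->cap_offset = sizeof(*info);
		vfio_info_cap_shift(&caps, sizeof(*info));
		if (copy_to_user(uarg + sizeof(*info), caps.buf, caps.size))
			ret = -EFAULT;
	}

	kfree(caps.buf);
	return ret;
}
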
1468int vfio_info_add_capability(struct vfio_info_cap *caps,
1469			     struct vfio_info_cap_header *cap, size_t size)
1470{
1471	struct vfio_info_cap_header *header;
1472
1473	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1474	if (IS_ERR(header))
1475		return PTR_ERR(header);
1476
1477	memcpy(header + 1, cap + 1, size - sizeof(*header));
1478
1479	return 0;
1480}
1481EXPORT_SYMBOL(vfio_info_add_capability);
1482
1483int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1484				       int max_irq_type, size_t *data_size)
1485{
1486	unsigned long minsz;
1487	size_t size;
1488
1489	minsz = offsetofend(struct vfio_irq_set, count);
1490
1491	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1492	    (hdr->count >= (U32_MAX - hdr->start)) ||
1493	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1494				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1495		return -EINVAL;
1496
1497	if (data_size)
1498		*data_size = 0;
1499
1500	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1501		return -EINVAL;
1502
1503	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1504	case VFIO_IRQ_SET_DATA_NONE:
1505		size = 0;
1506		break;
1507	case VFIO_IRQ_SET_DATA_BOOL:
1508		size = sizeof(uint8_t);
1509		break;
1510	case VFIO_IRQ_SET_DATA_EVENTFD:
1511		size = sizeof(int32_t);
1512		break;
1513	default:
1514		return -EINVAL;
1515	}
1516
1517	if (size) {
1518		if (hdr->argsz - minsz < hdr->count * size)
1519			return -EINVAL;
1520
1521		if (!data_size)
1522			return -EINVAL;
1523
1524		*data_size = hdr->count * size;
1525	}
1526
1527	return 0;
1528}
1529EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1530
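/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical
 * VFIO_DEVICE_SET_IRQS handler.  The helper above validates the header and
 * reports how many payload bytes follow it.  MY_NUM_IRQS, MY_NUM_IRQ_TYPES
 * and my_set_irqs() are made-up names.
 */
#define MY_NUM_IRQS		4	/* made up */
#define MY_NUM_IRQ_TYPES	1	/* made up */

static int my_set_irqs(struct vfio_device *vdev, struct vfio_irq_set *hdr,
		       u8 *data);

static int my_ioctl_set_irqs(struct vfio_device *vdev,
			     struct vfio_irq_set __user *uarg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, uarg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, MY_NUM_IRQS,
						 MY_NUM_IRQ_TYPES, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)uarg + minsz, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_set_irqs(vdev, &hdr, data);
	kfree(data);
	return ret;
}
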
1531/*
1532 * Pin contiguous user pages and return their associated host pages for local
1533 * domain only.
1534 * @device [in]  : device
1535 * @iova [in]    : starting IOVA of user pages to be pinned.
1536 * @npage [in]   : count of pages to be pinned.  This count should not
1537 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538 * @prot [in]    : protection flags
1539 * @pages[out]   : array of host pages
1540 * Return error or number of pages pinned.
1541 *
1542 * A driver may only call this function if the vfio_device was created
1543 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1544 */
1545int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1546		   int npage, int prot, struct page **pages)
1547{
1548	/* group->container cannot change while a vfio device is open */
1549	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1550		return -EINVAL;
1551	if (!device->ops->dma_unmap)
1552		return -EINVAL;
1553	if (vfio_device_has_container(device))
1554		return vfio_device_container_pin_pages(device, iova,
1555						       npage, prot, pages);
1556	if (device->iommufd_access) {
1557		int ret;
1558
1559		if (iova > ULONG_MAX)
1560			return -EINVAL;
1561		/*
1562		 * VFIO ignores the sub page offset, npages is from the start of
1563		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1564		 * the sub page offset by doing:
1565		 *     pages[0] + (iova % PAGE_SIZE)
1566		 */
1567		ret = iommufd_access_pin_pages(
1568			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1569			npage * PAGE_SIZE, pages,
1570			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1571		if (ret)
1572			return ret;
1573		return npage;
1574	}
1575	return -EINVAL;
1576}
1577EXPORT_SYMBOL(vfio_pin_pages);
1578
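/*
 * Illustrative sketch, not part of vfio_main.c: a hypothetical
 * emulated-IOMMU driver pinning one guest page and recovering the sub-page
 * offset as the comment in vfio_pin_pages() describes.  The caller is
 * expected to balance this with vfio_unpin_pages() on the same iova.
 * my_map_one() is a made-up name.
 */
static void *my_map_one(struct vfio_device *vdev, dma_addr_t iova)
{
	struct page *page;
	int ret;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
	if (ret != 1)
		return ERR_PTR(ret < 0 ? ret : -EFAULT);

	/* page_address() assumes the page is in lowmem (e.g. 64-bit). */
	return page_address(page) + offset_in_page(iova);
}
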
1579/*
1580 * Unpin contiguous host pages for local domain only.
1581 * @device [in]  : device
1582 * @iova [in]    : starting address of user pages to be unpinned.
1583 * @npage [in]   : count of pages to be unpinned.  This count should not
1584 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1585 */
1586void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1587{
1588	if (WARN_ON(!vfio_assert_device_open(device)))
1589		return;
1590	if (WARN_ON(!device->ops->dma_unmap))
1591		return;
1592
1593	if (vfio_device_has_container(device)) {
1594		vfio_device_container_unpin_pages(device, iova, npage);
1595		return;
1596	}
1597	if (device->iommufd_access) {
1598		if (WARN_ON(iova > ULONG_MAX))
1599			return;
1600		iommufd_access_unpin_pages(device->iommufd_access,
1601					   ALIGN_DOWN(iova, PAGE_SIZE),
1602					   npage * PAGE_SIZE);
1603		return;
1604	}
1605}
1606EXPORT_SYMBOL(vfio_unpin_pages);
1607
1608/*
1609 * This interface allows the CPUs to perform some sort of virtual DMA on
1610 * behalf of the device.
1611 *
1612 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1613 * into/from a kernel buffer.
1614 *
1615 * As the read/write of user space memory is conducted via the CPUs and is
1616 * not a real device DMA, it is not necessary to pin the user space memory.
1617 *
1618 * @device [in]		: VFIO device
1619 * @iova [in]		: base IOVA of a user space buffer
1620 * @data [in]		: pointer to kernel buffer
1621 * @len [in]		: kernel buffer length
1622 * @write		: indicate read or write
1623 * Return error code on failure or 0 on success.
1624 */
1625int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626		size_t len, bool write)
1627{
1628	if (!data || len <= 0 || !vfio_assert_device_open(device))
1629		return -EINVAL;
1630
1631	if (vfio_device_has_container(device))
1632		return vfio_device_container_dma_rw(device, iova,
1633						    data, len, write);
1634
1635	if (device->iommufd_access) {
1636		unsigned int flags = 0;
1637
1638		if (iova > ULONG_MAX)
1639			return -EINVAL;
1640
1641		/* VFIO historically tries to auto-detect a kthread */
1642		if (!current->mm)
1643			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1644		if (write)
1645			flags |= IOMMUFD_ACCESS_RW_WRITE;
1646		return iommufd_access_rw(device->iommufd_access, iova, data,
1647					 len, flags);
1648	}
1649	return -EINVAL;
1650}
1651EXPORT_SYMBOL(vfio_dma_rw);
1652
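/*
 * Illustrative sketch, not part of vfio_main.c: reading a small,
 * guest-defined descriptor through the IOAS/container without pinning,
 * which is what the comment above describes.  struct my_desc and
 * my_read_desc() are made-up names.
 */
struct my_desc {
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static int my_read_desc(struct vfio_device *vdev, dma_addr_t iova,
			struct my_desc *desc)
{
	return vfio_dma_rw(vdev, iova, desc, sizeof(*desc), false);
}
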
1653/*
1654 * Module/class support
1655 */
1656static int __init vfio_init(void)
1657{
1658	int ret;
1659
1660	ida_init(&vfio.device_ida);
1661
1662	ret = vfio_group_init();
1663	if (ret)
1664		return ret;
1665
1666	ret = vfio_virqfd_init();
1667	if (ret)
1668		goto err_virqfd;
1669
1670	/* /sys/class/vfio-dev/vfioX */
1671	vfio.device_class = class_create("vfio-dev");
1672	if (IS_ERR(vfio.device_class)) {
1673		ret = PTR_ERR(vfio.device_class);
1674		goto err_dev_class;
1675	}
1676
1677	ret = vfio_cdev_init(vfio.device_class);
1678	if (ret)
1679		goto err_alloc_dev_chrdev;
1680
1681	vfio_debugfs_create_root();
1682	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1683	return 0;
1684
1685err_alloc_dev_chrdev:
1686	class_destroy(vfio.device_class);
1687	vfio.device_class = NULL;
1688err_dev_class:
1689	vfio_virqfd_exit();
1690err_virqfd:
1691	vfio_group_cleanup();
1692	return ret;
1693}
1694
1695static void __exit vfio_cleanup(void)
1696{
1697	vfio_debugfs_remove_root();
1698	ida_destroy(&vfio.device_ida);
1699	vfio_cdev_cleanup();
1700	class_destroy(vfio.device_class);
1701	vfio.device_class = NULL;
1702	vfio_virqfd_exit();
1703	vfio_group_cleanup();
1704	xa_destroy(&vfio_device_set_xa);
1705}
1706
1707module_init(vfio_init);
1708module_exit(vfio_cleanup);
1709
1710MODULE_IMPORT_NS(IOMMUFD);
1711MODULE_VERSION(DRIVER_VERSION);
1712MODULE_LICENSE("GPL v2");
1713MODULE_AUTHOR(DRIVER_AUTHOR);
1714MODULE_DESCRIPTION(DRIVER_DESC);
1715MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");