   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/fs.h>
  17#include <linux/idr.h>
  18#include <linux/iommu.h>
  19#ifdef CONFIG_HAVE_KVM
  20#include <linux/kvm_host.h>
  21#endif
  22#include <linux/list.h>
  23#include <linux/miscdevice.h>
  24#include <linux/module.h>
  25#include <linux/mutex.h>
  26#include <linux/pci.h>
  27#include <linux/rwsem.h>
  28#include <linux/sched.h>
  29#include <linux/slab.h>
  30#include <linux/stat.h>
  31#include <linux/string.h>
  32#include <linux/uaccess.h>
  33#include <linux/vfio.h>
  34#include <linux/wait.h>
  35#include <linux/sched/signal.h>
  36#include <linux/pm_runtime.h>
  37#include <linux/interval_tree.h>
  38#include <linux/iova_bitmap.h>
  39#include <linux/iommufd.h>
  40#include "vfio.h"
  41
  42#define DRIVER_VERSION	"0.3"
  43#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  44#define DRIVER_DESC	"VFIO - User Level meta-driver"
  45
  46static struct vfio {
  47	struct class			*device_class;
  48	struct ida			device_ida;
  49} vfio;
  50
  51#ifdef CONFIG_VFIO_NOIOMMU
  52bool vfio_noiommu __read_mostly;
  53module_param_named(enable_unsafe_noiommu_mode,
  54		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
  55MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  56#endif
  57
  58static DEFINE_XARRAY(vfio_device_set_xa);
  59
  60int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  61{
  62	unsigned long idx = (unsigned long)set_id;
  63	struct vfio_device_set *new_dev_set;
  64	struct vfio_device_set *dev_set;
  65
  66	if (WARN_ON(!set_id))
  67		return -EINVAL;
  68
  69	/*
  70	 * Atomically acquire a singleton object in the xarray for this set_id
  71	 */
  72	xa_lock(&vfio_device_set_xa);
  73	dev_set = xa_load(&vfio_device_set_xa, idx);
  74	if (dev_set)
  75		goto found_get_ref;
  76	xa_unlock(&vfio_device_set_xa);
  77
  78	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  79	if (!new_dev_set)
  80		return -ENOMEM;
  81	mutex_init(&new_dev_set->lock);
  82	INIT_LIST_HEAD(&new_dev_set->device_list);
  83	new_dev_set->set_id = set_id;
  84
  85	xa_lock(&vfio_device_set_xa);
  86	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  87			       GFP_KERNEL);
  88	if (!dev_set) {
  89		dev_set = new_dev_set;
  90		goto found_get_ref;
  91	}
  92
  93	kfree(new_dev_set);
  94	if (xa_is_err(dev_set)) {
  95		xa_unlock(&vfio_device_set_xa);
  96		return xa_err(dev_set);
  97	}
  98
  99found_get_ref:
 100	dev_set->device_count++;
 101	xa_unlock(&vfio_device_set_xa);
 102	mutex_lock(&dev_set->lock);
 103	device->dev_set = dev_set;
 104	list_add_tail(&device->dev_set_list, &dev_set->device_list);
 105	mutex_unlock(&dev_set->lock);
 106	return 0;
 107}
 108EXPORT_SYMBOL_GPL(vfio_assign_device_set);
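/*
 * Example (illustrative sketch, not from the kernel tree): the set_id passed
 * to vfio_assign_device_set() determines which devices share a
 * vfio_device_set, and therefore dev_set->lock and any set-wide reset
 * handling.  A hypothetical PCI variant driver might key the set on the PCI
 * slot so that all functions behind one slot land in the same set;
 * my_vfio_pci_init_dev_set() is an invented name.
 */
static int my_vfio_pci_init_dev_set(struct vfio_device *vdev)
{
	struct pci_dev *pdev = to_pci_dev(vdev->dev);

	/* Functions sharing pdev->slot join one set and reset together. */
	if (pdev->slot)
		return vfio_assign_device_set(vdev, pdev->slot);

	/*
	 * No shared reset dependency: use the device itself as set_id, which
	 * is also what __vfio_register_dev() falls back to when the driver
	 * assigns no set at all.
	 */
	return vfio_assign_device_set(vdev, vdev);
}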
 109
 110static void vfio_release_device_set(struct vfio_device *device)
 111{
 112	struct vfio_device_set *dev_set = device->dev_set;
 113
 114	if (!dev_set)
 115		return;
 116
 117	mutex_lock(&dev_set->lock);
 118	list_del(&device->dev_set_list);
 119	mutex_unlock(&dev_set->lock);
 120
 121	xa_lock(&vfio_device_set_xa);
 122	if (!--dev_set->device_count) {
 123		__xa_erase(&vfio_device_set_xa,
 124			   (unsigned long)dev_set->set_id);
 125		mutex_destroy(&dev_set->lock);
 126		kfree(dev_set);
 127	}
 128	xa_unlock(&vfio_device_set_xa);
 129}
 130
 131unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
 132{
 133	struct vfio_device *cur;
 134	unsigned int open_count = 0;
 135
 136	lockdep_assert_held(&dev_set->lock);
 137
 138	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 139		open_count += cur->open_count;
 140	return open_count;
 141}
 142EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
 143
 144struct vfio_device *
 145vfio_find_device_in_devset(struct vfio_device_set *dev_set,
 146			   struct device *dev)
 147{
 148	struct vfio_device *cur;
 149
 150	lockdep_assert_held(&dev_set->lock);
 151
 152	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 153		if (cur->dev == dev)
 154			return cur;
 155	return NULL;
 156}
 157EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
 158
 159/*
 160 * Device objects - create, release, get, put, search
 161 */
 162/* Device reference always implies a group reference */
 163void vfio_device_put_registration(struct vfio_device *device)
 164{
 165	if (refcount_dec_and_test(&device->refcount))
 166		complete(&device->comp);
 167}
 168
 169bool vfio_device_try_get_registration(struct vfio_device *device)
 170{
 171	return refcount_inc_not_zero(&device->refcount);
 172}
 173
 174/*
 175 * VFIO driver API
 176 */
 177/* Release helper called by vfio_put_device() */
 178static void vfio_device_release(struct device *dev)
 179{
 180	struct vfio_device *device =
 181			container_of(dev, struct vfio_device, device);
 182
 183	vfio_release_device_set(device);
 184	ida_free(&vfio.device_ida, device->index);
 185
 186	if (device->ops->release)
 187		device->ops->release(device);
 188
 189	kvfree(device);
 190}
 191
 192static int vfio_init_device(struct vfio_device *device, struct device *dev,
 193			    const struct vfio_device_ops *ops);
 194
 195/*
 196 * Allocate and initialize vfio_device so it can be registered to vfio
 197 * core.
 198 *
 199 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 200 * @size is the size of the structure to be allocated, including any
 201 * private data used by the driver.
 202 *
 203 * Driver may provide an @init callback to cover device private data.
 204 *
 205 * Use vfio_put_device() to release the structure after success return.
 206 */
 207struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
 208				       const struct vfio_device_ops *ops)
 209{
 210	struct vfio_device *device;
 211	int ret;
 212
 213	if (WARN_ON(size < sizeof(struct vfio_device)))
 214		return ERR_PTR(-EINVAL);
 215
 216	device = kvzalloc(size, GFP_KERNEL);
 217	if (!device)
 218		return ERR_PTR(-ENOMEM);
 219
 220	ret = vfio_init_device(device, dev, ops);
 221	if (ret)
 222		goto out_free;
 223	return device;
 224
 225out_free:
 226	kvfree(device);
 227	return ERR_PTR(ret);
 228}
 229EXPORT_SYMBOL_GPL(_vfio_alloc_device);
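/*
 * Example (illustrative sketch, not from the kernel tree): a driver embeds
 * struct vfio_device inside its own state and allocates it with the
 * vfio_alloc_device() wrapper around _vfio_alloc_device() above.  The
 * struct my_vfio_dev, its "vdev" member and my_alloc() are invented names.
 */
struct my_vfio_dev {
	struct vfio_device vdev;	/* must be the first member */
	void *priv;			/* driver-private data */
};

static struct my_vfio_dev *my_alloc(struct device *dev,
				    const struct vfio_device_ops *ops)
{
	struct my_vfio_dev *my;

	/* Allocates sizeof(*my) and runs ops->init() if provided. */
	my = vfio_alloc_device(my_vfio_dev, vdev, dev, ops);
	if (IS_ERR(my))
		return my;

	/* ... set up driver-private fields ... */

	/* The eventual teardown is vfio_put_device(&my->vdev). */
	return my;
}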
 230
 231/*
 232 * Initialize a vfio_device so it can be registered to vfio core.
 233 */
 234static int vfio_init_device(struct vfio_device *device, struct device *dev,
 235			    const struct vfio_device_ops *ops)
 236{
 237	int ret;
 238
 239	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 240	if (ret < 0) {
 241		dev_dbg(dev, "Error to alloc index\n");
 242		return ret;
 243	}
 244
 245	device->index = ret;
 246	init_completion(&device->comp);
 247	device->dev = dev;
 248	device->ops = ops;
 249
 250	if (ops->init) {
 251		ret = ops->init(device);
 252		if (ret)
 253			goto out_uninit;
 254	}
 255
 256	device_initialize(&device->device);
 257	device->device.release = vfio_device_release;
 258	device->device.class = vfio.device_class;
 259	device->device.parent = device->dev;
 260	return 0;
 261
 262out_uninit:
 263	vfio_release_device_set(device);
 264	ida_free(&vfio.device_ida, device->index);
 265	return ret;
 266}
 267
 268static int __vfio_register_dev(struct vfio_device *device,
 269			       enum vfio_group_type type)
 270{
 271	int ret;
 272
 273	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
 274		    (!device->ops->bind_iommufd ||
 275		     !device->ops->unbind_iommufd ||
 276		     !device->ops->attach_ioas ||
 277		     !device->ops->detach_ioas)))
 278		return -EINVAL;
 279
 280	/*
 281	 * If the driver doesn't specify a set then the device is added to a
 282	 * singleton set just for itself.
 283	 */
 284	if (!device->dev_set)
 285		vfio_assign_device_set(device, device);
 286
 287	ret = dev_set_name(&device->device, "vfio%d", device->index);
 288	if (ret)
 289		return ret;
 290
 291	ret = vfio_device_set_group(device, type);
 292	if (ret)
 293		return ret;
 294
 295	/*
 296	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
 297	 * restore cache coherency. It has to be checked here because it is only
 298	 * valid for cases where we are using iommu groups.
 299	 */
 300	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
 301	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
 302		ret = -EINVAL;
 303		goto err_out;
 304	}
 305
 306	ret = vfio_device_add(device);
 307	if (ret)
 308		goto err_out;
 309
 310	/* Refcounting can't start until the driver calls register */
 311	refcount_set(&device->refcount, 1);
 312
 313	vfio_device_group_register(device);
 314	vfio_device_debugfs_init(device);
 315
 316	return 0;
 317err_out:
 318	vfio_device_remove_group(device);
 319	return ret;
 320}
 321
 322int vfio_register_group_dev(struct vfio_device *device)
 323{
 324	return __vfio_register_dev(device, VFIO_IOMMU);
 325}
 326EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 327
 328/*
 329 * Register a virtual device without IOMMU backing.  The user of this
 330 * device must not be able to directly trigger unmediated DMA.
 331 */
 332int vfio_register_emulated_iommu_dev(struct vfio_device *device)
 333{
 334	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
 335}
 336EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
 337
 338/*
 339 * Decrement the device reference count and wait for the device to be
 340 * removed.  Open file descriptors for the device... */
 341void vfio_unregister_group_dev(struct vfio_device *device)
 342{
 343	unsigned int i = 0;
 344	bool interrupted = false;
 345	long rc;
 346
 347	/*
 348	 * Prevent new device opened by userspace via the
 349	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
 350	 */
 351	vfio_device_group_unregister(device);
 352
 353	/*
 354	 * Balances vfio_device_add() in register path, also prevents
 355	 * new device opened by userspace in the cdev path.
 356	 */
 357	vfio_device_del(device);
 358
 359	vfio_device_put_registration(device);
 360	rc = try_wait_for_completion(&device->comp);
 361	while (rc <= 0) {
 362		if (device->ops->request)
 363			device->ops->request(device, i++);
 364
 365		if (interrupted) {
 366			rc = wait_for_completion_timeout(&device->comp,
 367							 HZ * 10);
 368		} else {
 369			rc = wait_for_completion_interruptible_timeout(
 370				&device->comp, HZ * 10);
 371			if (rc < 0) {
 372				interrupted = true;
 373				dev_warn(device->dev,
 374					 "Device is currently in use, task"
 375					 " \"%s\" (%d) "
 376					 "blocked until device is released",
 377					 current->comm, task_pid_nr(current));
 378			}
 379		}
 380	}
 381
 382	vfio_device_debugfs_exit(device);
 383	/* Balances vfio_device_set_group in register path */
 384	vfio_device_remove_group(device);
 385}
 386EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 387
 388#ifdef CONFIG_HAVE_KVM
 389void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
 390{
 391	void (*pfn)(struct kvm *kvm);
 392	bool (*fn)(struct kvm *kvm);
 393	bool ret;
 394
 395	lockdep_assert_held(&device->dev_set->lock);
 396
 397	if (!kvm)
 398		return;
 399
 400	pfn = symbol_get(kvm_put_kvm);
 401	if (WARN_ON(!pfn))
 402		return;
 403
 404	fn = symbol_get(kvm_get_kvm_safe);
 405	if (WARN_ON(!fn)) {
 406		symbol_put(kvm_put_kvm);
 407		return;
 408	}
 409
 410	ret = fn(kvm);
 411	symbol_put(kvm_get_kvm_safe);
 412	if (!ret) {
 413		symbol_put(kvm_put_kvm);
 414		return;
 415	}
 416
 417	device->put_kvm = pfn;
 418	device->kvm = kvm;
 419}
 420
 421void vfio_device_put_kvm(struct vfio_device *device)
 422{
 423	lockdep_assert_held(&device->dev_set->lock);
 424
 425	if (!device->kvm)
 426		return;
 427
 428	if (WARN_ON(!device->put_kvm))
 429		goto clear;
 430
 431	device->put_kvm(device->kvm);
 432	device->put_kvm = NULL;
 433	symbol_put(kvm_put_kvm);
 434
 435clear:
 436	device->kvm = NULL;
 437}
 438#endif
 439
 440/* true if the vfio_device has open_device() called but not close_device() */
 441static bool vfio_assert_device_open(struct vfio_device *device)
 442{
 443	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 444}
 445
 446struct vfio_device_file *
 447vfio_allocate_device_file(struct vfio_device *device)
 448{
 449	struct vfio_device_file *df;
 450
 451	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
 452	if (!df)
 453		return ERR_PTR(-ENOMEM);
 454
 455	df->device = device;
 456	spin_lock_init(&df->kvm_ref_lock);
 457
 458	return df;
 459}
 460
 461static int vfio_df_device_first_open(struct vfio_device_file *df)
 462{
 463	struct vfio_device *device = df->device;
 464	struct iommufd_ctx *iommufd = df->iommufd;
 465	int ret;
 466
 467	lockdep_assert_held(&device->dev_set->lock);
 468
 469	if (!try_module_get(device->dev->driver->owner))
 470		return -ENODEV;
 471
 472	if (iommufd)
 473		ret = vfio_df_iommufd_bind(df);
 474	else
 475		ret = vfio_device_group_use_iommu(device);
 476	if (ret)
 477		goto err_module_put;
 478
 479	if (device->ops->open_device) {
 480		ret = device->ops->open_device(device);
 481		if (ret)
 482			goto err_unuse_iommu;
 483	}
 484	return 0;
 485
 486err_unuse_iommu:
 487	if (iommufd)
 488		vfio_df_iommufd_unbind(df);
 489	else
 490		vfio_device_group_unuse_iommu(device);
 491err_module_put:
 492	module_put(device->dev->driver->owner);
 493	return ret;
 494}
 495
 496static void vfio_df_device_last_close(struct vfio_device_file *df)
 497{
 498	struct vfio_device *device = df->device;
 499	struct iommufd_ctx *iommufd = df->iommufd;
 500
 501	lockdep_assert_held(&device->dev_set->lock);
 502
 503	if (device->ops->close_device)
 504		device->ops->close_device(device);
 505	if (iommufd)
 506		vfio_df_iommufd_unbind(df);
 507	else
 508		vfio_device_group_unuse_iommu(device);
 509	module_put(device->dev->driver->owner);
 510}
 511
 512int vfio_df_open(struct vfio_device_file *df)
 513{
 514	struct vfio_device *device = df->device;
 515	int ret = 0;
 516
 517	lockdep_assert_held(&device->dev_set->lock);
 518
 519	/*
 520	 * Only the group path allows the device to be opened multiple
 521	 * times.  The device cdev path doesn't have a secure way for it.
 522	 */
 523	if (device->open_count != 0 && !df->group)
 524		return -EINVAL;
 525
 526	device->open_count++;
 527	if (device->open_count == 1) {
 528		ret = vfio_df_device_first_open(df);
 529		if (ret)
 530			device->open_count--;
 531	}
 532
 533	return ret;
 534}
 535
 536void vfio_df_close(struct vfio_device_file *df)
 537{
 538	struct vfio_device *device = df->device;
 539
 540	lockdep_assert_held(&device->dev_set->lock);
 541
 542	vfio_assert_device_open(device);
 543	if (device->open_count == 1)
 544		vfio_df_device_last_close(df);
 545	device->open_count--;
 546}
 547
 548/*
 549 * Wrapper around pm_runtime_resume_and_get().
 550 * Return error code on failure or 0 on success.
 551 */
 552static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
 553{
 554	struct device *dev = device->dev;
 555
 556	if (dev->driver && dev->driver->pm) {
 557		int ret;
 558
 559		ret = pm_runtime_resume_and_get(dev);
 560		if (ret) {
 561			dev_info_ratelimited(dev,
 562				"vfio: runtime resume failed %d\n", ret);
 563			return -EIO;
 564		}
 565	}
 566
 567	return 0;
 568}
 569
 570/*
 571 * Wrapper around pm_runtime_put().
 572 */
 573static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 574{
 575	struct device *dev = device->dev;
 576
 577	if (dev->driver && dev->driver->pm)
 578		pm_runtime_put(dev);
 579}
 580
 581/*
 582 * VFIO Device fd
 583 */
 584static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 585{
 586	struct vfio_device_file *df = filep->private_data;
 587	struct vfio_device *device = df->device;
 588
 589	if (df->group)
 590		vfio_df_group_close(df);
 591	else
 592		vfio_df_unbind_iommufd(df);
 593
 594	vfio_device_put_registration(device);
 595
 596	kfree(df);
 597
 598	return 0;
 599}
 600
 601/*
 602 * vfio_mig_get_next_state - Compute the next step in the FSM
 603 * @cur_fsm - The current state the device is in
 604 * @new_fsm - The target state to reach
 605 * @next_fsm - Pointer to the next step to get to new_fsm
 606 *
 607 * Return 0 upon success, otherwise -errno
 608 * Upon success the next step in the state progression between cur_fsm and
 609 * new_fsm will be set in next_fsm.
 610 *
 611 * This breaks down requests for combination transitions into smaller steps and
 612 * returns the next step to get to new_fsm. The function may need to be called
 613 * multiple times before reaching new_fsm.
 614 *
 615 */
 616int vfio_mig_get_next_state(struct vfio_device *device,
 617			    enum vfio_device_mig_state cur_fsm,
 618			    enum vfio_device_mig_state new_fsm,
 619			    enum vfio_device_mig_state *next_fsm)
 620{
 621	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
 622	/*
 623	 * The coding in this table requires the driver to implement the
 624	 * following FSM arcs:
 625	 *         RESUMING -> STOP
 626	 *         STOP -> RESUMING
 627	 *         STOP -> STOP_COPY
 628	 *         STOP_COPY -> STOP
 629	 *
 630	 * If P2P is supported then the driver must also implement these FSM
 631	 * arcs:
 632	 *         RUNNING -> RUNNING_P2P
 633	 *         RUNNING_P2P -> RUNNING
 634	 *         RUNNING_P2P -> STOP
 635	 *         STOP -> RUNNING_P2P
 636	 *
 637	 * If precopy is supported then the driver must support these additional
 638	 * FSM arcs:
 639	 *         RUNNING -> PRE_COPY
 640	 *         PRE_COPY -> RUNNING
 641	 *         PRE_COPY -> STOP_COPY
 642	 * However, if precopy and P2P are supported together then the driver
 643	 * must support these additional arcs beyond the P2P arcs above:
 644	 *         PRE_COPY -> RUNNING
 645	 *         PRE_COPY -> PRE_COPY_P2P
 646	 *         PRE_COPY_P2P -> PRE_COPY
 647	 *         PRE_COPY_P2P -> RUNNING_P2P
 648	 *         PRE_COPY_P2P -> STOP_COPY
 649	 *         RUNNING -> PRE_COPY
 650	 *         RUNNING_P2P -> PRE_COPY_P2P
 651	 *
 652	 * Without P2P and precopy the driver must implement:
 653	 *         RUNNING -> STOP
 654	 *         STOP -> RUNNING
 655	 *
 656	 * The coding will step through multiple states for some combination
 657	 * transitions; if all optional features are supported, this means the
 658	 * following ones:
 659	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
 660	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
 661	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
 662	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 663	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
 664	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
 665	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
 666	 *         RESUMING -> STOP -> RUNNING_P2P
 667	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
 668	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
 669	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 670	 *         RESUMING -> STOP -> STOP_COPY
 671	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
 672	 *         RUNNING -> RUNNING_P2P -> STOP
 673	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 674	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
 675	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
 676	 *         RUNNING_P2P -> STOP -> RESUMING
 677	 *         RUNNING_P2P -> STOP -> STOP_COPY
 678	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
 679	 *         STOP -> RUNNING_P2P -> RUNNING
 680	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 681	 *         STOP_COPY -> STOP -> RESUMING
 682	 *         STOP_COPY -> STOP -> RUNNING_P2P
 683	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
 684	 *
 685	 *  The following transitions are blocked:
 686	 *         STOP_COPY -> PRE_COPY
 687	 *         STOP_COPY -> PRE_COPY_P2P
 688	 */
 689	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
 690		[VFIO_DEVICE_STATE_STOP] = {
 691			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 692			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 693			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 694			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 695			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 696			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 697			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 698			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 699		},
 700		[VFIO_DEVICE_STATE_RUNNING] = {
 701			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 702			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 703			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 704			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 705			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 706			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 707			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 708			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 709		},
 710		[VFIO_DEVICE_STATE_PRE_COPY] = {
 711			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
 712			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 713			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 714			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 715			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 716			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
 717			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
 718			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 719		},
 720		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
 721			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 722			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 723			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 724			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 725			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 726			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 727			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 728			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 729		},
 730		[VFIO_DEVICE_STATE_STOP_COPY] = {
 731			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 732			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 733			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 734			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 735			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 736			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 737			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 738			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 739		},
 740		[VFIO_DEVICE_STATE_RESUMING] = {
 741			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 742			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 743			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
 744			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
 745			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 746			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 747			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 748			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 749		},
 750		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
 751			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 752			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 753			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
 754			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 755			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 756			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 757			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 758			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 759		},
 760		[VFIO_DEVICE_STATE_ERROR] = {
 761			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
 762			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
 763			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 764			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 765			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
 766			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
 767			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
 768			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 769		},
 770	};
 771
 772	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
 773		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
 774		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
 775		[VFIO_DEVICE_STATE_PRE_COPY] =
 776			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
 777		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
 778						   VFIO_MIGRATION_P2P |
 779						   VFIO_MIGRATION_PRE_COPY,
 780		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
 781		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
 782		[VFIO_DEVICE_STATE_RUNNING_P2P] =
 783			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
 784		[VFIO_DEVICE_STATE_ERROR] = ~0U,
 785	};
 786
 787	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 788		    (state_flags_table[cur_fsm] & device->migration_flags) !=
 789			state_flags_table[cur_fsm]))
 790		return -EINVAL;
 791
 792	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 793	   (state_flags_table[new_fsm] & device->migration_flags) !=
 794			state_flags_table[new_fsm])
 795		return -EINVAL;
 796
 797	/*
 798	 * Arcs touching optional and unsupported states are skipped over. The
 799	 * driver will instead see an arc from the original state to the next
 800	 * logical state, as per the above comment.
 801	 */
 802	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
 803	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
 804			state_flags_table[*next_fsm])
 805		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
 806
 807	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
 808}
 809EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
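/*
 * Example (illustrative sketch, not from the kernel tree): a driver's
 * migration_set_state() op typically loops over vfio_mig_get_next_state()
 * so it only has to implement the single-step arcs documented above and
 * lets the core decompose combination transitions.  struct my_mig_dev and
 * my_step_one_arc() are invented names for the driver's per-device state
 * and its one-arc handler.
 */
struct my_mig_dev {
	struct vfio_device vdev;
	enum vfio_device_mig_state mig_state;
};

static struct file *my_set_state(struct vfio_device *vdev,
				 enum vfio_device_mig_state new_state)
{
	struct my_mig_dev *my = container_of(vdev, struct my_mig_dev, vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	while (new_state != my->mig_state) {
		ret = vfio_mig_get_next_state(vdev, my->mig_state, new_state,
					      &next_state);
		if (ret)
			return ERR_PTR(ret);

		/* Perform exactly one FSM arc; may return a data_fd file. */
		res = my_step_one_arc(my, next_state);
		if (IS_ERR(res))
			return res;
		my->mig_state = next_state;

		/* A file is only expected back on the final arc. */
		if (WARN_ON(res && new_state != my->mig_state)) {
			fput(res);
			return ERR_PTR(-EINVAL);
		}
	}
	return res;
}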
 810
 811/*
  812 * Convert the driver's struct file into a FD number and return it to userspace
 813 */
 814static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
 815				   struct vfio_device_feature_mig_state *mig)
 816{
 817	int ret;
 818	int fd;
 819
 820	fd = get_unused_fd_flags(O_CLOEXEC);
 821	if (fd < 0) {
 822		ret = fd;
 823		goto out_fput;
 824	}
 825
 826	mig->data_fd = fd;
 827	if (copy_to_user(arg, mig, sizeof(*mig))) {
 828		ret = -EFAULT;
 829		goto out_put_unused;
 830	}
 831	fd_install(fd, filp);
 832	return 0;
 833
 834out_put_unused:
 835	put_unused_fd(fd);
 836out_fput:
 837	fput(filp);
 838	return ret;
 839}
 840
 841static int
 842vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 843					   u32 flags, void __user *arg,
 844					   size_t argsz)
 845{
 846	size_t minsz =
 847		offsetofend(struct vfio_device_feature_mig_state, data_fd);
 848	struct vfio_device_feature_mig_state mig;
 849	struct file *filp = NULL;
 850	int ret;
 851
 852	if (!device->mig_ops)
 853		return -ENOTTY;
 854
 855	ret = vfio_check_feature(flags, argsz,
 856				 VFIO_DEVICE_FEATURE_SET |
 857				 VFIO_DEVICE_FEATURE_GET,
 858				 sizeof(mig));
 859	if (ret != 1)
 860		return ret;
 861
 862	if (copy_from_user(&mig, arg, minsz))
 863		return -EFAULT;
 864
 865	if (flags & VFIO_DEVICE_FEATURE_GET) {
 866		enum vfio_device_mig_state curr_state;
 867
 868		ret = device->mig_ops->migration_get_state(device,
 869							   &curr_state);
 870		if (ret)
 871			return ret;
 872		mig.device_state = curr_state;
 873		goto out_copy;
 874	}
 875
 876	/* Handle the VFIO_DEVICE_FEATURE_SET */
 877	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 878	if (IS_ERR(filp) || !filp)
 879		goto out_copy;
 880
 881	return vfio_ioct_mig_return_fd(filp, arg, &mig);
 882out_copy:
 883	mig.data_fd = -1;
 884	if (copy_to_user(arg, &mig, sizeof(mig)))
 885		return -EFAULT;
 886	if (IS_ERR(filp))
 887		return PTR_ERR(filp);
 888	return 0;
 889}
 890
 891static int
 892vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
 893					      u32 flags, void __user *arg,
 894					      size_t argsz)
 895{
 896	struct vfio_device_feature_mig_data_size data_size = {};
 897	unsigned long stop_copy_length;
 898	int ret;
 899
 900	if (!device->mig_ops)
 901		return -ENOTTY;
 902
 903	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 904				 sizeof(data_size));
 905	if (ret != 1)
 906		return ret;
 907
 908	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
 909	if (ret)
 910		return ret;
 911
 912	data_size.stop_copy_length = stop_copy_length;
 913	if (copy_to_user(arg, &data_size, sizeof(data_size)))
 914		return -EFAULT;
 915
 916	return 0;
 917}
 918
 919static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 920					       u32 flags, void __user *arg,
 921					       size_t argsz)
 922{
 923	struct vfio_device_feature_migration mig = {
 924		.flags = device->migration_flags,
 925	};
 926	int ret;
 927
 928	if (!device->mig_ops)
 929		return -ENOTTY;
 930
 931	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 932				 sizeof(mig));
 933	if (ret != 1)
 934		return ret;
 935	if (copy_to_user(arg, &mig, sizeof(mig)))
 936		return -EFAULT;
 937	return 0;
 938}
 939
 940void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 941			      u32 req_nodes)
 942{
 943	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
 944	unsigned long min_gap, curr_gap;
 945
 946	/* Special shortcut when a single range is required */
 947	if (req_nodes == 1) {
 948		unsigned long last;
 949
 950		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 951
 952		/* Empty list */
 953		if (WARN_ON_ONCE(!comb_start))
 954			return;
 955
 956		curr = comb_start;
 957		while (curr) {
 958			last = curr->last;
 959			prev = curr;
 960			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
 961			if (prev != comb_start)
 962				interval_tree_remove(prev, root);
 963		}
 964		comb_start->last = last;
 965		return;
 966	}
 967
 968	/* Combine ranges which have the smallest gap */
 969	while (cur_nodes > req_nodes) {
 970		prev = NULL;
 971		min_gap = ULONG_MAX;
 972		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
 973		while (curr) {
 974			if (prev) {
 975				curr_gap = curr->start - prev->last;
 976				if (curr_gap < min_gap) {
 977					min_gap = curr_gap;
 978					comb_start = prev;
 979					comb_end = curr;
 980				}
 981			}
 982			prev = curr;
 983			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
 984		}
 985
 986		/* Empty list or no nodes to combine */
 987		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
 988			break;
 989
 990		comb_start->last = comb_end->last;
 991		interval_tree_remove(comb_end, root);
 992		cur_nodes--;
 993	}
 994}
 995EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
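/*
 * Example (illustrative sketch, not from the kernel tree): a driver's
 * log_ops->log_start() can use vfio_combine_iova_ranges() to shrink the
 * interval tree built by the core down to what its dirty-tracking hardware
 * supports.  MY_HW_MAX_RANGES and my_hw_track_range() are invented names.
 */
#define MY_HW_MAX_RANGES 8

static int my_log_start(struct vfio_device *vdev,
			struct rb_root_cached *ranges, u32 nnodes,
			u64 *page_size)
{
	struct interval_tree_node *node;

	if (nnodes > MY_HW_MAX_RANGES) {
		/* Merge the ranges separated by the smallest gaps. */
		vfio_combine_iova_ranges(ranges, nnodes, MY_HW_MAX_RANGES);
		nnodes = MY_HW_MAX_RANGES;
	}

	for (node = interval_tree_iter_first(ranges, 0, ULONG_MAX); node;
	     node = interval_tree_iter_next(node, 0, ULONG_MAX))
		my_hw_track_range(vdev, node->start, node->last, *page_size);

	return 0;
}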
 996
 997/* Ranges should fit into a single kernel page */
 998#define LOG_MAX_RANGES \
 999	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1000
1001static int
1002vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1003					u32 flags, void __user *arg,
1004					size_t argsz)
1005{
1006	size_t minsz =
1007		offsetofend(struct vfio_device_feature_dma_logging_control,
1008			    ranges);
1009	struct vfio_device_feature_dma_logging_range __user *ranges;
1010	struct vfio_device_feature_dma_logging_control control;
1011	struct vfio_device_feature_dma_logging_range range;
1012	struct rb_root_cached root = RB_ROOT_CACHED;
1013	struct interval_tree_node *nodes;
1014	u64 iova_end;
1015	u32 nnodes;
1016	int i, ret;
1017
1018	if (!device->log_ops)
1019		return -ENOTTY;
1020
1021	ret = vfio_check_feature(flags, argsz,
1022				 VFIO_DEVICE_FEATURE_SET,
1023				 sizeof(control));
1024	if (ret != 1)
1025		return ret;
1026
1027	if (copy_from_user(&control, arg, minsz))
1028		return -EFAULT;
1029
1030	nnodes = control.num_ranges;
1031	if (!nnodes)
1032		return -EINVAL;
1033
1034	if (nnodes > LOG_MAX_RANGES)
1035		return -E2BIG;
1036
1037	ranges = u64_to_user_ptr(control.ranges);
1038	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1039			      GFP_KERNEL);
1040	if (!nodes)
1041		return -ENOMEM;
1042
1043	for (i = 0; i < nnodes; i++) {
1044		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1045			ret = -EFAULT;
1046			goto end;
1047		}
1048		if (!IS_ALIGNED(range.iova, control.page_size) ||
1049		    !IS_ALIGNED(range.length, control.page_size)) {
1050			ret = -EINVAL;
1051			goto end;
1052		}
1053
1054		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1055		    iova_end > ULONG_MAX) {
1056			ret = -EOVERFLOW;
1057			goto end;
1058		}
1059
1060		nodes[i].start = range.iova;
1061		nodes[i].last = range.iova + range.length - 1;
1062		if (interval_tree_iter_first(&root, nodes[i].start,
1063					     nodes[i].last)) {
1064			/* Range overlapping */
1065			ret = -EINVAL;
1066			goto end;
1067		}
1068		interval_tree_insert(nodes + i, &root);
1069	}
1070
1071	ret = device->log_ops->log_start(device, &root, nnodes,
1072					 &control.page_size);
1073	if (ret)
1074		goto end;
1075
1076	if (copy_to_user(arg, &control, sizeof(control))) {
1077		ret = -EFAULT;
1078		device->log_ops->log_stop(device);
1079	}
1080
1081end:
1082	kfree(nodes);
1083	return ret;
1084}
1085
1086static int
1087vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088				       u32 flags, void __user *arg,
1089				       size_t argsz)
1090{
1091	int ret;
1092
1093	if (!device->log_ops)
1094		return -ENOTTY;
1095
1096	ret = vfio_check_feature(flags, argsz,
1097				 VFIO_DEVICE_FEATURE_SET, 0);
1098	if (ret != 1)
1099		return ret;
1100
1101	return device->log_ops->log_stop(device);
1102}
1103
1104static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105					  unsigned long iova, size_t length,
1106					  void *opaque)
1107{
1108	struct vfio_device *device = opaque;
1109
1110	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1111}
1112
1113static int
1114vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1115					 u32 flags, void __user *arg,
1116					 size_t argsz)
1117{
1118	size_t minsz =
1119		offsetofend(struct vfio_device_feature_dma_logging_report,
1120			    bitmap);
1121	struct vfio_device_feature_dma_logging_report report;
1122	struct iova_bitmap *iter;
1123	u64 iova_end;
1124	int ret;
1125
1126	if (!device->log_ops)
1127		return -ENOTTY;
1128
1129	ret = vfio_check_feature(flags, argsz,
1130				 VFIO_DEVICE_FEATURE_GET,
1131				 sizeof(report));
1132	if (ret != 1)
1133		return ret;
1134
1135	if (copy_from_user(&report, arg, minsz))
1136		return -EFAULT;
1137
1138	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1139		return -EINVAL;
1140
1141	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1142	    iova_end > ULONG_MAX)
1143		return -EOVERFLOW;
1144
1145	iter = iova_bitmap_alloc(report.iova, report.length,
1146				 report.page_size,
1147				 u64_to_user_ptr(report.bitmap));
1148	if (IS_ERR(iter))
1149		return PTR_ERR(iter);
1150
1151	ret = iova_bitmap_for_each(iter, device,
1152				   vfio_device_log_read_and_clear);
1153
1154	iova_bitmap_free(iter);
1155	return ret;
1156}
1157
1158static int vfio_ioctl_device_feature(struct vfio_device *device,
1159				     struct vfio_device_feature __user *arg)
1160{
1161	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1162	struct vfio_device_feature feature;
1163
1164	if (copy_from_user(&feature, arg, minsz))
1165		return -EFAULT;
1166
1167	if (feature.argsz < minsz)
1168		return -EINVAL;
1169
1170	/* Check unknown flags */
1171	if (feature.flags &
1172	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1173	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1174		return -EINVAL;
1175
1176	/* GET & SET are mutually exclusive except with PROBE */
1177	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1178	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1179	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1180		return -EINVAL;
1181
1182	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1183	case VFIO_DEVICE_FEATURE_MIGRATION:
1184		return vfio_ioctl_device_feature_migration(
1185			device, feature.flags, arg->data,
1186			feature.argsz - minsz);
1187	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1188		return vfio_ioctl_device_feature_mig_device_state(
1189			device, feature.flags, arg->data,
1190			feature.argsz - minsz);
1191	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1192		return vfio_ioctl_device_feature_logging_start(
1193			device, feature.flags, arg->data,
1194			feature.argsz - minsz);
1195	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1196		return vfio_ioctl_device_feature_logging_stop(
1197			device, feature.flags, arg->data,
1198			feature.argsz - minsz);
1199	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1200		return vfio_ioctl_device_feature_logging_report(
1201			device, feature.flags, arg->data,
1202			feature.argsz - minsz);
1203	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1204		return vfio_ioctl_device_feature_migration_data_size(
1205			device, feature.flags, arg->data,
1206			feature.argsz - minsz);
1207	default:
1208		if (unlikely(!device->ops->device_feature))
1209			return -EINVAL;
1210		return device->ops->device_feature(device, feature.flags,
1211						   arg->data,
1212						   feature.argsz - minsz);
1213	}
1214}
1215
1216static long vfio_device_fops_unl_ioctl(struct file *filep,
1217				       unsigned int cmd, unsigned long arg)
1218{
1219	struct vfio_device_file *df = filep->private_data;
1220	struct vfio_device *device = df->device;
1221	void __user *uptr = (void __user *)arg;
1222	int ret;
1223
1224	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1225		return vfio_df_ioctl_bind_iommufd(df, uptr);
1226
1227	/* Paired with smp_store_release() following vfio_df_open() */
1228	if (!smp_load_acquire(&df->access_granted))
1229		return -EINVAL;
1230
1231	ret = vfio_device_pm_runtime_get(device);
1232	if (ret)
1233		return ret;
1234
1235	/* cdev only ioctls */
1236	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1237		switch (cmd) {
1238		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1239			ret = vfio_df_ioctl_attach_pt(df, uptr);
1240			goto out;
1241
1242		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1243			ret = vfio_df_ioctl_detach_pt(df, uptr);
1244			goto out;
1245		}
1246	}
1247
1248	switch (cmd) {
1249	case VFIO_DEVICE_FEATURE:
1250		ret = vfio_ioctl_device_feature(device, uptr);
1251		break;
1252
1253	default:
1254		if (unlikely(!device->ops->ioctl))
1255			ret = -EINVAL;
1256		else
1257			ret = device->ops->ioctl(device, cmd, arg);
1258		break;
1259	}
1260out:
1261	vfio_device_pm_runtime_put(device);
1262	return ret;
1263}
1264
1265static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266				     size_t count, loff_t *ppos)
1267{
1268	struct vfio_device_file *df = filep->private_data;
1269	struct vfio_device *device = df->device;
1270
1271	/* Paired with smp_store_release() following vfio_df_open() */
1272	if (!smp_load_acquire(&df->access_granted))
1273		return -EINVAL;
1274
1275	if (unlikely(!device->ops->read))
1276		return -EINVAL;
1277
1278	return device->ops->read(device, buf, count, ppos);
1279}
1280
1281static ssize_t vfio_device_fops_write(struct file *filep,
1282				      const char __user *buf,
1283				      size_t count, loff_t *ppos)
1284{
1285	struct vfio_device_file *df = filep->private_data;
1286	struct vfio_device *device = df->device;
1287
1288	/* Paired with smp_store_release() following vfio_df_open() */
1289	if (!smp_load_acquire(&df->access_granted))
1290		return -EINVAL;
1291
1292	if (unlikely(!device->ops->write))
1293		return -EINVAL;
1294
1295	return device->ops->write(device, buf, count, ppos);
1296}
1297
1298static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1299{
1300	struct vfio_device_file *df = filep->private_data;
1301	struct vfio_device *device = df->device;
1302
1303	/* Paired with smp_store_release() following vfio_df_open() */
1304	if (!smp_load_acquire(&df->access_granted))
1305		return -EINVAL;
1306
1307	if (unlikely(!device->ops->mmap))
1308		return -EINVAL;
1309
1310	return device->ops->mmap(device, vma);
1311}
1312
1313const struct file_operations vfio_device_fops = {
1314	.owner		= THIS_MODULE,
1315	.open		= vfio_device_fops_cdev_open,
1316	.release	= vfio_device_fops_release,
1317	.read		= vfio_device_fops_read,
1318	.write		= vfio_device_fops_write,
1319	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1320	.compat_ioctl	= compat_ptr_ioctl,
1321	.mmap		= vfio_device_fops_mmap,
1322};
1323
1324static struct vfio_device *vfio_device_from_file(struct file *file)
1325{
1326	struct vfio_device_file *df = file->private_data;
1327
1328	if (file->f_op != &vfio_device_fops)
1329		return NULL;
1330	return df->device;
1331}
1332
1333/**
1334 * vfio_file_is_valid - True if the file is valid vfio file
1335 * @file: VFIO group file or VFIO device file
1336 */
1337bool vfio_file_is_valid(struct file *file)
1338{
1339	return vfio_group_from_file(file) ||
1340	       vfio_device_from_file(file);
1341}
1342EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1343
1344/**
1345 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346 *        is always CPU cache coherent
1347 * @file: VFIO group file or VFIO device file
1348 *
1349 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350 * bit in DMA transactions. A return of false indicates that the user has
1351 * rights to access additional instructions such as wbinvd on x86.
1352 */
1353bool vfio_file_enforced_coherent(struct file *file)
1354{
1355	struct vfio_device *device;
1356	struct vfio_group *group;
1357
1358	group = vfio_group_from_file(file);
1359	if (group)
1360		return vfio_group_enforced_coherent(group);
1361
1362	device = vfio_device_from_file(file);
1363	if (device)
1364		return device_iommu_capable(device->dev,
1365					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1366
1367	return true;
1368}
1369EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1370
1371static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1372{
1373	struct vfio_device_file *df = file->private_data;
1374
1375	/*
1376	 * The kvm is first recorded in the vfio_device_file, and will
1377	 * be propagated to vfio_device::kvm when the file is bound to
1378	 * iommufd successfully in the vfio device cdev path.
1379	 */
1380	spin_lock(&df->kvm_ref_lock);
1381	df->kvm = kvm;
1382	spin_unlock(&df->kvm_ref_lock);
1383}
1384
1385/**
1386 * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387 * @file: VFIO group file or VFIO device file
1388 * @kvm: KVM to link
1389 *
1390 * When a VFIO device is first opened the KVM will be available in
1391 * device->kvm if one was associated with the file.
1392 */
1393void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1394{
1395	struct vfio_group *group;
1396
1397	group = vfio_group_from_file(file);
1398	if (group)
1399		vfio_group_set_kvm(group, kvm);
1400
1401	if (vfio_device_from_file(file))
1402		vfio_device_file_set_kvm(file, kvm);
1403}
1404EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1405
1406/*
1407 * Sub-module support
1408 */
1409/*
1410 * Helper for managing a buffer of info chain capabilities, allocate or
1411 * reallocate a buffer with additional @size, filling in @id and @version
1412 * of the capability.  A pointer to the new capability is returned.
1413 *
1414 * NB. The chain is based at the head of the buffer, so new entries are
1415 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1416 * next offsets prior to copying to the user buffer.
1417 */
1418struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1419					       size_t size, u16 id, u16 version)
1420{
1421	void *buf;
1422	struct vfio_info_cap_header *header, *tmp;
1423
1424	/* Ensure that the next capability struct will be aligned */
1425	size = ALIGN(size, sizeof(u64));
1426
1427	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1428	if (!buf) {
1429		kfree(caps->buf);
1430		caps->buf = NULL;
1431		caps->size = 0;
1432		return ERR_PTR(-ENOMEM);
1433	}
1434
1435	caps->buf = buf;
1436	header = buf + caps->size;
1437
1438	/* Eventually copied to user buffer, zero */
1439	memset(header, 0, size);
1440
1441	header->id = id;
1442	header->version = version;
1443
1444	/* Add to the end of the capability chain */
1445	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1446		; /* nothing */
1447
1448	tmp->next = caps->size;
1449	caps->size += size;
1450
1451	return header;
1452}
1453EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1454
1455void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1456{
1457	struct vfio_info_cap_header *tmp;
1458	void *buf = (void *)caps->buf;
1459
1460	/* Capability structs should start with proper alignment */
1461	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1462
1463	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1464		tmp->next += offset;
1465}
1466EXPORT_SYMBOL(vfio_info_cap_shift);
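/*
 * Example (illustrative sketch, not from the kernel tree): the usual ioctl
 * pattern builds the chain with vfio_info_cap_add()/vfio_info_add_capability(),
 * then shifts the "next" offsets by the size of the fixed info struct so they
 * become relative to the start of the user argument before the buffer is
 * copied out after it.  my_copy_caps() is an invented helper; the
 * vfio_region_info layout and VFIO_REGION_INFO_FLAG_CAPS are from the uAPI.
 */
static int my_copy_caps(struct vfio_region_info *info,
			struct vfio_info_cap *caps, void __user *arg)
{
	int ret = 0;

	if (!caps->size)
		return 0;

	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	if (info->argsz < sizeof(*info) + caps->size) {
		/* User buffer too small: report the required size, no chain. */
		info->argsz = sizeof(*info) + caps->size;
		info->cap_offset = 0;
	} else {
		/* Offsets were buffer-relative; make them argument-relative. */
		vfio_info_cap_shift(caps, sizeof(*info));
		if (copy_to_user(arg + sizeof(*info), caps->buf, caps->size))
			ret = -EFAULT;
		else
			info->cap_offset = sizeof(*info);
	}
	kfree(caps->buf);
	return ret;
}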
1467
1468int vfio_info_add_capability(struct vfio_info_cap *caps,
1469			     struct vfio_info_cap_header *cap, size_t size)
1470{
1471	struct vfio_info_cap_header *header;
1472
1473	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1474	if (IS_ERR(header))
1475		return PTR_ERR(header);
1476
1477	memcpy(header + 1, cap + 1, size - sizeof(*header));
1478
1479	return 0;
1480}
1481EXPORT_SYMBOL(vfio_info_add_capability);
1482
1483int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1484				       int max_irq_type, size_t *data_size)
1485{
1486	unsigned long minsz;
1487	size_t size;
1488
1489	minsz = offsetofend(struct vfio_irq_set, count);
1490
1491	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1492	    (hdr->count >= (U32_MAX - hdr->start)) ||
1493	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1494				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1495		return -EINVAL;
1496
1497	if (data_size)
1498		*data_size = 0;
1499
1500	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1501		return -EINVAL;
1502
1503	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1504	case VFIO_IRQ_SET_DATA_NONE:
1505		size = 0;
1506		break;
1507	case VFIO_IRQ_SET_DATA_BOOL:
1508		size = sizeof(uint8_t);
1509		break;
1510	case VFIO_IRQ_SET_DATA_EVENTFD:
1511		size = sizeof(int32_t);
1512		break;
1513	default:
1514		return -EINVAL;
1515	}
1516
1517	if (size) {
1518		if (hdr->argsz - minsz < hdr->count * size)
1519			return -EINVAL;
1520
1521		if (!data_size)
1522			return -EINVAL;
1523
1524		*data_size = hdr->count * size;
1525	}
1526
1527	return 0;
1528}
1529EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
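/*
 * Example (illustrative sketch, not from the kernel tree): a typical
 * VFIO_DEVICE_SET_IRQS handler copies the fixed header, has
 * vfio_set_irqs_validate_and_prepare() validate it and size the payload,
 * then pulls in the variable-length data.  MY_NUM_IRQ_TYPES, my_irq_count()
 * and my_set_irqs() are invented names.
 */
static int my_set_irqs_ioctl(struct vfio_device *vdev, unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr,
						 my_irq_count(vdev, hdr.index),
						 MY_NUM_IRQ_TYPES, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_set_irqs(vdev, hdr.flags, hdr.index, hdr.start, hdr.count,
			  data);
	kfree(data);
	return ret;
}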
1530
1531/*
1532 * Pin contiguous user pages and return their associated host pages for local
1533 * domain only.
1534 * @device [in]  : device
1535 * @iova [in]    : starting IOVA of user pages to be pinned.
1536 * @npage [in]   : count of pages to be pinned.  This count should not
1537 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538 * @prot [in]    : protection flags
1539 * @pages[out]   : array of host pages
1540 * Return error or number of pages pinned.
1541 *
1542 * A driver may only call this function if the vfio_device was created
1543 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1544 */
1545int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1546		   int npage, int prot, struct page **pages)
1547{
1548	/* group->container cannot change while a vfio device is open */
1549	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1550		return -EINVAL;
1551	if (!device->ops->dma_unmap)
1552		return -EINVAL;
1553	if (vfio_device_has_container(device))
1554		return vfio_device_container_pin_pages(device, iova,
1555						       npage, prot, pages);
1556	if (device->iommufd_access) {
1557		int ret;
1558
1559		if (iova > ULONG_MAX)
1560			return -EINVAL;
1561		/*
1562		 * VFIO ignores the sub page offset, npages is from the start of
1563		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1564		 * the sub page offset by doing:
1565		 *     pages[0] + (iova % PAGE_SIZE)
1566		 */
1567		ret = iommufd_access_pin_pages(
1568			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1569			npage * PAGE_SIZE, pages,
1570			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1571		if (ret)
1572			return ret;
1573		return npage;
1574	}
1575	return -EINVAL;
1576}
1577EXPORT_SYMBOL(vfio_pin_pages);
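/*
 * Example (illustrative sketch, not from the kernel tree): an emulated-IOMMU
 * (mdev-style) driver pins a guest page before touching it from the CPU and
 * recovers the sub-page offset as described in the comment above.
 * my_read_guest_u32() is an invented name and assumes the value does not
 * straddle a page boundary.
 */
static int my_read_guest_u32(struct vfio_device *vdev, dma_addr_t iova,
			     u32 *val)
{
	struct page *page;
	void *vaddr;
	int ret;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	vaddr = kmap_local_page(page);
	*val = *(u32 *)(vaddr + (iova & ~PAGE_MASK));	/* sub-page offset */
	kunmap_local(vaddr);

	vfio_unpin_pages(vdev, iova, 1);
	return 0;
}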
1578
1579/*
1580 * Unpin contiguous host pages for local domain only.
1581 * @device [in]  : device
1582 * @iova [in]    : starting address of user pages to be unpinned.
1583 * @npage [in]   : count of pages to be unpinned.  This count should not
1584 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1585 */
1586void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1587{
1588	if (WARN_ON(!vfio_assert_device_open(device)))
1589		return;
1590	if (WARN_ON(!device->ops->dma_unmap))
1591		return;
1592
1593	if (vfio_device_has_container(device)) {
1594		vfio_device_container_unpin_pages(device, iova, npage);
1595		return;
1596	}
1597	if (device->iommufd_access) {
1598		if (WARN_ON(iova > ULONG_MAX))
1599			return;
1600		iommufd_access_unpin_pages(device->iommufd_access,
1601					   ALIGN_DOWN(iova, PAGE_SIZE),
1602					   npage * PAGE_SIZE);
1603		return;
1604	}
1605}
1606EXPORT_SYMBOL(vfio_unpin_pages);
1607
1608/*
1609 * This interface allows the CPUs to perform some sort of virtual DMA on
1610 * behalf of the device.
1611 *
1612 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1613 * into/from a kernel buffer.
1614 *
1615 * As the read/write of user space memory is conducted via the CPUs and is
1616 * not a real device DMA, it is not necessary to pin the user space memory.
1617 *
1618 * @device [in]		: VFIO device
1619 * @iova [in]		: base IOVA of a user space buffer
1620 * @data [in]		: pointer to kernel buffer
1621 * @len [in]		: kernel buffer length
1622 * @write		: indicate read or write
1623 * Return error code on failure or 0 on success.
1624 */
1625int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626		size_t len, bool write)
1627{
1628	if (!data || len <= 0 || !vfio_assert_device_open(device))
1629		return -EINVAL;
1630
1631	if (vfio_device_has_container(device))
1632		return vfio_device_container_dma_rw(device, iova,
1633						    data, len, write);
1634
1635	if (device->iommufd_access) {
1636		unsigned int flags = 0;
1637
1638		if (iova > ULONG_MAX)
1639			return -EINVAL;
1640
1641		/* VFIO historically tries to auto-detect a kthread */
1642		if (!current->mm)
1643			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1644		if (write)
1645			flags |= IOMMUFD_ACCESS_RW_WRITE;
1646		return iommufd_access_rw(device->iommufd_access, iova, data,
1647					 len, flags);
1648	}
1649	return -EINVAL;
1650}
1651EXPORT_SYMBOL(vfio_dma_rw);
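/*
 * Example (illustrative sketch, not from the kernel tree): when the driver
 * only needs the CPU to read or write guest memory once, vfio_dma_rw()
 * avoids the pin/unpin dance entirely.  struct my_desc and
 * my_fetch_descriptor() are invented names.
 */
struct my_desc {
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static int my_fetch_descriptor(struct vfio_device *vdev, dma_addr_t ring_iova,
			       unsigned int idx, struct my_desc *desc)
{
	/* Read one descriptor from the guest ring into a kernel buffer. */
	return vfio_dma_rw(vdev, ring_iova + idx * sizeof(*desc),
			   desc, sizeof(*desc), false);
}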
1652
1653/*
1654 * Module/class support
1655 */
1656static int __init vfio_init(void)
1657{
1658	int ret;
1659
1660	ida_init(&vfio.device_ida);
1661
1662	ret = vfio_group_init();
1663	if (ret)
1664		return ret;
1665
1666	ret = vfio_virqfd_init();
1667	if (ret)
1668		goto err_virqfd;
1669
1670	/* /sys/class/vfio-dev/vfioX */
1671	vfio.device_class = class_create("vfio-dev");
1672	if (IS_ERR(vfio.device_class)) {
1673		ret = PTR_ERR(vfio.device_class);
1674		goto err_dev_class;
1675	}
1676
1677	ret = vfio_cdev_init(vfio.device_class);
1678	if (ret)
1679		goto err_alloc_dev_chrdev;
1680
1681	vfio_debugfs_create_root();
1682	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1683	return 0;
1684
1685err_alloc_dev_chrdev:
1686	class_destroy(vfio.device_class);
1687	vfio.device_class = NULL;
1688err_dev_class:
1689	vfio_virqfd_exit();
1690err_virqfd:
1691	vfio_group_cleanup();
1692	return ret;
1693}
1694
1695static void __exit vfio_cleanup(void)
1696{
1697	vfio_debugfs_remove_root();
1698	ida_destroy(&vfio.device_ida);
1699	vfio_cdev_cleanup();
1700	class_destroy(vfio.device_class);
1701	vfio.device_class = NULL;
1702	vfio_virqfd_exit();
1703	vfio_group_cleanup();
1704	xa_destroy(&vfio_device_set_xa);
1705}
1706
1707module_init(vfio_init);
1708module_exit(vfio_cleanup);
1709
1710MODULE_IMPORT_NS(IOMMUFD);
1711MODULE_VERSION(DRIVER_VERSION);
1712MODULE_LICENSE("GPL v2");
1713MODULE_AUTHOR(DRIVER_AUTHOR);
1714MODULE_DESCRIPTION(DRIVER_DESC);
1715MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
v6.2
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/fs.h>
  17#include <linux/idr.h>
  18#include <linux/iommu.h>
 
 
 
  19#include <linux/list.h>
  20#include <linux/miscdevice.h>
  21#include <linux/module.h>
  22#include <linux/mutex.h>
  23#include <linux/pci.h>
  24#include <linux/rwsem.h>
  25#include <linux/sched.h>
  26#include <linux/slab.h>
  27#include <linux/stat.h>
  28#include <linux/string.h>
  29#include <linux/uaccess.h>
  30#include <linux/vfio.h>
  31#include <linux/wait.h>
  32#include <linux/sched/signal.h>
  33#include <linux/pm_runtime.h>
  34#include <linux/interval_tree.h>
  35#include <linux/iova_bitmap.h>
  36#include <linux/iommufd.h>
  37#include "vfio.h"
  38
  39#define DRIVER_VERSION	"0.3"
  40#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  41#define DRIVER_DESC	"VFIO - User Level meta-driver"
  42
  43static struct vfio {
  44	struct class			*device_class;
  45	struct ida			device_ida;
  46} vfio;
  47
 
 
 
 
 
 
 
  48static DEFINE_XARRAY(vfio_device_set_xa);
  49
  50int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  51{
  52	unsigned long idx = (unsigned long)set_id;
  53	struct vfio_device_set *new_dev_set;
  54	struct vfio_device_set *dev_set;
  55
  56	if (WARN_ON(!set_id))
  57		return -EINVAL;
  58
  59	/*
  60	 * Atomically acquire a singleton object in the xarray for this set_id
  61	 */
  62	xa_lock(&vfio_device_set_xa);
  63	dev_set = xa_load(&vfio_device_set_xa, idx);
  64	if (dev_set)
  65		goto found_get_ref;
  66	xa_unlock(&vfio_device_set_xa);
  67
  68	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  69	if (!new_dev_set)
  70		return -ENOMEM;
  71	mutex_init(&new_dev_set->lock);
  72	INIT_LIST_HEAD(&new_dev_set->device_list);
  73	new_dev_set->set_id = set_id;
  74
  75	xa_lock(&vfio_device_set_xa);
  76	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  77			       GFP_KERNEL);
  78	if (!dev_set) {
  79		dev_set = new_dev_set;
  80		goto found_get_ref;
  81	}
  82
  83	kfree(new_dev_set);
  84	if (xa_is_err(dev_set)) {
  85		xa_unlock(&vfio_device_set_xa);
  86		return xa_err(dev_set);
  87	}
  88
  89found_get_ref:
  90	dev_set->device_count++;
  91	xa_unlock(&vfio_device_set_xa);
  92	mutex_lock(&dev_set->lock);
  93	device->dev_set = dev_set;
  94	list_add_tail(&device->dev_set_list, &dev_set->device_list);
  95	mutex_unlock(&dev_set->lock);
  96	return 0;
  97}
  98EXPORT_SYMBOL_GPL(vfio_assign_device_set);
  99
 100static void vfio_release_device_set(struct vfio_device *device)
 101{
 102	struct vfio_device_set *dev_set = device->dev_set;
 103
 104	if (!dev_set)
 105		return;
 106
 107	mutex_lock(&dev_set->lock);
 108	list_del(&device->dev_set_list);
 109	mutex_unlock(&dev_set->lock);
 110
 111	xa_lock(&vfio_device_set_xa);
 112	if (!--dev_set->device_count) {
 113		__xa_erase(&vfio_device_set_xa,
 114			   (unsigned long)dev_set->set_id);
 115		mutex_destroy(&dev_set->lock);
 116		kfree(dev_set);
 117	}
 118	xa_unlock(&vfio_device_set_xa);
 119}
 120
 121unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
 122{
 123	struct vfio_device *cur;
 124	unsigned int open_count = 0;
 125
 126	lockdep_assert_held(&dev_set->lock);
 127
 128	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
 129		open_count += cur->open_count;
 130	return open_count;
 131}
 132EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
 133
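/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a driver whose devices share a reset domain registers them into a common
 * dev_set and then uses vfio_device_set_open_count() under dev_set->lock,
 * for example to decide whether a disruptive reset is currently safe.  The
 * my_driver_* names are assumptions for illustration only.
 */
static int my_driver_try_disruptive_reset(struct vfio_device *vdev)
{
	struct vfio_device_set *dev_set = vdev->dev_set;
	int ret = -EBUSY;

	mutex_lock(&dev_set->lock);
	/* Proceed only when the caller's device is the sole open device */
	if (vfio_device_set_open_count(dev_set) == 1)
		ret = 0;	/* a real driver would issue its reset here */
	mutex_unlock(&dev_set->lock);
	return ret;
}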
 134/*
 135 * Device objects - create, release, get, put, search
 136 */
 137/* Device reference always implies a group reference */
 138void vfio_device_put_registration(struct vfio_device *device)
 139{
 140	if (refcount_dec_and_test(&device->refcount))
 141		complete(&device->comp);
 142}
 143
 144bool vfio_device_try_get_registration(struct vfio_device *device)
 145{
 146	return refcount_inc_not_zero(&device->refcount);
 147}
 148
 149/*
 150 * VFIO driver API
 151 */
 152/* Release helper called by vfio_put_device() */
 153static void vfio_device_release(struct device *dev)
 154{
 155	struct vfio_device *device =
 156			container_of(dev, struct vfio_device, device);
 157
 158	vfio_release_device_set(device);
 159	ida_free(&vfio.device_ida, device->index);
 160
 161	if (device->ops->release)
 162		device->ops->release(device);
 163
 164	kvfree(device);
 165}
 166
 167static int vfio_init_device(struct vfio_device *device, struct device *dev,
 168			    const struct vfio_device_ops *ops);
 169
 170/*
 171 * Allocate and initialize vfio_device so it can be registered to vfio
 172 * core.
 173 *
 174 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 175 * @size is the size of the structure to be allocated, including any
 176 * private data used by the driver.
 177 *
  178 * Drivers may provide an @init callback to initialize device private data.
  179 *
  180 * Use vfio_put_device() to release the structure after a successful return.
 181 */
 182struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
 183				       const struct vfio_device_ops *ops)
 184{
 185	struct vfio_device *device;
 186	int ret;
 187
 188	if (WARN_ON(size < sizeof(struct vfio_device)))
 189		return ERR_PTR(-EINVAL);
 190
 191	device = kvzalloc(size, GFP_KERNEL);
 192	if (!device)
 193		return ERR_PTR(-ENOMEM);
 194
 195	ret = vfio_init_device(device, dev, ops);
 196	if (ret)
 197		goto out_free;
 198	return device;
 199
 200out_free:
 201	kvfree(device);
 202	return ERR_PTR(ret);
 203}
 204EXPORT_SYMBOL_GPL(_vfio_alloc_device);
 205
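/*
 * Illustrative sketch (hypothetical driver code, not part of this file),
 * assuming the vfio_alloc_device()/vfio_put_device() wrappers declared in
 * <linux/vfio.h>: a driver embeds struct vfio_device in its own structure
 * and lets the core allocate and initialize both together.  The my_driver_*
 * names are assumptions for illustration only.
 */
struct my_driver_device {
	struct vfio_device vdev;
	void __iomem *regs;		/* hypothetical private state */
};

static const struct vfio_device_ops my_driver_ops;	/* hypothetical ops */

static int my_driver_probe(struct device *dev)
{
	struct my_driver_device *mdev;
	int ret;

	mdev = vfio_alloc_device(my_driver_device, vdev, dev, &my_driver_ops);
	if (IS_ERR(mdev))
		return PTR_ERR(mdev);

	ret = vfio_register_group_dev(&mdev->vdev);
	if (ret)
		vfio_put_device(&mdev->vdev);
	return ret;
}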
 206/*
 207 * Initialize a vfio_device so it can be registered to vfio core.
 208 */
 209static int vfio_init_device(struct vfio_device *device, struct device *dev,
 210			    const struct vfio_device_ops *ops)
 211{
 212	int ret;
 213
 214	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 215	if (ret < 0) {
  216		dev_dbg(dev, "Failed to allocate device index\n");
 217		return ret;
 218	}
 219
 220	device->index = ret;
 221	init_completion(&device->comp);
 222	device->dev = dev;
 223	device->ops = ops;
 224
 225	if (ops->init) {
 226		ret = ops->init(device);
 227		if (ret)
 228			goto out_uninit;
 229	}
 230
 231	device_initialize(&device->device);
 232	device->device.release = vfio_device_release;
 233	device->device.class = vfio.device_class;
 234	device->device.parent = device->dev;
 235	return 0;
 236
 237out_uninit:
 238	vfio_release_device_set(device);
 239	ida_free(&vfio.device_ida, device->index);
 240	return ret;
 241}
 242
 243static int __vfio_register_dev(struct vfio_device *device,
 244			       enum vfio_group_type type)
 245{
 246	int ret;
 247
 248	if (WARN_ON(device->ops->bind_iommufd &&
 249		    (!device->ops->unbind_iommufd ||
 250		     !device->ops->attach_ioas)))
 251		return -EINVAL;
 252
 253	/*
 254	 * If the driver doesn't specify a set then the device is added to a
 255	 * singleton set just for itself.
 256	 */
 257	if (!device->dev_set)
 258		vfio_assign_device_set(device, device);
 259
 260	ret = dev_set_name(&device->device, "vfio%d", device->index);
 261	if (ret)
 262		return ret;
 263
 264	ret = vfio_device_set_group(device, type);
 265	if (ret)
 266		return ret;
 267
 268	ret = device_add(&device->device);
 269	if (ret)
 270		goto err_out;
 271
 272	/* Refcounting can't start until the driver calls register */
 273	refcount_set(&device->refcount, 1);
 274
 275	vfio_device_group_register(device);
 276
 277	return 0;
 278err_out:
 279	vfio_device_remove_group(device);
 280	return ret;
 281}
 282
 283int vfio_register_group_dev(struct vfio_device *device)
 284{
 285	return __vfio_register_dev(device, VFIO_IOMMU);
 286}
 287EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 288
 289/*
 290 * Register a virtual device without IOMMU backing.  The user of this
 291 * device must not be able to directly trigger unmediated DMA.
 292 */
 293int vfio_register_emulated_iommu_dev(struct vfio_device *device)
 294{
 295	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
 296}
 297EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
 298
 299/*
 300 * Decrement the device reference count and wait for the device to be
 301 * removed.  Open file descriptors for the device... */
 302void vfio_unregister_group_dev(struct vfio_device *device)
 303{
 304	unsigned int i = 0;
 305	bool interrupted = false;
 306	long rc;
 307
 308	vfio_device_put_registration(device);
 309	rc = try_wait_for_completion(&device->comp);
 310	while (rc <= 0) {
 311		if (device->ops->request)
 312			device->ops->request(device, i++);
 313
 314		if (interrupted) {
 315			rc = wait_for_completion_timeout(&device->comp,
 316							 HZ * 10);
 317		} else {
 318			rc = wait_for_completion_interruptible_timeout(
 319				&device->comp, HZ * 10);
 320			if (rc < 0) {
 321				interrupted = true;
 322				dev_warn(device->dev,
 323					 "Device is currently in use, task"
 324					 " \"%s\" (%d) "
 325					 "blocked until device is released",
 326					 current->comm, task_pid_nr(current));
 327			}
 328		}
 329	}
 330
 331	vfio_device_group_unregister(device);
 332
 333	/* Balances device_add in register path */
 334	device_del(&device->device);
 335
 336	/* Balances vfio_device_set_group in register path */
 337	vfio_device_remove_group(device);
 338}
 339EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 340
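/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the remove path matching the registration sketch above.
 * vfio_unregister_group_dev() waits for existing users to go away, after
 * which vfio_put_device() drops the allocation made by vfio_alloc_device().
 */
static void my_driver_remove(struct my_driver_device *mdev)
{
	vfio_unregister_group_dev(&mdev->vdev);
	/* driver-specific teardown of mdev->regs etc. would go here */
	vfio_put_device(&mdev->vdev);
}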
 341/* true if the vfio_device has open_device() called but not close_device() */
 342static bool vfio_assert_device_open(struct vfio_device *device)
 343{
 344	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 345}
 346
 347static int vfio_device_first_open(struct vfio_device *device,
 348				  struct iommufd_ctx *iommufd, struct kvm *kvm)
 349{
 350	int ret;
 351
 352	lockdep_assert_held(&device->dev_set->lock);
 353
 354	if (!try_module_get(device->dev->driver->owner))
 355		return -ENODEV;
 356
 357	if (iommufd)
 358		ret = vfio_iommufd_bind(device, iommufd);
 359	else
 360		ret = vfio_device_group_use_iommu(device);
 361	if (ret)
 362		goto err_module_put;
 363
 364	device->kvm = kvm;
 365	if (device->ops->open_device) {
 366		ret = device->ops->open_device(device);
 367		if (ret)
 368			goto err_unuse_iommu;
 369	}
 370	return 0;
 371
 372err_unuse_iommu:
 373	device->kvm = NULL;
 374	if (iommufd)
 375		vfio_iommufd_unbind(device);
 376	else
 377		vfio_device_group_unuse_iommu(device);
 378err_module_put:
 379	module_put(device->dev->driver->owner);
 380	return ret;
 381}
 382
 383static void vfio_device_last_close(struct vfio_device *device,
 384				   struct iommufd_ctx *iommufd)
 385{
 386	lockdep_assert_held(&device->dev_set->lock);
 387
 388	if (device->ops->close_device)
 389		device->ops->close_device(device);
 390	device->kvm = NULL;
 391	if (iommufd)
 392		vfio_iommufd_unbind(device);
 393	else
 394		vfio_device_group_unuse_iommu(device);
 395	module_put(device->dev->driver->owner);
 396}
 397
 398int vfio_device_open(struct vfio_device *device,
 399		     struct iommufd_ctx *iommufd, struct kvm *kvm)
 400{
 401	int ret = 0;
 402
 403	mutex_lock(&device->dev_set->lock);
 404	device->open_count++;
 405	if (device->open_count == 1) {
 406		ret = vfio_device_first_open(device, iommufd, kvm);
 407		if (ret)
 408			device->open_count--;
 409	}
 410	mutex_unlock(&device->dev_set->lock);
 411
 412	return ret;
 413}
 414
 415void vfio_device_close(struct vfio_device *device,
 416		       struct iommufd_ctx *iommufd)
 417{
 418	mutex_lock(&device->dev_set->lock);
 419	vfio_assert_device_open(device);
 420	if (device->open_count == 1)
 421		vfio_device_last_close(device, iommufd);
 422	device->open_count--;
 423	mutex_unlock(&device->dev_set->lock);
 424}
 425
 426/*
 427 * Wrapper around pm_runtime_resume_and_get().
 428 * Return error code on failure or 0 on success.
 429 */
 430static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
 431{
 432	struct device *dev = device->dev;
 433
 434	if (dev->driver && dev->driver->pm) {
 435		int ret;
 436
 437		ret = pm_runtime_resume_and_get(dev);
 438		if (ret) {
 439			dev_info_ratelimited(dev,
 440				"vfio: runtime resume failed %d\n", ret);
 441			return -EIO;
 442		}
 443	}
 444
 445	return 0;
 446}
 447
 448/*
 449 * Wrapper around pm_runtime_put().
 450 */
 451static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 452{
 453	struct device *dev = device->dev;
 454
 455	if (dev->driver && dev->driver->pm)
 456		pm_runtime_put(dev);
 457}
 458
 459/*
 460 * VFIO Device fd
 461 */
 462static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 463{
 464	struct vfio_device *device = filep->private_data;
 465
 466	vfio_device_group_close(device);
 467
 468	vfio_device_put_registration(device);
 469
 470	return 0;
 471}
 472
 473/*
 474 * vfio_mig_get_next_state - Compute the next step in the FSM
 475 * @cur_fsm - The current state the device is in
 476 * @new_fsm - The target state to reach
 477 * @next_fsm - Pointer to the next step to get to new_fsm
 478 *
 479 * Return 0 upon success, otherwise -errno
 480 * Upon success the next step in the state progression between cur_fsm and
 481 * new_fsm will be set in next_fsm.
 482 *
 483 * This breaks down requests for combination transitions into smaller steps and
 484 * returns the next step to get to new_fsm. The function may need to be called
 485 * multiple times before reaching new_fsm.
 486 *
 487 */
 488int vfio_mig_get_next_state(struct vfio_device *device,
 489			    enum vfio_device_mig_state cur_fsm,
 490			    enum vfio_device_mig_state new_fsm,
 491			    enum vfio_device_mig_state *next_fsm)
 492{
 493	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
 494	/*
 495	 * The coding in this table requires the driver to implement the
 496	 * following FSM arcs:
 497	 *         RESUMING -> STOP
 498	 *         STOP -> RESUMING
 499	 *         STOP -> STOP_COPY
 500	 *         STOP_COPY -> STOP
 501	 *
 502	 * If P2P is supported then the driver must also implement these FSM
 503	 * arcs:
 504	 *         RUNNING -> RUNNING_P2P
 505	 *         RUNNING_P2P -> RUNNING
 506	 *         RUNNING_P2P -> STOP
 507	 *         STOP -> RUNNING_P2P
 508	 *
 509	 * If precopy is supported then the driver must support these additional
 510	 * FSM arcs:
 511	 *         RUNNING -> PRE_COPY
 512	 *         PRE_COPY -> RUNNING
 513	 *         PRE_COPY -> STOP_COPY
 514	 * However, if precopy and P2P are supported together then the driver
 515	 * must support these additional arcs beyond the P2P arcs above:
 516	 *         PRE_COPY -> RUNNING
 517	 *         PRE_COPY -> PRE_COPY_P2P
 518	 *         PRE_COPY_P2P -> PRE_COPY
 519	 *         PRE_COPY_P2P -> RUNNING_P2P
 520	 *         PRE_COPY_P2P -> STOP_COPY
 521	 *         RUNNING -> PRE_COPY
 522	 *         RUNNING_P2P -> PRE_COPY_P2P
 523	 *
 524	 * Without P2P and precopy the driver must implement:
 525	 *         RUNNING -> STOP
 526	 *         STOP -> RUNNING
 527	 *
 528	 * The coding will step through multiple states for some combination
 529	 * transitions; if all optional features are supported, this means the
 530	 * following ones:
 531	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
 532	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
 533	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
 534	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 535	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
 536	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
 537	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
 538	 *         RESUMING -> STOP -> RUNNING_P2P
 539	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
 540	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
 541	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 542	 *         RESUMING -> STOP -> STOP_COPY
 543	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
 544	 *         RUNNING -> RUNNING_P2P -> STOP
 545	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
 546	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
 547	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
 548	 *         RUNNING_P2P -> STOP -> RESUMING
 549	 *         RUNNING_P2P -> STOP -> STOP_COPY
 550	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
 551	 *         STOP -> RUNNING_P2P -> RUNNING
 552	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
 553	 *         STOP_COPY -> STOP -> RESUMING
 554	 *         STOP_COPY -> STOP -> RUNNING_P2P
 555	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
 556	 *
 557	 *  The following transitions are blocked:
 558	 *         STOP_COPY -> PRE_COPY
 559	 *         STOP_COPY -> PRE_COPY_P2P
 560	 */
 561	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
 562		[VFIO_DEVICE_STATE_STOP] = {
 563			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 564			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 565			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 566			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 567			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 568			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 569			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 570			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 571		},
 572		[VFIO_DEVICE_STATE_RUNNING] = {
 573			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 574			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 575			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 576			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 577			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
 578			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 579			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 580			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 581		},
 582		[VFIO_DEVICE_STATE_PRE_COPY] = {
 583			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
 584			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 585			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 586			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 587			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 588			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
 589			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
 590			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 591		},
 592		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
 593			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
 594			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 595			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
 596			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 597			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 598			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
 599			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 600			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 601		},
 602		[VFIO_DEVICE_STATE_STOP_COPY] = {
 603			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 604			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 605			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 606			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 607			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
 608			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 609			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 610			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 611		},
 612		[VFIO_DEVICE_STATE_RESUMING] = {
 613			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 614			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
 615			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
 616			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
 617			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 618			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
 619			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
 620			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 621		},
 622		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
 623			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
 624			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
 625			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
 626			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
 627			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
 628			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
 629			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
 630			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 631		},
 632		[VFIO_DEVICE_STATE_ERROR] = {
 633			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
 634			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
 635			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
 636			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
 637			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
 638			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
 639			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
 640			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
 641		},
 642	};
 643
 644	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
 645		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
 646		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
 647		[VFIO_DEVICE_STATE_PRE_COPY] =
 648			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
 649		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
 650						   VFIO_MIGRATION_P2P |
 651						   VFIO_MIGRATION_PRE_COPY,
 652		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
 653		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
 654		[VFIO_DEVICE_STATE_RUNNING_P2P] =
 655			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
 656		[VFIO_DEVICE_STATE_ERROR] = ~0U,
 657	};
 658
 659	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 660		    (state_flags_table[cur_fsm] & device->migration_flags) !=
 661			state_flags_table[cur_fsm]))
 662		return -EINVAL;
 663
 664	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
 665	   (state_flags_table[new_fsm] & device->migration_flags) !=
 666			state_flags_table[new_fsm])
 667		return -EINVAL;
 668
 669	/*
 670	 * Arcs touching optional and unsupported states are skipped over. The
 671	 * driver will instead see an arc from the original state to the next
 672	 * logical state, as per the above comment.
 673	 */
 674	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
 675	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
 676			state_flags_table[*next_fsm])
 677		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
 678
 679	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
 680}
 681EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
 682
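/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the usual way a driver's migration_set_state() callback consumes
 * vfio_mig_get_next_state(), stepping one FSM arc at a time until the
 * requested state is reached.  my_driver_device, its mig_state field and
 * my_driver_step_state() are assumptions for illustration only.
 */
static struct file *
my_driver_set_state(struct my_driver_device *mdev,
		    enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	while (mdev->mig_state != new_state) {
		ret = vfio_mig_get_next_state(&mdev->vdev, mdev->mig_state,
					      new_state, &next_state);
		if (ret)
			return ERR_PTR(ret);

		/* Perform the single arc mig_state -> next_state */
		res = my_driver_step_state(mdev, next_state);
		if (IS_ERR(res))
			return res;

		mdev->mig_state = next_state;
	}
	return res;
}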
 683/*
  684 * Convert the driver's struct file into an FD number and return it to userspace
 685 */
 686static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
 687				   struct vfio_device_feature_mig_state *mig)
 688{
 689	int ret;
 690	int fd;
 691
 692	fd = get_unused_fd_flags(O_CLOEXEC);
 693	if (fd < 0) {
 694		ret = fd;
 695		goto out_fput;
 696	}
 697
 698	mig->data_fd = fd;
 699	if (copy_to_user(arg, mig, sizeof(*mig))) {
 700		ret = -EFAULT;
 701		goto out_put_unused;
 702	}
 703	fd_install(fd, filp);
 704	return 0;
 705
 706out_put_unused:
 707	put_unused_fd(fd);
 708out_fput:
 709	fput(filp);
 710	return ret;
 711}
 712
 713static int
 714vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 715					   u32 flags, void __user *arg,
 716					   size_t argsz)
 717{
 718	size_t minsz =
 719		offsetofend(struct vfio_device_feature_mig_state, data_fd);
 720	struct vfio_device_feature_mig_state mig;
 721	struct file *filp = NULL;
 722	int ret;
 723
 724	if (!device->mig_ops)
 725		return -ENOTTY;
 726
 727	ret = vfio_check_feature(flags, argsz,
 728				 VFIO_DEVICE_FEATURE_SET |
 729				 VFIO_DEVICE_FEATURE_GET,
 730				 sizeof(mig));
 731	if (ret != 1)
 732		return ret;
 733
 734	if (copy_from_user(&mig, arg, minsz))
 735		return -EFAULT;
 736
 737	if (flags & VFIO_DEVICE_FEATURE_GET) {
 738		enum vfio_device_mig_state curr_state;
 739
 740		ret = device->mig_ops->migration_get_state(device,
 741							   &curr_state);
 742		if (ret)
 743			return ret;
 744		mig.device_state = curr_state;
 745		goto out_copy;
 746	}
 747
 748	/* Handle the VFIO_DEVICE_FEATURE_SET */
 749	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 750	if (IS_ERR(filp) || !filp)
 751		goto out_copy;
 752
 753	return vfio_ioct_mig_return_fd(filp, arg, &mig);
 754out_copy:
 755	mig.data_fd = -1;
 756	if (copy_to_user(arg, &mig, sizeof(mig)))
 757		return -EFAULT;
 758	if (IS_ERR(filp))
 759		return PTR_ERR(filp);
 760	return 0;
 761}
 762
 763static int
 764vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
 765					      u32 flags, void __user *arg,
 766					      size_t argsz)
 767{
 768	struct vfio_device_feature_mig_data_size data_size = {};
 769	unsigned long stop_copy_length;
 770	int ret;
 771
 772	if (!device->mig_ops)
 773		return -ENOTTY;
 774
 775	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 776				 sizeof(data_size));
 777	if (ret != 1)
 778		return ret;
 779
 780	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
 781	if (ret)
 782		return ret;
 783
 784	data_size.stop_copy_length = stop_copy_length;
 785	if (copy_to_user(arg, &data_size, sizeof(data_size)))
 786		return -EFAULT;
 787
 788	return 0;
 789}
 790
 791static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 792					       u32 flags, void __user *arg,
 793					       size_t argsz)
 794{
 795	struct vfio_device_feature_migration mig = {
 796		.flags = device->migration_flags,
 797	};
 798	int ret;
 799
 800	if (!device->mig_ops)
 801		return -ENOTTY;
 802
 803	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 804				 sizeof(mig));
 805	if (ret != 1)
 806		return ret;
 807	if (copy_to_user(arg, &mig, sizeof(mig)))
 808		return -EFAULT;
 809	return 0;
 810}
 811
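/*
 * Illustrative sketch (hypothetical userspace code, not part of this file):
 * querying the migration flags through the VFIO_DEVICE_FEATURE ioctl on an
 * already-open VFIO device file descriptor.
 */
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_migration_flags(int device_fd, __u64 *flags)
{
	struct {
		struct vfio_device_feature hdr;
		struct vfio_device_feature_migration mig;
	} feature = {
		.hdr.argsz = sizeof(feature),
		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
			     VFIO_DEVICE_FEATURE_MIGRATION,
	};

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature))
		return -1;

	*flags = feature.mig.flags;	/* VFIO_MIGRATION_STOP_COPY etc. */
	return 0;
}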
 812/* Ranges should fit into a single kernel page */
 813#define LOG_MAX_RANGES \
 814	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
 815
 816static int
 817vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
 818					u32 flags, void __user *arg,
 819					size_t argsz)
 820{
 821	size_t minsz =
 822		offsetofend(struct vfio_device_feature_dma_logging_control,
 823			    ranges);
 824	struct vfio_device_feature_dma_logging_range __user *ranges;
 825	struct vfio_device_feature_dma_logging_control control;
 826	struct vfio_device_feature_dma_logging_range range;
 827	struct rb_root_cached root = RB_ROOT_CACHED;
 828	struct interval_tree_node *nodes;
 829	u64 iova_end;
 830	u32 nnodes;
 831	int i, ret;
 832
 833	if (!device->log_ops)
 834		return -ENOTTY;
 835
 836	ret = vfio_check_feature(flags, argsz,
 837				 VFIO_DEVICE_FEATURE_SET,
 838				 sizeof(control));
 839	if (ret != 1)
 840		return ret;
 841
 842	if (copy_from_user(&control, arg, minsz))
 843		return -EFAULT;
 844
 845	nnodes = control.num_ranges;
 846	if (!nnodes)
 847		return -EINVAL;
 848
 849	if (nnodes > LOG_MAX_RANGES)
 850		return -E2BIG;
 851
 852	ranges = u64_to_user_ptr(control.ranges);
 853	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
 854			      GFP_KERNEL);
 855	if (!nodes)
 856		return -ENOMEM;
 857
 858	for (i = 0; i < nnodes; i++) {
 859		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
 860			ret = -EFAULT;
 861			goto end;
 862		}
 863		if (!IS_ALIGNED(range.iova, control.page_size) ||
 864		    !IS_ALIGNED(range.length, control.page_size)) {
 865			ret = -EINVAL;
 866			goto end;
 867		}
 868
 869		if (check_add_overflow(range.iova, range.length, &iova_end) ||
 870		    iova_end > ULONG_MAX) {
 871			ret = -EOVERFLOW;
 872			goto end;
 873		}
 874
 875		nodes[i].start = range.iova;
 876		nodes[i].last = range.iova + range.length - 1;
 877		if (interval_tree_iter_first(&root, nodes[i].start,
 878					     nodes[i].last)) {
 879			/* Range overlapping */
 880			ret = -EINVAL;
 881			goto end;
 882		}
 883		interval_tree_insert(nodes + i, &root);
 884	}
 885
 886	ret = device->log_ops->log_start(device, &root, nnodes,
 887					 &control.page_size);
 888	if (ret)
 889		goto end;
 890
 891	if (copy_to_user(arg, &control, sizeof(control))) {
 892		ret = -EFAULT;
 893		device->log_ops->log_stop(device);
 894	}
 895
 896end:
 897	kfree(nodes);
 898	return ret;
 899}
 900
 901static int
 902vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
 903				       u32 flags, void __user *arg,
 904				       size_t argsz)
 905{
 906	int ret;
 907
 908	if (!device->log_ops)
 909		return -ENOTTY;
 910
 911	ret = vfio_check_feature(flags, argsz,
 912				 VFIO_DEVICE_FEATURE_SET, 0);
 913	if (ret != 1)
 914		return ret;
 915
 916	return device->log_ops->log_stop(device);
 917}
 918
 919static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
 920					  unsigned long iova, size_t length,
 921					  void *opaque)
 922{
 923	struct vfio_device *device = opaque;
 924
 925	return device->log_ops->log_read_and_clear(device, iova, length, iter);
 926}
 927
 928static int
 929vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
 930					 u32 flags, void __user *arg,
 931					 size_t argsz)
 932{
 933	size_t minsz =
 934		offsetofend(struct vfio_device_feature_dma_logging_report,
 935			    bitmap);
 936	struct vfio_device_feature_dma_logging_report report;
 937	struct iova_bitmap *iter;
 938	u64 iova_end;
 939	int ret;
 940
 941	if (!device->log_ops)
 942		return -ENOTTY;
 943
 944	ret = vfio_check_feature(flags, argsz,
 945				 VFIO_DEVICE_FEATURE_GET,
 946				 sizeof(report));
 947	if (ret != 1)
 948		return ret;
 949
 950	if (copy_from_user(&report, arg, minsz))
 951		return -EFAULT;
 952
 953	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
 954		return -EINVAL;
 955
 956	if (check_add_overflow(report.iova, report.length, &iova_end) ||
 957	    iova_end > ULONG_MAX)
 958		return -EOVERFLOW;
 959
 960	iter = iova_bitmap_alloc(report.iova, report.length,
 961				 report.page_size,
 962				 u64_to_user_ptr(report.bitmap));
 963	if (IS_ERR(iter))
 964		return PTR_ERR(iter);
 965
 966	ret = iova_bitmap_for_each(iter, device,
 967				   vfio_device_log_read_and_clear);
 968
 969	iova_bitmap_free(iter);
 970	return ret;
 971}
 972
 973static int vfio_ioctl_device_feature(struct vfio_device *device,
 974				     struct vfio_device_feature __user *arg)
 975{
 976	size_t minsz = offsetofend(struct vfio_device_feature, flags);
 977	struct vfio_device_feature feature;
 978
 979	if (copy_from_user(&feature, arg, minsz))
 980		return -EFAULT;
 981
 982	if (feature.argsz < minsz)
 983		return -EINVAL;
 984
 985	/* Check unknown flags */
 986	if (feature.flags &
 987	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
 988	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
 989		return -EINVAL;
 990
 991	/* GET & SET are mutually exclusive except with PROBE */
 992	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
 993	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
 994	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
 995		return -EINVAL;
 996
 997	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
 998	case VFIO_DEVICE_FEATURE_MIGRATION:
 999		return vfio_ioctl_device_feature_migration(
1000			device, feature.flags, arg->data,
1001			feature.argsz - minsz);
1002	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1003		return vfio_ioctl_device_feature_mig_device_state(
1004			device, feature.flags, arg->data,
1005			feature.argsz - minsz);
1006	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1007		return vfio_ioctl_device_feature_logging_start(
1008			device, feature.flags, arg->data,
1009			feature.argsz - minsz);
1010	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1011		return vfio_ioctl_device_feature_logging_stop(
1012			device, feature.flags, arg->data,
1013			feature.argsz - minsz);
1014	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1015		return vfio_ioctl_device_feature_logging_report(
1016			device, feature.flags, arg->data,
1017			feature.argsz - minsz);
1018	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1019		return vfio_ioctl_device_feature_migration_data_size(
1020			device, feature.flags, arg->data,
1021			feature.argsz - minsz);
1022	default:
1023		if (unlikely(!device->ops->device_feature))
1024			return -EINVAL;
1025		return device->ops->device_feature(device, feature.flags,
1026						   arg->data,
1027						   feature.argsz - minsz);
1028	}
1029}
1030
1031static long vfio_device_fops_unl_ioctl(struct file *filep,
1032				       unsigned int cmd, unsigned long arg)
1033{
1034	struct vfio_device *device = filep->private_data;
1035	int ret;
1036
1037	ret = vfio_device_pm_runtime_get(device);
1038	if (ret)
1039		return ret;
1040
1041	switch (cmd) {
1042	case VFIO_DEVICE_FEATURE:
1043		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1044		break;
1045
1046	default:
1047		if (unlikely(!device->ops->ioctl))
1048			ret = -EINVAL;
1049		else
1050			ret = device->ops->ioctl(device, cmd, arg);
1051		break;
1052	}
1053
1054	vfio_device_pm_runtime_put(device);
1055	return ret;
1056}
1057
1058static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1059				     size_t count, loff_t *ppos)
1060{
1061	struct vfio_device *device = filep->private_data;
1062
1063	if (unlikely(!device->ops->read))
1064		return -EINVAL;
1065
1066	return device->ops->read(device, buf, count, ppos);
1067}
1068
1069static ssize_t vfio_device_fops_write(struct file *filep,
1070				      const char __user *buf,
1071				      size_t count, loff_t *ppos)
1072{
1073	struct vfio_device *device = filep->private_data;
1074
1075	if (unlikely(!device->ops->write))
1076		return -EINVAL;
1077
1078	return device->ops->write(device, buf, count, ppos);
1079}
1080
1081static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1082{
1083	struct vfio_device *device = filep->private_data;
1084
1085	if (unlikely(!device->ops->mmap))
1086		return -EINVAL;
1087
1088	return device->ops->mmap(device, vma);
1089}
1090
1091const struct file_operations vfio_device_fops = {
1092	.owner		= THIS_MODULE,
1093	.release	= vfio_device_fops_release,
1094	.read		= vfio_device_fops_read,
1095	.write		= vfio_device_fops_write,
1096	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1097	.compat_ioctl	= compat_ptr_ioctl,
1098	.mmap		= vfio_device_fops_mmap,
1099};
1100
1101/*
1102 * Sub-module support
1103 */
1104/*
1105 * Helper for managing a buffer of info chain capabilities, allocate or
1106 * reallocate a buffer with additional @size, filling in @id and @version
1107 * of the capability.  A pointer to the new capability is returned.
1108 *
1109 * NB. The chain is based at the head of the buffer, so new entries are
 1110 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1111 * next offsets prior to copying to the user buffer.
1112 */
1113struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1114					       size_t size, u16 id, u16 version)
1115{
1116	void *buf;
1117	struct vfio_info_cap_header *header, *tmp;
1118
1119	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1120	if (!buf) {
1121		kfree(caps->buf);
1122		caps->buf = NULL;
1123		caps->size = 0;
1124		return ERR_PTR(-ENOMEM);
1125	}
1126
1127	caps->buf = buf;
1128	header = buf + caps->size;
1129
1130	/* Eventually copied to user buffer, zero */
1131	memset(header, 0, size);
1132
1133	header->id = id;
1134	header->version = version;
1135
1136	/* Add to the end of the capability chain */
1137	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1138		; /* nothing */
1139
1140	tmp->next = caps->size;
1141	caps->size += size;
1142
1143	return header;
1144}
1145EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1146
1147void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1148{
1149	struct vfio_info_cap_header *tmp;
1150	void *buf = (void *)caps->buf;
1151
1152	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1153		tmp->next += offset;
1154}
1155EXPORT_SYMBOL(vfio_info_cap_shift);
1156
1157int vfio_info_add_capability(struct vfio_info_cap *caps,
1158			     struct vfio_info_cap_header *cap, size_t size)
1159{
1160	struct vfio_info_cap_header *header;
1161
1162	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1163	if (IS_ERR(header))
1164		return PTR_ERR(header);
1165
1166	memcpy(header + 1, cap + 1, size - sizeof(*header));
1167
1168	return 0;
1169}
1170EXPORT_SYMBOL(vfio_info_add_capability);
1171
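/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the usual pattern for returning a capability chain from an info ioctl,
 * loosely following what vfio-pci does for region info.  my_vendor_cap and
 * its ID value are assumptions for illustration only.
 */
static int my_driver_fill_region_caps(struct vfio_region_info *info,
				      void __user *arg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct my_vendor_cap {
		struct vfio_info_cap_header header;
		__u32 payload;
	} cap = {
		.header.id = 0xf0,	/* hypothetical vendor capability ID */
		.header.version = 1,
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
	if (ret)
		return ret;

	if (caps.size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info->argsz < sizeof(*info) + caps.size) {
			/* Tell userspace how much space is really needed */
			info->argsz = sizeof(*info) + caps.size;
			info->cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(*info));
			if (copy_to_user(arg + sizeof(*info), caps.buf,
					 caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info->cap_offset = sizeof(*info);
		}
		kfree(caps.buf);
	}
	/* The caller then copies *info back to userspace as usual */
	return 0;
}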
1172int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1173				       int max_irq_type, size_t *data_size)
1174{
1175	unsigned long minsz;
1176	size_t size;
1177
1178	minsz = offsetofend(struct vfio_irq_set, count);
1179
1180	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1181	    (hdr->count >= (U32_MAX - hdr->start)) ||
1182	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1183				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1184		return -EINVAL;
1185
1186	if (data_size)
1187		*data_size = 0;
1188
1189	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1190		return -EINVAL;
1191
1192	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1193	case VFIO_IRQ_SET_DATA_NONE:
1194		size = 0;
1195		break;
1196	case VFIO_IRQ_SET_DATA_BOOL:
1197		size = sizeof(uint8_t);
1198		break;
1199	case VFIO_IRQ_SET_DATA_EVENTFD:
1200		size = sizeof(int32_t);
1201		break;
1202	default:
1203		return -EINVAL;
1204	}
1205
1206	if (size) {
1207		if (hdr->argsz - minsz < hdr->count * size)
1208			return -EINVAL;
1209
1210		if (!data_size)
1211			return -EINVAL;
1212
1213		*data_size = hdr->count * size;
1214	}
1215
1216	return 0;
1217}
1218EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1219
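/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * how a VFIO_DEVICE_SET_IRQS handler typically uses the helper above,
 * loosely following the vfio-pci pattern.  my_driver_configure_irqs(),
 * MY_DRIVER_NUM_IRQS and MY_DRIVER_NUM_IRQ_TYPES are assumptions for
 * illustration only.
 */
static int my_driver_set_irqs_ioctl(struct vfio_device *vdev,
				    unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, MY_DRIVER_NUM_IRQS,
						 MY_DRIVER_NUM_IRQ_TYPES,
						 &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_driver_configure_irqs(vdev, &hdr, data);
	kfree(data);
	return ret;
}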
1220/*
1221 * Pin contiguous user pages and return their associated host pages for local
1222 * domain only.
1223 * @device [in]  : device
1224 * @iova [in]    : starting IOVA of user pages to be pinned.
1225 * @npage [in]   : count of pages to be pinned.  This count should not
1226 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1227 * @prot [in]    : protection flags
1228 * @pages[out]   : array of host pages
1229 * Return error or number of pages pinned.
1230 *
1231 * A driver may only call this function if the vfio_device was created
 1232 * by vfio_register_emulated_iommu_dev(); the restriction comes from vfio_device_container_pin_pages().
1233 */
1234int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1235		   int npage, int prot, struct page **pages)
1236{
1237	/* group->container cannot change while a vfio device is open */
1238	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1239		return -EINVAL;
1240	if (vfio_device_has_container(device))
1241		return vfio_device_container_pin_pages(device, iova,
1242						       npage, prot, pages);
1243	if (device->iommufd_access) {
1244		int ret;
1245
1246		if (iova > ULONG_MAX)
1247			return -EINVAL;
1248		/*
1249		 * VFIO ignores the sub page offset, npages is from the start of
1250		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1251		 * the sub page offset by doing:
1252		 *     pages[0] + (iova % PAGE_SIZE)
1253		 */
1254		ret = iommufd_access_pin_pages(
1255			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1256			npage * PAGE_SIZE, pages,
1257			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1258		if (ret)
1259			return ret;
1260		return npage;
1261	}
1262	return -EINVAL;
1263}
1264EXPORT_SYMBOL(vfio_pin_pages);
1265
1266/*
1267 * Unpin contiguous host pages for local domain only.
1268 * @device [in]  : device
1269 * @iova [in]    : starting address of user pages to be unpinned.
1270 * @npage [in]   : count of pages to be unpinned.  This count should not
1271 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1272 */
1273void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1274{
1275	if (WARN_ON(!vfio_assert_device_open(device)))
1276		return;
1277
1278	if (vfio_device_has_container(device)) {
1279		vfio_device_container_unpin_pages(device, iova, npage);
1280		return;
1281	}
1282	if (device->iommufd_access) {
1283		if (WARN_ON(iova > ULONG_MAX))
1284			return;
1285		iommufd_access_unpin_pages(device->iommufd_access,
1286					   ALIGN_DOWN(iova, PAGE_SIZE),
1287					   npage * PAGE_SIZE);
1288		return;
1289	}
1290}
1291EXPORT_SYMBOL(vfio_unpin_pages);
1292
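/*
 * Illustrative sketch (hypothetical emulated-IOMMU driver code, not part of
 * this file): pin the single page backing a guest IOVA, access it through a
 * temporary kernel mapping, then unpin it.  The sub-page offset handling
 * follows the comment in vfio_pin_pages() above; my_driver_peek_guest_byte()
 * is an assumption for illustration only.
 */
static int my_driver_peek_guest_byte(struct vfio_device *vdev,
				     dma_addr_t iova, u8 *val)
{
	struct page *page;
	void *vaddr;
	int ret;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	vaddr = kmap_local_page(page);
	*val = *((u8 *)vaddr + offset_in_page(iova));
	kunmap_local(vaddr);

	vfio_unpin_pages(vdev, iova, 1);
	return 0;
}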
1293/*
 1294 * This interface allows the CPUs to perform a form of virtual DMA on
1295 * behalf of the device.
1296 *
1297 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1298 * into/from a kernel buffer.
1299 *
1300 * As the read/write of user space memory is conducted via the CPUs and is
1301 * not a real device DMA, it is not necessary to pin the user space memory.
1302 *
1303 * @device [in]		: VFIO device
1304 * @iova [in]		: base IOVA of a user space buffer
1305 * @data [in]		: pointer to kernel buffer
1306 * @len [in]		: kernel buffer length
1307 * @write		: indicate read or write
1308 * Return error code on failure or 0 on success.
1309 */
1310int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1311		size_t len, bool write)
1312{
1313	if (!data || len <= 0 || !vfio_assert_device_open(device))
1314		return -EINVAL;
1315
1316	if (vfio_device_has_container(device))
1317		return vfio_device_container_dma_rw(device, iova,
1318						    data, len, write);
1319
1320	if (device->iommufd_access) {
1321		unsigned int flags = 0;
1322
1323		if (iova > ULONG_MAX)
1324			return -EINVAL;
1325
1326		/* VFIO historically tries to auto-detect a kthread */
1327		if (!current->mm)
1328			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1329		if (write)
1330			flags |= IOMMUFD_ACCESS_RW_WRITE;
1331		return iommufd_access_rw(device->iommufd_access, iova, data,
1332					 len, flags);
1333	}
1334	return -EINVAL;
1335}
1336EXPORT_SYMBOL(vfio_dma_rw);
1337
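/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * read a small descriptor that the guest placed at @iova into a kernel
 * buffer without pinning the backing pages.  my_ring_desc and
 * my_driver_fetch_desc() are assumptions for illustration only.
 */
struct my_ring_desc {		/* hypothetical guest-visible layout */
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static int my_driver_fetch_desc(struct vfio_device *vdev, dma_addr_t iova,
				struct my_ring_desc *desc)
{
	return vfio_dma_rw(vdev, iova, desc, sizeof(*desc), false);
}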
1338/*
1339 * Module/class support
1340 */
1341static int __init vfio_init(void)
1342{
1343	int ret;
1344
1345	ida_init(&vfio.device_ida);
1346
1347	ret = vfio_group_init();
1348	if (ret)
1349		return ret;
1350
1351	ret = vfio_virqfd_init();
1352	if (ret)
1353		goto err_virqfd;
1354
1355	/* /sys/class/vfio-dev/vfioX */
1356	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1357	if (IS_ERR(vfio.device_class)) {
1358		ret = PTR_ERR(vfio.device_class);
1359		goto err_dev_class;
1360	}
1361
1362	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1363	return 0;
1364
1365err_dev_class:
1366	vfio_virqfd_exit();
1367err_virqfd:
1368	vfio_group_cleanup();
1369	return ret;
1370}
1371
1372static void __exit vfio_cleanup(void)
1373{
1374	ida_destroy(&vfio.device_ida);
1375	class_destroy(vfio.device_class);
1376	vfio.device_class = NULL;
1377	vfio_virqfd_exit();
1378	vfio_group_cleanup();
1379	xa_destroy(&vfio_device_set_xa);
1380}
1381
1382module_init(vfio_init);
1383module_exit(vfio_cleanup);
1384
1385MODULE_VERSION(DRIVER_VERSION);
1386MODULE_LICENSE("GPL v2");
1387MODULE_AUTHOR(DRIVER_AUTHOR);
1388MODULE_DESCRIPTION(DRIVER_DESC);
1389MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");