v5.4
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION	"0.3"
  37#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC	"VFIO - User Level meta-driver"
  39
  40static struct vfio {
  41	struct class			*class;
  42	struct list_head		iommu_drivers_list;
  43	struct mutex			iommu_drivers_lock;
  44	struct list_head		group_list;
  45	struct idr			group_idr;
  46	struct mutex			group_lock;
  47	struct cdev			group_cdev;
  48	dev_t				group_devt;
  49	wait_queue_head_t		release_q;
  50} vfio;
  51
  52struct vfio_iommu_driver {
  53	const struct vfio_iommu_driver_ops	*ops;
  54	struct list_head			vfio_next;
  55};
  56
  57struct vfio_container {
  58	struct kref			kref;
  59	struct list_head		group_list;
  60	struct rw_semaphore		group_lock;
  61	struct vfio_iommu_driver	*iommu_driver;
  62	void				*iommu_data;
  63	bool				noiommu;
  64};
  65
  66struct vfio_unbound_dev {
  67	struct device			*dev;
  68	struct list_head		unbound_next;
  69};
  70
  71struct vfio_group {
  72	struct kref			kref;
  73	int				minor;
  74	atomic_t			container_users;
  75	struct iommu_group		*iommu_group;
  76	struct vfio_container		*container;
  77	struct list_head		device_list;
  78	struct mutex			device_lock;
  79	struct device			*dev;
  80	struct notifier_block		nb;
  81	struct list_head		vfio_next;
  82	struct list_head		container_next;
  83	struct list_head		unbound_list;
  84	struct mutex			unbound_lock;
  85	atomic_t			opened;
  86	wait_queue_head_t		container_q;
  87	bool				noiommu;
  88	struct kvm			*kvm;
  89	struct blocking_notifier_head	notifier;
  90};
  91
  92struct vfio_device {
  93	struct kref			kref;
  94	struct device			*dev;
  95	const struct vfio_device_ops	*ops;
  96	struct vfio_group		*group;
  97	struct list_head		group_next;
  98	void				*device_data;
  99};
 100
 101#ifdef CONFIG_VFIO_NOIOMMU
 102static bool noiommu __read_mostly;
 103module_param_named(enable_unsafe_noiommu_mode,
 104		   noiommu, bool, S_IRUGO | S_IWUSR);
 105MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 106#endif
 107
 108/*
 109 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 110 * and remove functions, any use cases other than acquiring the first
 111 * reference for the purpose of calling vfio_add_group_dev() or removing
 112 * that symmetric reference after vfio_del_group_dev() should use the raw
 113 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 114 * removes the device from the dummy group and cannot be nested.
 115 */
 116struct iommu_group *vfio_iommu_group_get(struct device *dev)
 117{
 118	struct iommu_group *group;
 119	int __maybe_unused ret;
 120
 121	group = iommu_group_get(dev);
 122
 123#ifdef CONFIG_VFIO_NOIOMMU
 124	/*
 125	 * With noiommu enabled, an IOMMU group will be created for a device
 126	 * that doesn't already have one and doesn't have an iommu_ops on their
 127	 * bus.  We set iommudata simply to be able to identify these groups
 128	 * as special use and for reclamation later.
 129	 */
 130	if (group || !noiommu || iommu_present(dev->bus))
 131		return group;
 132
 133	group = iommu_group_alloc();
 134	if (IS_ERR(group))
 135		return NULL;
 136
 137	iommu_group_set_name(group, "vfio-noiommu");
 138	iommu_group_set_iommudata(group, &noiommu, NULL);
 139	ret = iommu_group_add_device(group, dev);
 140	if (ret) {
 141		iommu_group_put(group);
 142		return NULL;
 143	}
 144
 145	/*
 146	 * Where to taint?  At this point we've added an IOMMU group for a
 147	 * device that is not backed by iommu_ops, therefore any iommu_
 148	 * callback using iommu_ops can legitimately Oops.  So, while we may
 149	 * be about to give a DMA capable device to a user without IOMMU
 150	 * protection, which is clearly taint-worthy, let's go ahead and do
 151	 * it here.
 152	 */
 153	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155#endif
 156
 157	return group;
 158}
 159EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160
 161void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162{
 163#ifdef CONFIG_VFIO_NOIOMMU
 164	if (iommu_group_get_iommudata(group) == &noiommu)
 165		iommu_group_remove_device(dev);
 166#endif
 167
 168	iommu_group_put(group);
 169}
 170EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
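/*
 * Illustrative sketch (not part of this file): how a VFIO bus driver would
 * typically pair vfio_iommu_group_get()/vfio_iommu_group_put() with
 * vfio_add_group_dev()/vfio_del_group_dev() in its probe and remove paths,
 * as the comment above vfio_iommu_group_get() describes.  The names
 * example_vfio_device and example_vfio_ops are hypothetical placeholders.
 */
static int example_vfio_probe(struct device *dev)
{
	struct iommu_group *group;
	struct example_vfio_device *vdev;
	int ret;

	/* First reference; may create a no-iommu group when enabled */
	group = vfio_iommu_group_get(dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		vfio_iommu_group_put(group, dev);
		return -ENOMEM;
	}

	ret = vfio_add_group_dev(dev, &example_vfio_ops, vdev);
	if (ret) {
		kfree(vdev);
		vfio_iommu_group_put(group, dev);
	}
	return ret;
}

static void example_vfio_remove(struct device *dev)
{
	/* Returns the device_data that was passed to vfio_add_group_dev() */
	struct example_vfio_device *vdev = vfio_del_group_dev(dev);

	kfree(vdev);
	/* Drop the symmetric reference taken in probe */
	vfio_iommu_group_put(dev->iommu_group, dev);
}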
 171
 172#ifdef CONFIG_VFIO_NOIOMMU
 173static void *vfio_noiommu_open(unsigned long arg)
 174{
 175	if (arg != VFIO_NOIOMMU_IOMMU)
 176		return ERR_PTR(-EINVAL);
 177	if (!capable(CAP_SYS_RAWIO))
 178		return ERR_PTR(-EPERM);
 179
 180	return NULL;
 181}
 182
 183static void vfio_noiommu_release(void *iommu_data)
 184{
 185}
 186
 187static long vfio_noiommu_ioctl(void *iommu_data,
 188			       unsigned int cmd, unsigned long arg)
 189{
 190	if (cmd == VFIO_CHECK_EXTENSION)
 191		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192
 193	return -ENOTTY;
 194}
 195
 196static int vfio_noiommu_attach_group(void *iommu_data,
 197				     struct iommu_group *iommu_group)
 198{
 199	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200}
 201
 202static void vfio_noiommu_detach_group(void *iommu_data,
 203				      struct iommu_group *iommu_group)
 204{
 205}
 206
 207static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208	.name = "vfio-noiommu",
 209	.owner = THIS_MODULE,
 210	.open = vfio_noiommu_open,
 211	.release = vfio_noiommu_release,
 212	.ioctl = vfio_noiommu_ioctl,
 213	.attach_group = vfio_noiommu_attach_group,
 214	.detach_group = vfio_noiommu_detach_group,
 215};
 216#endif
 217
 218
 219/**
 220 * IOMMU driver registration
 221 */
 222int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223{
 224	struct vfio_iommu_driver *driver, *tmp;
 225
 226	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227	if (!driver)
 228		return -ENOMEM;
 229
 230	driver->ops = ops;
 231
 232	mutex_lock(&vfio.iommu_drivers_lock);
 233
 234	/* Check for duplicates */
 235	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236		if (tmp->ops == ops) {
 237			mutex_unlock(&vfio.iommu_drivers_lock);
 238			kfree(driver);
 239			return -EINVAL;
 240		}
 241	}
 242
 243	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244
 245	mutex_unlock(&vfio.iommu_drivers_lock);
 246
 247	return 0;
 248}
 249EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250
 251void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252{
 253	struct vfio_iommu_driver *driver;
 254
 255	mutex_lock(&vfio.iommu_drivers_lock);
 256	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257		if (driver->ops == ops) {
 258			list_del(&driver->vfio_next);
 259			mutex_unlock(&vfio.iommu_drivers_lock);
 260			kfree(driver);
 261			return;
 262		}
 263	}
 264	mutex_unlock(&vfio.iommu_drivers_lock);
 265}
 266EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
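/*
 * Illustrative sketch (not part of this file): an IOMMU backend such as
 * vfio_iommu_type1 registers its vfio_iommu_driver_ops at module init and
 * unregisters it on exit.  "example_iommu_ops" and its callbacks are
 * hypothetical; vfio_noiommu_ops above shows the callbacks such a structure
 * provides.
 */
static const struct vfio_iommu_driver_ops example_iommu_ops = {
	.name		= "vfio-example",
	.owner		= THIS_MODULE,
	/* .open, .release, .ioctl, .attach_group, .detach_group, ... */
};

static int __init example_iommu_init(void)
{
	return vfio_register_iommu_driver(&example_iommu_ops);
}

static void __exit example_iommu_exit(void)
{
	vfio_unregister_iommu_driver(&example_iommu_ops);
}

module_init(example_iommu_init);
module_exit(example_iommu_exit);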
 267
 268/**
 269 * Group minor allocation/free - both called with vfio.group_lock held
 270 */
 271static int vfio_alloc_group_minor(struct vfio_group *group)
 272{
 273	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274}
 275
 276static void vfio_free_group_minor(int minor)
 277{
 278	idr_remove(&vfio.group_idr, minor);
 279}
 280
 281static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282				     unsigned long action, void *data);
 283static void vfio_group_get(struct vfio_group *group);
 284
 285/**
 286 * Container objects - containers are created when /dev/vfio/vfio is
 287 * opened, but their lifecycle extends until the last user is done, so
 288 * it's freed via kref.  Must support container/group/device being
 289 * closed in any order.
 290 */
 291static void vfio_container_get(struct vfio_container *container)
 292{
 293	kref_get(&container->kref);
 294}
 295
 296static void vfio_container_release(struct kref *kref)
 297{
 298	struct vfio_container *container;
 299	container = container_of(kref, struct vfio_container, kref);
 300
 301	kfree(container);
 302}
 303
 304static void vfio_container_put(struct vfio_container *container)
 305{
 306	kref_put(&container->kref, vfio_container_release);
 307}
 308
 309static void vfio_group_unlock_and_free(struct vfio_group *group)
 310{
 311	mutex_unlock(&vfio.group_lock);
 312	/*
 313	 * Unregister outside of lock.  A spurious callback is harmless now
 314	 * that the group is no longer in vfio.group_list.
 315	 */
 316	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317	kfree(group);
 318}
 319
 320/**
 321 * Group objects - create, release, get, put, search
 322 */
 323static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324{
 325	struct vfio_group *group, *tmp;
 326	struct device *dev;
 327	int ret, minor;
 328
 329	group = kzalloc(sizeof(*group), GFP_KERNEL);
 330	if (!group)
 331		return ERR_PTR(-ENOMEM);
 332
 333	kref_init(&group->kref);
 334	INIT_LIST_HEAD(&group->device_list);
 335	mutex_init(&group->device_lock);
 336	INIT_LIST_HEAD(&group->unbound_list);
 337	mutex_init(&group->unbound_lock);
 338	atomic_set(&group->container_users, 0);
 339	atomic_set(&group->opened, 0);
 340	init_waitqueue_head(&group->container_q);
 341	group->iommu_group = iommu_group;
 342#ifdef CONFIG_VFIO_NOIOMMU
 343	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 344#endif
 345	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 346
 347	group->nb.notifier_call = vfio_iommu_group_notifier;
 348
 349	/*
 350	 * blocking notifiers acquire a rwsem around registering and hold
 351	 * it around callback.  Therefore, need to register outside of
 352	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 353	 * do anything unless it can find the group in vfio.group_list, so
 354	 * no harm in registering early.
 355	 */
 356	ret = iommu_group_register_notifier(iommu_group, &group->nb);
 357	if (ret) {
 358		kfree(group);
 359		return ERR_PTR(ret);
 360	}
 361
 362	mutex_lock(&vfio.group_lock);
 363
 364	/* Did we race creating this group? */
 365	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 366		if (tmp->iommu_group == iommu_group) {
 367			vfio_group_get(tmp);
 368			vfio_group_unlock_and_free(group);
 369			return tmp;
 370		}
 371	}
 372
 373	minor = vfio_alloc_group_minor(group);
 374	if (minor < 0) {
 375		vfio_group_unlock_and_free(group);
 376		return ERR_PTR(minor);
 377	}
 378
 379	dev = device_create(vfio.class, NULL,
 380			    MKDEV(MAJOR(vfio.group_devt), minor),
 381			    group, "%s%d", group->noiommu ? "noiommu-" : "",
 382			    iommu_group_id(iommu_group));
 383	if (IS_ERR(dev)) {
 384		vfio_free_group_minor(minor);
 385		vfio_group_unlock_and_free(group);
 386		return ERR_CAST(dev);
 387	}
 388
 389	group->minor = minor;
 390	group->dev = dev;
 391
 392	list_add(&group->vfio_next, &vfio.group_list);
 393
 394	mutex_unlock(&vfio.group_lock);
 395
 396	return group;
 397}
 398
 399/* called with vfio.group_lock held */
 400static void vfio_group_release(struct kref *kref)
 401{
 402	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 403	struct vfio_unbound_dev *unbound, *tmp;
 404	struct iommu_group *iommu_group = group->iommu_group;
 405
 406	WARN_ON(!list_empty(&group->device_list));
 407	WARN_ON(group->notifier.head);
 408
 409	list_for_each_entry_safe(unbound, tmp,
 410				 &group->unbound_list, unbound_next) {
 411		list_del(&unbound->unbound_next);
 412		kfree(unbound);
 413	}
 414
 415	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 416	list_del(&group->vfio_next);
 417	vfio_free_group_minor(group->minor);
 418	vfio_group_unlock_and_free(group);
 419	iommu_group_put(iommu_group);
 420}
 421
 422static void vfio_group_put(struct vfio_group *group)
 423{
 424	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 425}
 426
 427struct vfio_group_put_work {
 428	struct work_struct work;
 429	struct vfio_group *group;
 430};
 431
 432static void vfio_group_put_bg(struct work_struct *work)
 433{
 434	struct vfio_group_put_work *do_work;
 435
 436	do_work = container_of(work, struct vfio_group_put_work, work);
 437
 438	vfio_group_put(do_work->group);
 439	kfree(do_work);
 440}
 441
 442static void vfio_group_schedule_put(struct vfio_group *group)
 443{
 444	struct vfio_group_put_work *do_work;
 445
 446	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 447	if (WARN_ON(!do_work))
 448		return;
 449
 450	INIT_WORK(&do_work->work, vfio_group_put_bg);
 451	do_work->group = group;
 452	schedule_work(&do_work->work);
 453}
 454
 455/* Assume group_lock or group reference is held */
 456static void vfio_group_get(struct vfio_group *group)
 457{
 458	kref_get(&group->kref);
 459}
 460
 461/*
 462 * Not really a try as we will sleep for mutex, but we need to make
 463 * sure the group pointer is valid under lock and get a reference.
 464 */
 465static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 466{
 467	struct vfio_group *target = group;
 468
 469	mutex_lock(&vfio.group_lock);
 470	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 471		if (group == target) {
 472			vfio_group_get(group);
 473			mutex_unlock(&vfio.group_lock);
 474			return group;
 475		}
 476	}
 477	mutex_unlock(&vfio.group_lock);
 478
 479	return NULL;
 480}
 481
 482static
 483struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 484{
 485	struct vfio_group *group;
 486
 487	mutex_lock(&vfio.group_lock);
 488	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 489		if (group->iommu_group == iommu_group) {
 490			vfio_group_get(group);
 491			mutex_unlock(&vfio.group_lock);
 492			return group;
 493		}
 494	}
 495	mutex_unlock(&vfio.group_lock);
 496
 497	return NULL;
 498}
 499
 500static struct vfio_group *vfio_group_get_from_minor(int minor)
 501{
 502	struct vfio_group *group;
 503
 504	mutex_lock(&vfio.group_lock);
 505	group = idr_find(&vfio.group_idr, minor);
 506	if (!group) {
 507		mutex_unlock(&vfio.group_lock);
 508		return NULL;
 509	}
 510	vfio_group_get(group);
 511	mutex_unlock(&vfio.group_lock);
 512
 513	return group;
 514}
 515
 516static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 517{
 518	struct iommu_group *iommu_group;
 519	struct vfio_group *group;
 520
 521	iommu_group = iommu_group_get(dev);
 522	if (!iommu_group)
 523		return NULL;
 524
 525	group = vfio_group_get_from_iommu(iommu_group);
 526	iommu_group_put(iommu_group);
 527
 528	return group;
 529}
 530
 531/**
 532 * Device objects - create, release, get, put, search
 533 */
 534static
 535struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 536					     struct device *dev,
 537					     const struct vfio_device_ops *ops,
 538					     void *device_data)
 539{
 540	struct vfio_device *device;
 541
 542	device = kzalloc(sizeof(*device), GFP_KERNEL);
 543	if (!device)
 544		return ERR_PTR(-ENOMEM);
 545
 546	kref_init(&device->kref);
 547	device->dev = dev;
 548	device->group = group;
 549	device->ops = ops;
 550	device->device_data = device_data;
 551	dev_set_drvdata(dev, device);
 552
 553	/* No need to get group_lock, caller has group reference */
 554	vfio_group_get(group);
 555
 556	mutex_lock(&group->device_lock);
 557	list_add(&device->group_next, &group->device_list);
 558	mutex_unlock(&group->device_lock);
 559
 560	return device;
 561}
 562
 563static void vfio_device_release(struct kref *kref)
 564{
 565	struct vfio_device *device = container_of(kref,
 566						  struct vfio_device, kref);
 567	struct vfio_group *group = device->group;
 568
 569	list_del(&device->group_next);
 570	mutex_unlock(&group->device_lock);
 571
 572	dev_set_drvdata(device->dev, NULL);
 573
 574	kfree(device);
 575
 576	/* vfio_del_group_dev may be waiting for this device */
 577	wake_up(&vfio.release_q);
 578}
 579
 580/* Device reference always implies a group reference */
 581void vfio_device_put(struct vfio_device *device)
 582{
 583	struct vfio_group *group = device->group;
 584	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 585	vfio_group_put(group);
 586}
 587EXPORT_SYMBOL_GPL(vfio_device_put);
 588
 589static void vfio_device_get(struct vfio_device *device)
 590{
 591	vfio_group_get(device->group);
 592	kref_get(&device->kref);
 593}
 594
 595static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 596						 struct device *dev)
 597{
 598	struct vfio_device *device;
 599
 600	mutex_lock(&group->device_lock);
 601	list_for_each_entry(device, &group->device_list, group_next) {
 602		if (device->dev == dev) {
 603			vfio_device_get(device);
 604			mutex_unlock(&group->device_lock);
 605			return device;
 606		}
 607	}
 608	mutex_unlock(&group->device_lock);
 609	return NULL;
 610}
 611
 612/*
 613 * Some drivers, like pci-stub, are only used to prevent other drivers from
 614 * claiming a device and are therefore perfectly legitimate for a user owned
 615 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 616 * of the device, but it does prevent the user from having direct access to
 617 * the device, which is useful in some circumstances.
 618 *
 619 * We also assume that we can include PCI interconnect devices, ie. bridges.
 620 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 621 * then all of the downstream devices will be part of the same IOMMU group as
 622 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 623 * breaks anything, it only does so for user owned devices downstream.  Note
 624 * that error notification via MSI can be affected for platforms that handle
 625 * MSI within the same IOVA space as DMA.
 626 */
 627static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 628
 629static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 630{
 631	if (dev_is_pci(dev)) {
 632		struct pci_dev *pdev = to_pci_dev(dev);
 633
 634		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 635			return true;
 636	}
 637
 638	return match_string(vfio_driver_whitelist,
 639			    ARRAY_SIZE(vfio_driver_whitelist),
 640			    drv->name) >= 0;
 641}
 642
 643/*
 644 * A vfio group is viable for use by userspace if all devices are in
 645 * one of the following states:
 646 *  - driver-less
 647 *  - bound to a vfio driver
 648 *  - bound to a whitelisted driver
 649 *  - a PCI interconnect device
 650 *
 651 * We use two methods to determine whether a device is bound to a vfio
 652 * driver.  The first is to test whether the device exists in the vfio
 653 * group.  The second is to test if the device exists on the group
 654 * unbound_list, indicating it's in the middle of transitioning from
 655 * a vfio driver to driver-less.
 656 */
 657static int vfio_dev_viable(struct device *dev, void *data)
 658{
 659	struct vfio_group *group = data;
 660	struct vfio_device *device;
 661	struct device_driver *drv = READ_ONCE(dev->driver);
 662	struct vfio_unbound_dev *unbound;
 663	int ret = -EINVAL;
 664
 665	mutex_lock(&group->unbound_lock);
 666	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 667		if (dev == unbound->dev) {
 668			ret = 0;
 669			break;
 670		}
 671	}
 672	mutex_unlock(&group->unbound_lock);
 673
 674	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 675		return 0;
 676
 677	device = vfio_group_get_device(group, dev);
 678	if (device) {
 679		vfio_device_put(device);
 680		return 0;
 681	}
 682
 683	return ret;
 684}
 685
 686/**
 687 * Async device support
 688 */
 689static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 690{
 691	struct vfio_device *device;
 692
 693	/* Do we already know about it?  We shouldn't */
 694	device = vfio_group_get_device(group, dev);
 695	if (WARN_ON_ONCE(device)) {
 696		vfio_device_put(device);
 697		return 0;
 698	}
 699
 700	/* Nothing to do for idle groups */
 701	if (!atomic_read(&group->container_users))
 702		return 0;
 703
 704	/* TODO Prevent device auto probing */
 705	dev_WARN(dev, "Device added to live group %d!\n",
 706		 iommu_group_id(group->iommu_group));
 707
 708	return 0;
 709}
 710
 711static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 712{
 713	/* We don't care what happens when the group isn't in use */
 714	if (!atomic_read(&group->container_users))
 715		return 0;
 716
 717	return vfio_dev_viable(dev, group);
 718}
 719
 720static int vfio_iommu_group_notifier(struct notifier_block *nb,
 721				     unsigned long action, void *data)
 722{
 723	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 724	struct device *dev = data;
 725	struct vfio_unbound_dev *unbound;
 726
 727	/*
 728	 * Need to go through a group_lock lookup to get a reference or we
 729	 * risk racing a group being removed.  Ignore spurious notifies.
 730	 */
 731	group = vfio_group_try_get(group);
 732	if (!group)
 733		return NOTIFY_OK;
 734
 735	switch (action) {
 736	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 737		vfio_group_nb_add_dev(group, dev);
 738		break;
 739	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 740		/*
 741		 * Nothing to do here.  If the device is in use, then the
 742		 * vfio sub-driver should block the remove callback until
 743		 * it is unused.  If the device is unused or attached to a
 744		 * stub driver, then it should be released and we don't
 745		 * care that it will be going away.
 746		 */
 747		break;
 748	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 749		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 750			iommu_group_id(group->iommu_group));
 751		break;
 752	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 753		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 754			iommu_group_id(group->iommu_group), dev->driver->name);
 755		BUG_ON(vfio_group_nb_verify(group, dev));
 756		break;
 757	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 758		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 759			__func__, iommu_group_id(group->iommu_group),
 760			dev->driver->name);
 761		break;
 762	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 763		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 764			iommu_group_id(group->iommu_group));
 765		/*
 766		 * XXX An unbound device in a live group is ok, but we'd
 767		 * really like to avoid the above BUG_ON by preventing other
 768		 * drivers from binding to it.  Once that occurs, we have to
 769		 * stop the system to maintain isolation.  At a minimum, we'd
 770		 * want a toggle to disable driver auto probe for this device.
 771		 */
 772
 773		mutex_lock(&group->unbound_lock);
 774		list_for_each_entry(unbound,
 775				    &group->unbound_list, unbound_next) {
 776			if (dev == unbound->dev) {
 777				list_del(&unbound->unbound_next);
 778				kfree(unbound);
 779				break;
 780			}
 781		}
 782		mutex_unlock(&group->unbound_lock);
 783		break;
 784	}
 785
 786	/*
 787	 * If we're the last reference to the group, the group will be
 788	 * released, which includes unregistering the iommu group notifier.
 789	 * We hold a read-lock on that notifier list, unregistering needs
 790	 * a write-lock... deadlock.  Release our reference asynchronously
 791	 * to avoid that situation.
 792	 */
 793	vfio_group_schedule_put(group);
 794	return NOTIFY_OK;
 795}
 796
 797/**
 798 * VFIO driver API
 799 */
 800int vfio_add_group_dev(struct device *dev,
 801		       const struct vfio_device_ops *ops, void *device_data)
  802{
 803	struct iommu_group *iommu_group;
 804	struct vfio_group *group;
 805	struct vfio_device *device;
 806
 807	iommu_group = iommu_group_get(dev);
 808	if (!iommu_group)
 809		return -EINVAL;
 810
 811	group = vfio_group_get_from_iommu(iommu_group);
 812	if (!group) {
 813		group = vfio_create_group(iommu_group);
 814		if (IS_ERR(group)) {
 815			iommu_group_put(iommu_group);
 816			return PTR_ERR(group);
 817		}
 818	} else {
 819		/*
 820		 * A found vfio_group already holds a reference to the
 821		 * iommu_group.  A created vfio_group keeps the reference.
 822		 */
 823		iommu_group_put(iommu_group);
 824	}
 825
 826	device = vfio_group_get_device(group, dev);
 827	if (device) {
 828		dev_WARN(dev, "Device already exists on group %d\n",
 829			 iommu_group_id(iommu_group));
 830		vfio_device_put(device);
 831		vfio_group_put(group);
 832		return -EBUSY;
 833	}
 834
 835	device = vfio_group_create_device(group, dev, ops, device_data);
 836	if (IS_ERR(device)) {
 837		vfio_group_put(group);
 838		return PTR_ERR(device);
 839	}
 840
 841	/*
 842	 * Drop all but the vfio_device reference.  The vfio_device holds
 843	 * a reference to the vfio_group, which holds a reference to the
 844	 * iommu_group.
 845	 */
 846	vfio_group_put(group);
 847
 848	return 0;
 849}
 850EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 851
 852/**
 853 * Get a reference to the vfio_device for a device.  Even if the
 854 * caller thinks they own the device, they could be racing with a
 855 * release call path, so we can't trust drvdata for the shortcut.
 856 * Go the long way around, from the iommu_group to the vfio_group
 857 * to the vfio_device.
 858 */
 859struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 860{
 861	struct vfio_group *group;
 862	struct vfio_device *device;
 863
 864	group = vfio_group_get_from_dev(dev);
 865	if (!group)
 866		return NULL;
 867
 868	device = vfio_group_get_device(group, dev);
 869	vfio_group_put(group);
 870
 871	return device;
 872}
 873EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 874
 875static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 876						     char *buf)
 877{
 878	struct vfio_device *it, *device = NULL;
 879
 880	mutex_lock(&group->device_lock);
 881	list_for_each_entry(it, &group->device_list, group_next) {
 882		if (!strcmp(dev_name(it->dev), buf)) {
 883			device = it;
 884			vfio_device_get(device);
 885			break;
 886		}
 887	}
 888	mutex_unlock(&group->device_lock);
 889
 890	return device;
 891}
 892
 893/*
 894 * Caller must hold a reference to the vfio_device
 895 */
 896void *vfio_device_data(struct vfio_device *device)
 897{
 898	return device->device_data;
 899}
 900EXPORT_SYMBOL_GPL(vfio_device_data);
 901
 902/*
 903 * Decrement the device reference count and wait for the device to be
 904 * removed.  Open file descriptors for the device... */
 905void *vfio_del_group_dev(struct device *dev)
 906{
 907	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 908	struct vfio_device *device = dev_get_drvdata(dev);
 909	struct vfio_group *group = device->group;
 910	void *device_data = device->device_data;
 911	struct vfio_unbound_dev *unbound;
 912	unsigned int i = 0;
 913	bool interrupted = false;
 914
 915	/*
 916	 * The group exists so long as we have a device reference.  Get
 917	 * a group reference and use it to scan for the device going away.
 918	 */
 919	vfio_group_get(group);
 920
 921	/*
 922	 * When the device is removed from the group, the group suddenly
 923	 * becomes non-viable; the device has a driver (until the unbind
 924	 * completes), but it's not present in the group.  This is bad news
 925	 * for any external users that need to re-acquire a group reference
 926	 * in order to match and release their existing reference.  To
 927	 * solve this, we track such devices on the unbound_list to bridge
 928	 * the gap until they're fully unbound.
 929	 */
 930	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 931	if (unbound) {
 932		unbound->dev = dev;
 933		mutex_lock(&group->unbound_lock);
 934		list_add(&unbound->unbound_next, &group->unbound_list);
 935		mutex_unlock(&group->unbound_lock);
 936	}
 937	WARN_ON(!unbound);
 938
 939	vfio_device_put(device);
 940
 941	/*
 942	 * If the device is still present in the group after the above
 943	 * 'put', then it is in use and we need to request it from the
 944	 * bus driver.  The driver may in turn need to request the
 945	 * device from the user.  We send the request on an arbitrary
 946	 * interval with counter to allow the driver to take escalating
 947	 * measures to release the device if it has the ability to do so.
 948	 */
 949	add_wait_queue(&vfio.release_q, &wait);
 950
 951	do {
 952		device = vfio_group_get_device(group, dev);
 953		if (!device)
 954			break;
 955
 956		if (device->ops->request)
 957			device->ops->request(device_data, i++);
 958
 959		vfio_device_put(device);
 960
 961		if (interrupted) {
 962			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
 963		} else {
 964			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
 965			if (signal_pending(current)) {
 966				interrupted = true;
 967				dev_warn(dev,
 968					 "Device is currently in use, task"
 969					 " \"%s\" (%d) "
 970					 "blocked until device is released",
 971					 current->comm, task_pid_nr(current));
 972			}
 973		}
 974
 975	} while (1);
 976
 977	remove_wait_queue(&vfio.release_q, &wait);
 978	/*
 979	 * In order to support multiple devices per group, devices can be
 980	 * plucked from the group while other devices in the group are still
 981	 * in use.  The container persists with this group and those remaining
 982	 * devices still attached.  If the user creates an isolation violation
 983	 * by binding this device to another driver while the group is still in
 984	 * use, that's their fault.  However, in the case of removing the last,
 985	 * or potentially the only, device in the group there can be no other
 986	 * in-use devices in the group.  The user has done their due diligence
 987	 * and we should lay no claims to those devices.  In order to do that,
 988	 * we need to make sure the group is detached from the container.
 989	 * Without this stall, we're potentially racing with a user process
 990	 * that may attempt to immediately bind this device to another driver.
 991	 */
 992	if (list_empty(&group->device_list))
 993		wait_event(group->container_q, !group->container);
 994
 995	vfio_group_put(group);
 996
 997	return device_data;
 998}
 999EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1000
1001/**
1002 * VFIO base fd, /dev/vfio/vfio
1003 */
1004static long vfio_ioctl_check_extension(struct vfio_container *container,
1005				       unsigned long arg)
1006{
1007	struct vfio_iommu_driver *driver;
1008	long ret = 0;
1009
1010	down_read(&container->group_lock);
1011
1012	driver = container->iommu_driver;
1013
1014	switch (arg) {
1015		/* No base extensions yet */
1016	default:
1017		/*
1018		 * If no driver is set, poll all registered drivers for
1019		 * extensions and return the first positive result.  If
1020		 * a driver is already set, further queries will be passed
1021		 * only to that driver.
1022		 */
1023		if (!driver) {
1024			mutex_lock(&vfio.iommu_drivers_lock);
1025			list_for_each_entry(driver, &vfio.iommu_drivers_list,
1026					    vfio_next) {
1027
1028#ifdef CONFIG_VFIO_NOIOMMU
1029				if (!list_empty(&container->group_list) &&
1030				    (container->noiommu !=
1031				     (driver->ops == &vfio_noiommu_ops)))
1032					continue;
1033#endif
1034
1035				if (!try_module_get(driver->ops->owner))
1036					continue;
1037
1038				ret = driver->ops->ioctl(NULL,
1039							 VFIO_CHECK_EXTENSION,
1040							 arg);
1041				module_put(driver->ops->owner);
1042				if (ret > 0)
1043					break;
1044			}
1045			mutex_unlock(&vfio.iommu_drivers_lock);
1046		} else
1047			ret = driver->ops->ioctl(container->iommu_data,
1048						 VFIO_CHECK_EXTENSION, arg);
1049	}
1050
1051	up_read(&container->group_lock);
1052
1053	return ret;
1054}
1055
1056/* hold write lock on container->group_lock */
1057static int __vfio_container_attach_groups(struct vfio_container *container,
1058					  struct vfio_iommu_driver *driver,
1059					  void *data)
1060{
1061	struct vfio_group *group;
1062	int ret = -ENODEV;
1063
1064	list_for_each_entry(group, &container->group_list, container_next) {
1065		ret = driver->ops->attach_group(data, group->iommu_group);
1066		if (ret)
1067			goto unwind;
1068	}
1069
1070	return ret;
1071
1072unwind:
1073	list_for_each_entry_continue_reverse(group, &container->group_list,
1074					     container_next) {
1075		driver->ops->detach_group(data, group->iommu_group);
1076	}
1077
1078	return ret;
1079}
1080
1081static long vfio_ioctl_set_iommu(struct vfio_container *container,
1082				 unsigned long arg)
1083{
1084	struct vfio_iommu_driver *driver;
1085	long ret = -ENODEV;
1086
1087	down_write(&container->group_lock);
1088
1089	/*
1090	 * The container is designed to be an unprivileged interface while
1091	 * the group can be assigned to specific users.  Therefore, only by
1092	 * adding a group to a container does the user get the privilege of
1093	 * enabling the iommu, which may allocate finite resources.  There
1094	 * is no unset_iommu, but by removing all the groups from a container,
1095	 * the container is deprivileged and returns to an unset state.
1096	 */
1097	if (list_empty(&container->group_list) || container->iommu_driver) {
1098		up_write(&container->group_lock);
1099		return -EINVAL;
1100	}
1101
1102	mutex_lock(&vfio.iommu_drivers_lock);
1103	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1104		void *data;
1105
1106#ifdef CONFIG_VFIO_NOIOMMU
1107		/*
1108		 * Only noiommu containers can use vfio-noiommu and noiommu
1109		 * containers can only use vfio-noiommu.
1110		 */
1111		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1112			continue;
1113#endif
1114
1115		if (!try_module_get(driver->ops->owner))
1116			continue;
1117
1118		/*
1119		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1120		 * so test which iommu driver reported support for this
1121		 * extension and call open on them.  We also pass them the
1122		 * magic, allowing a single driver to support multiple
1123		 * interfaces if they'd like.
1124		 */
1125		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1126			module_put(driver->ops->owner);
1127			continue;
1128		}
1129
1130		data = driver->ops->open(arg);
1131		if (IS_ERR(data)) {
1132			ret = PTR_ERR(data);
1133			module_put(driver->ops->owner);
1134			continue;
1135		}
1136
1137		ret = __vfio_container_attach_groups(container, driver, data);
1138		if (ret) {
1139			driver->ops->release(data);
1140			module_put(driver->ops->owner);
1141			continue;
1142		}
1143
1144		container->iommu_driver = driver;
1145		container->iommu_data = data;
1146		break;
1147	}
1148
1149	mutex_unlock(&vfio.iommu_drivers_lock);
1150	up_write(&container->group_lock);
1151
1152	return ret;
1153}
1154
1155static long vfio_fops_unl_ioctl(struct file *filep,
1156				unsigned int cmd, unsigned long arg)
1157{
1158	struct vfio_container *container = filep->private_data;
1159	struct vfio_iommu_driver *driver;
1160	void *data;
1161	long ret = -EINVAL;
1162
1163	if (!container)
1164		return ret;
1165
1166	switch (cmd) {
1167	case VFIO_GET_API_VERSION:
1168		ret = VFIO_API_VERSION;
1169		break;
1170	case VFIO_CHECK_EXTENSION:
1171		ret = vfio_ioctl_check_extension(container, arg);
1172		break;
1173	case VFIO_SET_IOMMU:
1174		ret = vfio_ioctl_set_iommu(container, arg);
1175		break;
1176	default:
1177		driver = container->iommu_driver;
1178		data = container->iommu_data;
1179
1180		if (driver) /* passthrough all unrecognized ioctls */
1181			ret = driver->ops->ioctl(data, cmd, arg);
1182	}
1183
1184	return ret;
1185}
1186
1187#ifdef CONFIG_COMPAT
1188static long vfio_fops_compat_ioctl(struct file *filep,
1189				   unsigned int cmd, unsigned long arg)
1190{
1191	arg = (unsigned long)compat_ptr(arg);
1192	return vfio_fops_unl_ioctl(filep, cmd, arg);
1193}
1194#endif	/* CONFIG_COMPAT */
1195
1196static int vfio_fops_open(struct inode *inode, struct file *filep)
1197{
1198	struct vfio_container *container;
1199
1200	container = kzalloc(sizeof(*container), GFP_KERNEL);
1201	if (!container)
1202		return -ENOMEM;
1203
1204	INIT_LIST_HEAD(&container->group_list);
1205	init_rwsem(&container->group_lock);
1206	kref_init(&container->kref);
1207
1208	filep->private_data = container;
1209
1210	return 0;
1211}
1212
1213static int vfio_fops_release(struct inode *inode, struct file *filep)
1214{
1215	struct vfio_container *container = filep->private_data;
1216
1217	filep->private_data = NULL;
1218
1219	vfio_container_put(container);
1220
1221	return 0;
1222}
1223
1224/*
1225 * Once an iommu driver is set, we optionally pass read/write/mmap
1226 * on to the driver, allowing management interfaces beyond ioctl.
1227 */
1228static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1229			      size_t count, loff_t *ppos)
1230{
1231	struct vfio_container *container = filep->private_data;
1232	struct vfio_iommu_driver *driver;
1233	ssize_t ret = -EINVAL;
1234
1235	driver = container->iommu_driver;
1236	if (likely(driver && driver->ops->read))
1237		ret = driver->ops->read(container->iommu_data,
1238					buf, count, ppos);
1239
1240	return ret;
1241}
1242
1243static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1244			       size_t count, loff_t *ppos)
1245{
1246	struct vfio_container *container = filep->private_data;
1247	struct vfio_iommu_driver *driver;
1248	ssize_t ret = -EINVAL;
1249
1250	driver = container->iommu_driver;
1251	if (likely(driver && driver->ops->write))
1252		ret = driver->ops->write(container->iommu_data,
1253					 buf, count, ppos);
1254
1255	return ret;
1256}
1257
1258static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1259{
1260	struct vfio_container *container = filep->private_data;
1261	struct vfio_iommu_driver *driver;
1262	int ret = -EINVAL;
1263
1264	driver = container->iommu_driver;
1265	if (likely(driver && driver->ops->mmap))
1266		ret = driver->ops->mmap(container->iommu_data, vma);
1267
1268	return ret;
1269}
1270
1271static const struct file_operations vfio_fops = {
1272	.owner		= THIS_MODULE,
1273	.open		= vfio_fops_open,
1274	.release	= vfio_fops_release,
1275	.read		= vfio_fops_read,
1276	.write		= vfio_fops_write,
1277	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1278#ifdef CONFIG_COMPAT
1279	.compat_ioctl	= vfio_fops_compat_ioctl,
1280#endif
1281	.mmap		= vfio_fops_mmap,
1282};
1283
1284/**
1285 * VFIO Group fd, /dev/vfio/$GROUP
1286 */
1287static void __vfio_group_unset_container(struct vfio_group *group)
1288{
1289	struct vfio_container *container = group->container;
1290	struct vfio_iommu_driver *driver;
1291
1292	down_write(&container->group_lock);
1293
1294	driver = container->iommu_driver;
1295	if (driver)
1296		driver->ops->detach_group(container->iommu_data,
1297					  group->iommu_group);
1298
1299	group->container = NULL;
1300	wake_up(&group->container_q);
1301	list_del(&group->container_next);
1302
1303	/* Detaching the last group deprivileges a container, remove iommu */
1304	if (driver && list_empty(&container->group_list)) {
1305		driver->ops->release(container->iommu_data);
1306		module_put(driver->ops->owner);
1307		container->iommu_driver = NULL;
1308		container->iommu_data = NULL;
1309	}
1310
1311	up_write(&container->group_lock);
1312
1313	vfio_container_put(container);
1314}
1315
1316/*
1317 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1318 * if there was no container to unset.  Since the ioctl is called on
1319 * the group, we know that still exists, therefore the only valid
1320 * transition here is 1->0.
1321 */
1322static int vfio_group_unset_container(struct vfio_group *group)
1323{
1324	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1325
1326	if (!users)
1327		return -EINVAL;
1328	if (users != 1)
1329		return -EBUSY;
1330
1331	__vfio_group_unset_container(group);
1332
1333	return 0;
1334}
1335
1336/*
1337 * When removing container users, anything that removes the last user
1338 * implicitly removes the group from the container.  That is, if the
1339 * group file descriptor is closed, as well as any device file descriptors,
1340 * the group is free.
1341 */
1342static void vfio_group_try_dissolve_container(struct vfio_group *group)
1343{
1344	if (0 == atomic_dec_if_positive(&group->container_users))
1345		__vfio_group_unset_container(group);
1346}
1347
1348static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1349{
1350	struct fd f;
1351	struct vfio_container *container;
1352	struct vfio_iommu_driver *driver;
1353	int ret = 0;
1354
1355	if (atomic_read(&group->container_users))
1356		return -EINVAL;
1357
1358	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1359		return -EPERM;
1360
1361	f = fdget(container_fd);
1362	if (!f.file)
1363		return -EBADF;
1364
1365	/* Sanity check, is this really our fd? */
1366	if (f.file->f_op != &vfio_fops) {
1367		fdput(f);
1368		return -EINVAL;
1369	}
1370
1371	container = f.file->private_data;
1372	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1373
1374	down_write(&container->group_lock);
1375
1376	/* Real groups and fake groups cannot mix */
1377	if (!list_empty(&container->group_list) &&
1378	    container->noiommu != group->noiommu) {
1379		ret = -EPERM;
1380		goto unlock_out;
1381	}
1382
1383	driver = container->iommu_driver;
1384	if (driver) {
1385		ret = driver->ops->attach_group(container->iommu_data,
1386						group->iommu_group);
1387		if (ret)
1388			goto unlock_out;
1389	}
1390
1391	group->container = container;
1392	container->noiommu = group->noiommu;
1393	list_add(&group->container_next, &container->group_list);
1394
1395	/* Get a reference on the container and mark a user within the group */
1396	vfio_container_get(container);
1397	atomic_inc(&group->container_users);
1398
1399unlock_out:
1400	up_write(&container->group_lock);
1401	fdput(f);
1402	return ret;
1403}
1404
1405static bool vfio_group_viable(struct vfio_group *group)
1406{
1407	return (iommu_group_for_each_dev(group->iommu_group,
1408					 group, vfio_dev_viable) == 0);
1409}
1410
1411static int vfio_group_add_container_user(struct vfio_group *group)
1412{
1413	if (!atomic_inc_not_zero(&group->container_users))
1414		return -EINVAL;
1415
1416	if (group->noiommu) {
1417		atomic_dec(&group->container_users);
1418		return -EPERM;
1419	}
1420	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1421		atomic_dec(&group->container_users);
1422		return -EINVAL;
1423	}
1424
1425	return 0;
1426}
1427
1428static const struct file_operations vfio_device_fops;
1429
1430static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1431{
1432	struct vfio_device *device;
1433	struct file *filep;
1434	int ret;
1435
1436	if (0 == atomic_read(&group->container_users) ||
1437	    !group->container->iommu_driver || !vfio_group_viable(group))
1438		return -EINVAL;
1439
1440	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1441		return -EPERM;
1442
1443	device = vfio_device_get_from_name(group, buf);
1444	if (!device)
1445		return -ENODEV;
1446
1447	ret = device->ops->open(device->device_data);
1448	if (ret) {
1449		vfio_device_put(device);
1450		return ret;
1451	}
1452
1453	/*
1454	 * We can't use anon_inode_getfd() because we need to modify
1455	 * the f_mode flags directly to allow more than just ioctls
1456	 */
1457	ret = get_unused_fd_flags(O_CLOEXEC);
1458	if (ret < 0) {
1459		device->ops->release(device->device_data);
1460		vfio_device_put(device);
1461		return ret;
1462	}
1463
1464	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1465				   device, O_RDWR);
1466	if (IS_ERR(filep)) {
1467		put_unused_fd(ret);
1468		ret = PTR_ERR(filep);
1469		device->ops->release(device->device_data);
1470		vfio_device_put(device);
1471		return ret;
1472	}
1473
1474	/*
1475	 * TODO: add an anon_inode interface to do this.
1476	 * Appears to be missing by lack of need rather than
1477	 * explicitly prevented.  Now there's need.
1478	 */
1479	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1480
1481	atomic_inc(&group->container_users);
1482
1483	fd_install(ret, filep);
1484
1485	if (group->noiommu)
1486		dev_warn(device->dev, "vfio-noiommu device opened by user "
1487			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1488
1489	return ret;
1490}
1491
1492static long vfio_group_fops_unl_ioctl(struct file *filep,
1493				      unsigned int cmd, unsigned long arg)
1494{
1495	struct vfio_group *group = filep->private_data;
1496	long ret = -ENOTTY;
1497
1498	switch (cmd) {
1499	case VFIO_GROUP_GET_STATUS:
1500	{
1501		struct vfio_group_status status;
1502		unsigned long minsz;
1503
1504		minsz = offsetofend(struct vfio_group_status, flags);
1505
1506		if (copy_from_user(&status, (void __user *)arg, minsz))
1507			return -EFAULT;
1508
1509		if (status.argsz < minsz)
1510			return -EINVAL;
1511
1512		status.flags = 0;
1513
1514		if (vfio_group_viable(group))
1515			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1516
1517		if (group->container)
1518			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1519
1520		if (copy_to_user((void __user *)arg, &status, minsz))
1521			return -EFAULT;
1522
1523		ret = 0;
1524		break;
1525	}
1526	case VFIO_GROUP_SET_CONTAINER:
1527	{
1528		int fd;
1529
1530		if (get_user(fd, (int __user *)arg))
1531			return -EFAULT;
1532
1533		if (fd < 0)
1534			return -EINVAL;
1535
1536		ret = vfio_group_set_container(group, fd);
1537		break;
1538	}
1539	case VFIO_GROUP_UNSET_CONTAINER:
1540		ret = vfio_group_unset_container(group);
1541		break;
1542	case VFIO_GROUP_GET_DEVICE_FD:
1543	{
1544		char *buf;
1545
1546		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1547		if (IS_ERR(buf))
1548			return PTR_ERR(buf);
1549
1550		ret = vfio_group_get_device_fd(group, buf);
1551		kfree(buf);
1552		break;
1553	}
1554	}
1555
1556	return ret;
1557}
1558
1559#ifdef CONFIG_COMPAT
1560static long vfio_group_fops_compat_ioctl(struct file *filep,
1561					 unsigned int cmd, unsigned long arg)
1562{
1563	arg = (unsigned long)compat_ptr(arg);
1564	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1565}
1566#endif	/* CONFIG_COMPAT */
1567
1568static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1569{
1570	struct vfio_group *group;
1571	int opened;
1572
1573	group = vfio_group_get_from_minor(iminor(inode));
1574	if (!group)
1575		return -ENODEV;
1576
1577	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1578		vfio_group_put(group);
1579		return -EPERM;
1580	}
1581
1582	/* Do we need multiple instances of the group open?  Seems not. */
1583	opened = atomic_cmpxchg(&group->opened, 0, 1);
1584	if (opened) {
1585		vfio_group_put(group);
1586		return -EBUSY;
1587	}
1588
1589	/* Is something still in use from a previous open? */
1590	if (group->container) {
1591		atomic_dec(&group->opened);
1592		vfio_group_put(group);
1593		return -EBUSY;
1594	}
1595
1596	/* Warn if previous user didn't cleanup and re-init to drop them */
1597	if (WARN_ON(group->notifier.head))
1598		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1599
1600	filep->private_data = group;
1601
1602	return 0;
1603}
1604
1605static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1606{
1607	struct vfio_group *group = filep->private_data;
1608
1609	filep->private_data = NULL;
1610
1611	vfio_group_try_dissolve_container(group);
1612
1613	atomic_dec(&group->opened);
1614
1615	vfio_group_put(group);
1616
1617	return 0;
1618}
1619
1620static const struct file_operations vfio_group_fops = {
1621	.owner		= THIS_MODULE,
1622	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1623#ifdef CONFIG_COMPAT
1624	.compat_ioctl	= vfio_group_fops_compat_ioctl,
1625#endif
1626	.open		= vfio_group_fops_open,
1627	.release	= vfio_group_fops_release,
1628};
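/*
 * Illustrative sketch (not part of this file): the minimal userspace
 * sequence that exercises the container and group file operations above,
 * loosely following Documentation/vfio.txt.  The group number "26" and the
 * device name "0000:06:0d.0" are placeholders, and error handling is
 * trimmed for brevity.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int example_open_vfio_device(void)
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	int container, group, device;

	container = open("/dev/vfio/vfio", O_RDWR);
	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		return -1;		/* unknown API version */

	group = open("/dev/vfio/26", O_RDWR);
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;		/* some device in the group is not vfio-bound */

	/* Attach the group, then enable an IOMMU backend on the container */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Served by vfio_group_get_device_fd() above */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
	return device;
}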
1629
1630/**
1631 * VFIO Device fd
1632 */
1633static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1634{
1635	struct vfio_device *device = filep->private_data;
1636
1637	device->ops->release(device->device_data);
1638
1639	vfio_group_try_dissolve_container(device->group);
1640
1641	vfio_device_put(device);
1642
1643	return 0;
1644}
1645
1646static long vfio_device_fops_unl_ioctl(struct file *filep,
1647				       unsigned int cmd, unsigned long arg)
1648{
1649	struct vfio_device *device = filep->private_data;
1650
1651	if (unlikely(!device->ops->ioctl))
1652		return -EINVAL;
1653
1654	return device->ops->ioctl(device->device_data, cmd, arg);
1655}
1656
1657static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1658				     size_t count, loff_t *ppos)
1659{
1660	struct vfio_device *device = filep->private_data;
1661
1662	if (unlikely(!device->ops->read))
1663		return -EINVAL;
1664
1665	return device->ops->read(device->device_data, buf, count, ppos);
1666}
1667
1668static ssize_t vfio_device_fops_write(struct file *filep,
1669				      const char __user *buf,
1670				      size_t count, loff_t *ppos)
1671{
1672	struct vfio_device *device = filep->private_data;
1673
1674	if (unlikely(!device->ops->write))
1675		return -EINVAL;
1676
1677	return device->ops->write(device->device_data, buf, count, ppos);
1678}
1679
1680static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1681{
1682	struct vfio_device *device = filep->private_data;
1683
1684	if (unlikely(!device->ops->mmap))
1685		return -EINVAL;
1686
1687	return device->ops->mmap(device->device_data, vma);
1688}
1689
1690#ifdef CONFIG_COMPAT
1691static long vfio_device_fops_compat_ioctl(struct file *filep,
1692					  unsigned int cmd, unsigned long arg)
1693{
1694	arg = (unsigned long)compat_ptr(arg);
1695	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1696}
1697#endif	/* CONFIG_COMPAT */
1698
1699static const struct file_operations vfio_device_fops = {
1700	.owner		= THIS_MODULE,
1701	.release	= vfio_device_fops_release,
1702	.read		= vfio_device_fops_read,
1703	.write		= vfio_device_fops_write,
1704	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1705#ifdef CONFIG_COMPAT
1706	.compat_ioctl	= vfio_device_fops_compat_ioctl,
1707#endif
1708	.mmap		= vfio_device_fops_mmap,
1709};
1710
1711/**
1712 * External user API, exported by symbols to be linked dynamically.
1713 *
1714 * The protocol includes:
1715 *  1. do normal VFIO init operation:
1716 *	- opening a new container;
1717 *	- attaching group(s) to it;
1718 *	- setting an IOMMU driver for a container.
1719 * When IOMMU is set for a container, all groups in it are
1720 * considered ready to use by an external user.
1721 *
1722 * 2. User space passes a group fd to an external user.
1723 * The external user calls vfio_group_get_external_user()
1724 * to verify that:
1725 *	- the group is initialized;
1726 *	- IOMMU is set for it.
1727 * If both checks passed, vfio_group_get_external_user()
1728 * increments the container user counter to prevent
1729 * the VFIO group from disposal before KVM exits.
1730 *
1731 * 3. The external user calls vfio_external_user_iommu_id()
1732 * to know an IOMMU ID.
1733 *
1734 * 4. When the external KVM finishes, it calls
1735 * vfio_group_put_external_user() to release the VFIO group.
1736 * This call decrements the container user counter.
1737 */
1738struct vfio_group *vfio_group_get_external_user(struct file *filep)
1739{
1740	struct vfio_group *group = filep->private_data;
1741	int ret;
1742
1743	if (filep->f_op != &vfio_group_fops)
1744		return ERR_PTR(-EINVAL);
1745
1746	ret = vfio_group_add_container_user(group);
1747	if (ret)
1748		return ERR_PTR(ret);
1749
1750	vfio_group_get(group);
1751
1752	return group;
1753}
1754EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1755
1756void vfio_group_put_external_user(struct vfio_group *group)
1757{
1758	vfio_group_try_dissolve_container(group);
1759	vfio_group_put(group);
1760}
1761EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1762
1763bool vfio_external_group_match_file(struct vfio_group *test_group,
1764				    struct file *filep)
1765{
1766	struct vfio_group *group = filep->private_data;
1767
1768	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1769}
1770EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1771
1772int vfio_external_user_iommu_id(struct vfio_group *group)
1773{
1774	return iommu_group_id(group->iommu_group);
1775}
1776EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1777
1778long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1779{
1780	return vfio_ioctl_check_extension(group->container, arg);
1781}
1782EXPORT_SYMBOL_GPL(vfio_external_check_extension);
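/*
 * Illustrative sketch (not part of this file): how an external user such as
 * the KVM-VFIO device consumes the protocol documented above.  The function
 * name is hypothetical; the real consumer lives in virt/kvm/vfio.c.
 */
static int example_use_group_fd(int fd)
{
	struct fd f = fdget(fd);
	struct vfio_group *group;
	int iommu_id;

	if (!f.file)
		return -EBADF;

	/* Steps 1-2: the fd must be an initialized VFIO group fd with an
	 * IOMMU set; this also takes a container user reference. */
	group = vfio_group_get_external_user(f.file);
	fdput(f);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* Step 3: learn the IOMMU group ID, e.g. for platform hooks */
	iommu_id = vfio_external_user_iommu_id(group);

	/* ... use the group ... */

	/* Step 4: drop the container user reference when finished */
	vfio_group_put_external_user(group);

	return iommu_id;
}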
1783
1784/**
1785 * Sub-module support
1786 */
1787/*
1788 * Helper for managing a buffer of info chain capabilities, allocate or
1789 * reallocate a buffer with additional @size, filling in @id and @version
1790 * of the capability.  A pointer to the new capability is returned.
1791 *
1792 * NB. The chain is based at the head of the buffer, so new entries are
1793 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1794 * next offsets prior to copying to the user buffer.
1795 */
1796struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1797					       size_t size, u16 id, u16 version)
1798{
1799	void *buf;
1800	struct vfio_info_cap_header *header, *tmp;
1801
1802	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1803	if (!buf) {
1804		kfree(caps->buf);
1805		caps->size = 0;
1806		return ERR_PTR(-ENOMEM);
1807	}
1808
1809	caps->buf = buf;
1810	header = buf + caps->size;
1811
1812	/* Eventually copied to user buffer, zero */
1813	memset(header, 0, size);
1814
1815	header->id = id;
1816	header->version = version;
1817
1818	/* Add to the end of the capability chain */
1819	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1820		; /* nothing */
1821
1822	tmp->next = caps->size;
1823	caps->size += size;
1824
1825	return header;
1826}
1827EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1828
1829void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1830{
1831	struct vfio_info_cap_header *tmp;
1832	void *buf = (void *)caps->buf;
1833
1834	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1835		tmp->next += offset;
1836}
1837EXPORT_SYMBOL(vfio_info_cap_shift);
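/*
 * Illustrative sketch (not part of this file), loosely modelled on how
 * vfio-pci fills VFIO_DEVICE_GET_REGION_INFO: capabilities are appended to
 * a local vfio_info_cap buffer and the chain offsets are shifted past the
 * fixed-size reply before copying to userspace.  The function name is
 * hypothetical and the capability-building step is elided.
 */
static long example_region_info(struct vfio_region_info *info,
				void __user *arg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };

	/* ... fill the fixed part of *info and append capabilities with
	 * vfio_info_cap_add()/vfio_info_add_capability() ... */

	if (caps.size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info->argsz < sizeof(*info) + caps.size) {
			/* Buffer too small: report the size that is needed */
			info->argsz = sizeof(*info) + caps.size;
			info->cap_offset = 0;
		} else {
			/* Make chain offsets relative to the user buffer */
			vfio_info_cap_shift(&caps, sizeof(*info));
			if (copy_to_user(arg + sizeof(*info), caps.buf,
					 caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info->cap_offset = sizeof(*info);
		}
		kfree(caps.buf);
	}

	return copy_to_user(arg, info, sizeof(*info)) ? -EFAULT : 0;
}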
1838
1839int vfio_info_add_capability(struct vfio_info_cap *caps,
1840			     struct vfio_info_cap_header *cap, size_t size)
1841{
1842	struct vfio_info_cap_header *header;
1843
1844	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1845	if (IS_ERR(header))
1846		return PTR_ERR(header);
1847
1848	memcpy(header + 1, cap + 1, size - sizeof(*header));
1849
1850	return 0;
1851}
1852EXPORT_SYMBOL(vfio_info_add_capability);
1853
1854int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1855				       int max_irq_type, size_t *data_size)
1856{
1857	unsigned long minsz;
1858	size_t size;
1859
1860	minsz = offsetofend(struct vfio_irq_set, count);
1861
1862	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1863	    (hdr->count >= (U32_MAX - hdr->start)) ||
1864	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1865				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1866		return -EINVAL;
1867
1868	if (data_size)
1869		*data_size = 0;
1870
1871	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1872		return -EINVAL;
1873
1874	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1875	case VFIO_IRQ_SET_DATA_NONE:
1876		size = 0;
1877		break;
1878	case VFIO_IRQ_SET_DATA_BOOL:
1879		size = sizeof(uint8_t);
1880		break;
1881	case VFIO_IRQ_SET_DATA_EVENTFD:
1882		size = sizeof(int32_t);
1883		break;
1884	default:
1885		return -EINVAL;
1886	}
1887
1888	if (size) {
1889		if (hdr->argsz - minsz < hdr->count * size)
1890			return -EINVAL;
1891
1892		if (!data_size)
1893			return -EINVAL;
1894
1895		*data_size = hdr->count * size;
1896	}
1897
1898	return 0;
1899}
1900EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
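/*
 * Editor's illustrative sketch, not part of the original file: typical use
 * of vfio_set_irqs_validate_and_prepare() in a bus driver's
 * VFIO_DEVICE_SET_IRQS handler.  'num_irqs' is the interrupt count for
 * hdr.index and the function name is hypothetical; the memdup_user() of
 * the trailing payload is sized by the helper.
 */
static long example_ioctl_set_irqs(unsigned long arg, int num_irqs)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	long ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
						 VFIO_PCI_NUM_IRQS, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* ... apply hdr.flags/index/start/count using data ... */

	kfree(data);
	return 0;
}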
1901
1902/*
1903 * Pin a set of guest PFNs and return their associated host PFNs for local
1904 * domain only.
1905 * @dev [in]     : device
1906 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1907 * @npage [in]   : count of elements in user_pfn array.  This count should not
 1908 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1909 * @prot [in]    : protection flags
1910 * @phys_pfn[out]: array of host PFNs
1911 * Return error or number of pages pinned.
1912 */
1913int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1914		   int prot, unsigned long *phys_pfn)
1915{
1916	struct vfio_container *container;
1917	struct vfio_group *group;
1918	struct vfio_iommu_driver *driver;
1919	int ret;
1920
1921	if (!dev || !user_pfn || !phys_pfn || !npage)
1922		return -EINVAL;
1923
1924	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1925		return -E2BIG;
1926
1927	group = vfio_group_get_from_dev(dev);
1928	if (!group)
1929		return -ENODEV;
1930
1931	ret = vfio_group_add_container_user(group);
1932	if (ret)
1933		goto err_pin_pages;
1934
1935	container = group->container;
1936	driver = container->iommu_driver;
1937	if (likely(driver && driver->ops->pin_pages))
1938		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1939					     npage, prot, phys_pfn);
1940	else
1941		ret = -ENOTTY;
1942
1943	vfio_group_try_dissolve_container(group);
1944
1945err_pin_pages:
1946	vfio_group_put(group);
1947	return ret;
1948}
1949EXPORT_SYMBOL(vfio_pin_pages);
1950
1951/*
1952 * Unpin set of host PFNs for local domain only.
1953 * @dev [in]     : device
1954 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1955 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1956 * @npage [in]   : count of elements in user_pfn array.  This count should not
1957 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1958 * Return error or number of pages unpinned.
1959 */
1960int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1961{
1962	struct vfio_container *container;
1963	struct vfio_group *group;
1964	struct vfio_iommu_driver *driver;
1965	int ret;
1966
1967	if (!dev || !user_pfn || !npage)
1968		return -EINVAL;
1969
1970	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1971		return -E2BIG;
1972
1973	group = vfio_group_get_from_dev(dev);
1974	if (!group)
1975		return -ENODEV;
1976
1977	ret = vfio_group_add_container_user(group);
1978	if (ret)
1979		goto err_unpin_pages;
1980
1981	container = group->container;
1982	driver = container->iommu_driver;
1983	if (likely(driver && driver->ops->unpin_pages))
1984		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1985					       npage);
1986	else
1987		ret = -ENOTTY;
1988
1989	vfio_group_try_dissolve_container(group);
1990
1991err_unpin_pages:
1992	vfio_group_put(group);
1993	return ret;
1994}
1995EXPORT_SYMBOL(vfio_unpin_pages);
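/*
 * Editor's illustrative sketch, not part of the original file: how an mdev
 * vendor driver might pin a single guest page for DMA with the helpers
 * above and unpin it when done.  'mdev_dev', 'gfn' and 'hpfn' are
 * hypothetical names; the prot flags come from <linux/iommu.h>.
 */
static int example_pin_one_page(struct device *mdev_dev, unsigned long gfn,
				unsigned long *hpfn)
{
	unsigned long user_pfn = gfn;
	int ret;

	ret = vfio_pin_pages(mdev_dev, &user_pfn, 1,
			     IOMMU_READ | IOMMU_WRITE, hpfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... program device DMA against the host PFN in *hpfn ... */

	return 0;
}

static void example_unpin_one_page(struct device *mdev_dev, unsigned long gfn)
{
	unsigned long user_pfn = gfn;

	vfio_unpin_pages(mdev_dev, &user_pfn, 1);
}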
1996
1997static int vfio_register_iommu_notifier(struct vfio_group *group,
1998					unsigned long *events,
1999					struct notifier_block *nb)
2000{
2001	struct vfio_container *container;
2002	struct vfio_iommu_driver *driver;
2003	int ret;
2004
2005	ret = vfio_group_add_container_user(group);
2006	if (ret)
2007		return -EINVAL;
2008
2009	container = group->container;
2010	driver = container->iommu_driver;
2011	if (likely(driver && driver->ops->register_notifier))
2012		ret = driver->ops->register_notifier(container->iommu_data,
2013						     events, nb);
2014	else
2015		ret = -ENOTTY;
2016
2017	vfio_group_try_dissolve_container(group);
2018
2019	return ret;
2020}
2021
2022static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2023					  struct notifier_block *nb)
2024{
2025	struct vfio_container *container;
2026	struct vfio_iommu_driver *driver;
2027	int ret;
2028
2029	ret = vfio_group_add_container_user(group);
2030	if (ret)
2031		return -EINVAL;
2032
2033	container = group->container;
2034	driver = container->iommu_driver;
2035	if (likely(driver && driver->ops->unregister_notifier))
2036		ret = driver->ops->unregister_notifier(container->iommu_data,
2037						       nb);
2038	else
2039		ret = -ENOTTY;
2040
2041	vfio_group_try_dissolve_container(group);
2042
2043	return ret;
2044}
2045
2046void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2047{
2048	group->kvm = kvm;
2049	blocking_notifier_call_chain(&group->notifier,
2050				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2051}
2052EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
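/*
 * Editor's illustrative sketch, not part of the original file: the kvm-vfio
 * pseudo device is the expected caller of vfio_group_set_kvm(), announcing
 * the association when a group is added to KVM and clearing it again so
 * that notified vendor drivers drop their kvm reference.
 */
static void example_kvm_attach(struct vfio_group *group, struct kvm *kvm)
{
	vfio_group_set_kvm(group, kvm);	/* fires VFIO_GROUP_NOTIFY_SET_KVM */
}

static void example_kvm_detach(struct vfio_group *group)
{
	vfio_group_set_kvm(group, NULL);
}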
2053
2054static int vfio_register_group_notifier(struct vfio_group *group,
2055					unsigned long *events,
2056					struct notifier_block *nb)
2057{
2058	int ret;
2059	bool set_kvm = false;
2060
2061	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2062		set_kvm = true;
2063
2064	/* clear known events */
2065	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2066
 2067	/* refuse to continue if any unknown events remain */
2068	if (*events)
2069		return -EINVAL;
2070
2071	ret = vfio_group_add_container_user(group);
2072	if (ret)
2073		return -EINVAL;
2074
2075	ret = blocking_notifier_chain_register(&group->notifier, nb);
2076
2077	/*
 2078	 * The attach of kvm to this vfio_group may have already happened, so
 2079	 * replay the event once upon registration.
2080	 */
2081	if (!ret && set_kvm && group->kvm)
2082		blocking_notifier_call_chain(&group->notifier,
2083					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2084
2085	vfio_group_try_dissolve_container(group);
2086
2087	return ret;
2088}
2089
2090static int vfio_unregister_group_notifier(struct vfio_group *group,
2091					 struct notifier_block *nb)
2092{
2093	int ret;
2094
2095	ret = vfio_group_add_container_user(group);
2096	if (ret)
2097		return -EINVAL;
2098
2099	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2100
2101	vfio_group_try_dissolve_container(group);
2102
2103	return ret;
2104}
2105
2106int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2107			   unsigned long *events, struct notifier_block *nb)
2108{
2109	struct vfio_group *group;
2110	int ret;
2111
2112	if (!dev || !nb || !events || (*events == 0))
2113		return -EINVAL;
2114
2115	group = vfio_group_get_from_dev(dev);
2116	if (!group)
2117		return -ENODEV;
2118
2119	switch (type) {
2120	case VFIO_IOMMU_NOTIFY:
2121		ret = vfio_register_iommu_notifier(group, events, nb);
2122		break;
2123	case VFIO_GROUP_NOTIFY:
2124		ret = vfio_register_group_notifier(group, events, nb);
2125		break;
2126	default:
2127		ret = -EINVAL;
2128	}
2129
2130	vfio_group_put(group);
2131	return ret;
2132}
2133EXPORT_SYMBOL(vfio_register_notifier);
2134
2135int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2136			     struct notifier_block *nb)
2137{
2138	struct vfio_group *group;
2139	int ret;
2140
2141	if (!dev || !nb)
2142		return -EINVAL;
2143
2144	group = vfio_group_get_from_dev(dev);
2145	if (!group)
2146		return -ENODEV;
2147
2148	switch (type) {
2149	case VFIO_IOMMU_NOTIFY:
2150		ret = vfio_unregister_iommu_notifier(group, nb);
2151		break;
2152	case VFIO_GROUP_NOTIFY:
2153		ret = vfio_unregister_group_notifier(group, nb);
2154		break;
2155	default:
2156		ret = -EINVAL;
2157	}
2158
2159	vfio_group_put(group);
2160	return ret;
2161}
2162EXPORT_SYMBOL(vfio_unregister_notifier);
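/*
 * Editor's illustrative sketch, not part of the original file: a vendor
 * driver registering for DMA unmap notifications on its device.  For
 * VFIO_IOMMU_NOTIFY_DMA_UNMAP the notifier data is a
 * struct vfio_iommu_type1_dma_unmap describing the range being unmapped.
 */
static int example_dma_unmap_cb(struct notifier_block *nb,
				unsigned long action, void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		pr_debug("unmap iova 0x%llx size 0x%llx\n",
			 unmap->iova, unmap->size);
		/* ... unpin anything previously pinned in that range ... */
	}

	return NOTIFY_OK;
}

static int example_register_unmap_notifier(struct device *dev,
					    struct notifier_block *nb)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	nb->notifier_call = example_dma_unmap_cb;
	return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, nb);
}

static void example_unregister_unmap_notifier(struct device *dev,
					       struct notifier_block *nb)
{
	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, nb);
}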
2163
2164/**
2165 * Module/class support
2166 */
2167static char *vfio_devnode(struct device *dev, umode_t *mode)
2168{
2169	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2170}
2171
2172static struct miscdevice vfio_dev = {
2173	.minor = VFIO_MINOR,
2174	.name = "vfio",
2175	.fops = &vfio_fops,
2176	.nodename = "vfio/vfio",
2177	.mode = S_IRUGO | S_IWUGO,
2178};
2179
2180static int __init vfio_init(void)
2181{
2182	int ret;
2183
2184	idr_init(&vfio.group_idr);
2185	mutex_init(&vfio.group_lock);
2186	mutex_init(&vfio.iommu_drivers_lock);
2187	INIT_LIST_HEAD(&vfio.group_list);
2188	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2189	init_waitqueue_head(&vfio.release_q);
2190
2191	ret = misc_register(&vfio_dev);
2192	if (ret) {
2193		pr_err("vfio: misc device register failed\n");
2194		return ret;
2195	}
2196
2197	/* /dev/vfio/$GROUP */
2198	vfio.class = class_create(THIS_MODULE, "vfio");
2199	if (IS_ERR(vfio.class)) {
2200		ret = PTR_ERR(vfio.class);
2201		goto err_class;
2202	}
2203
2204	vfio.class->devnode = vfio_devnode;
2205
2206	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2207	if (ret)
2208		goto err_alloc_chrdev;
2209
2210	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2211	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2212	if (ret)
2213		goto err_cdev_add;
2214
2215	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2216
2217#ifdef CONFIG_VFIO_NOIOMMU
2218	vfio_register_iommu_driver(&vfio_noiommu_ops);
2219#endif
2220	return 0;
2221
2222err_cdev_add:
2223	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2224err_alloc_chrdev:
2225	class_destroy(vfio.class);
2226	vfio.class = NULL;
2227err_class:
2228	misc_deregister(&vfio_dev);
2229	return ret;
2230}
2231
2232static void __exit vfio_cleanup(void)
2233{
2234	WARN_ON(!list_empty(&vfio.group_list));
2235
2236#ifdef CONFIG_VFIO_NOIOMMU
2237	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2238#endif
2239	idr_destroy(&vfio.group_idr);
2240	cdev_del(&vfio.group_cdev);
2241	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2242	class_destroy(vfio.class);
2243	vfio.class = NULL;
2244	misc_deregister(&vfio_dev);
2245}
2246
2247module_init(vfio_init);
2248module_exit(vfio_cleanup);
2249
2250MODULE_VERSION(DRIVER_VERSION);
2251MODULE_LICENSE("GPL v2");
2252MODULE_AUTHOR(DRIVER_AUTHOR);
2253MODULE_DESCRIPTION(DRIVER_DESC);
2254MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2255MODULE_ALIAS("devname:vfio/vfio");
2256MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION	"0.3"
  37#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC	"VFIO - User Level meta-driver"
  39
  40static struct vfio {
  41	struct class			*class;
  42	struct list_head		iommu_drivers_list;
  43	struct mutex			iommu_drivers_lock;
  44	struct list_head		group_list;
  45	struct idr			group_idr;
  46	struct mutex			group_lock;
  47	struct cdev			group_cdev;
  48	dev_t				group_devt;
  49} vfio;
  50
  51struct vfio_iommu_driver {
  52	const struct vfio_iommu_driver_ops	*ops;
  53	struct list_head			vfio_next;
  54};
  55
  56struct vfio_container {
  57	struct kref			kref;
  58	struct list_head		group_list;
  59	struct rw_semaphore		group_lock;
  60	struct vfio_iommu_driver	*iommu_driver;
  61	void				*iommu_data;
  62	bool				noiommu;
  63};
  64
  65struct vfio_unbound_dev {
  66	struct device			*dev;
  67	struct list_head		unbound_next;
  68};
  69
  70struct vfio_group {
  71	struct kref			kref;
  72	int				minor;
  73	atomic_t			container_users;
  74	struct iommu_group		*iommu_group;
  75	struct vfio_container		*container;
  76	struct list_head		device_list;
  77	struct mutex			device_lock;
  78	struct device			*dev;
  79	struct notifier_block		nb;
  80	struct list_head		vfio_next;
  81	struct list_head		container_next;
  82	struct list_head		unbound_list;
  83	struct mutex			unbound_lock;
  84	atomic_t			opened;
  85	wait_queue_head_t		container_q;
  86	bool				noiommu;
  87	unsigned int			dev_counter;
  88	struct kvm			*kvm;
  89	struct blocking_notifier_head	notifier;
  90};
  91
  92#ifdef CONFIG_VFIO_NOIOMMU
  93static bool noiommu __read_mostly;
  94module_param_named(enable_unsafe_noiommu_mode,
  95		   noiommu, bool, S_IRUGO | S_IWUSR);
  96MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  97#endif
  98
  99/*
 100 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 101 * and remove functions, any use cases other than acquiring the first
 102 * reference for the purpose of calling vfio_register_group_dev() or removing
 103 * that symmetric reference after vfio_unregister_group_dev() should use the raw
 104 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 105 * removes the device from the dummy group and cannot be nested.
 106 */
 107struct iommu_group *vfio_iommu_group_get(struct device *dev)
 108{
 109	struct iommu_group *group;
 110	int __maybe_unused ret;
 111
 112	group = iommu_group_get(dev);
 113
 114#ifdef CONFIG_VFIO_NOIOMMU
 115	/*
 116	 * With noiommu enabled, an IOMMU group will be created for a device
  117	 * that doesn't already have one and doesn't have an iommu_ops on its
 118	 * bus.  We set iommudata simply to be able to identify these groups
 119	 * as special use and for reclamation later.
 120	 */
 121	if (group || !noiommu || iommu_present(dev->bus))
 122		return group;
 123
 124	group = iommu_group_alloc();
 125	if (IS_ERR(group))
 126		return NULL;
 127
 128	iommu_group_set_name(group, "vfio-noiommu");
 129	iommu_group_set_iommudata(group, &noiommu, NULL);
 130	ret = iommu_group_add_device(group, dev);
 131	if (ret) {
 132		iommu_group_put(group);
 133		return NULL;
 134	}
 135
 136	/*
 137	 * Where to taint?  At this point we've added an IOMMU group for a
 138	 * device that is not backed by iommu_ops, therefore any iommu_
 139	 * callback using iommu_ops can legitimately Oops.  So, while we may
 140	 * be about to give a DMA capable device to a user without IOMMU
 141	 * protection, which is clearly taint-worthy, let's go ahead and do
 142	 * it here.
 143	 */
 144	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 145	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 146#endif
 147
 148	return group;
 149}
 150EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 151
 152void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 153{
 154#ifdef CONFIG_VFIO_NOIOMMU
 155	if (iommu_group_get_iommudata(group) == &noiommu)
 156		iommu_group_remove_device(dev);
 157#endif
 158
 159	iommu_group_put(group);
 160}
 161EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
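/*
 * Editor's illustrative sketch, not part of the original file: the intended
 * pairing of vfio_iommu_group_get()/vfio_iommu_group_put() in a bus
 * driver's probe and remove paths; the registration step between them is
 * elided.  Function names are hypothetical.
 */
static int example_probe_group_ref(struct device *dev)
{
	struct iommu_group *group = vfio_iommu_group_get(dev);

	if (!group)
		return -EINVAL;

	/* ... vfio_register_group_dev() and driver setup go here ... */
	return 0;
}

static void example_remove_group_ref(struct device *dev)
{
	/* drop the symmetric reference taken in probe */
	vfio_iommu_group_put(dev->iommu_group, dev);
}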
 162
 163#ifdef CONFIG_VFIO_NOIOMMU
 164static void *vfio_noiommu_open(unsigned long arg)
 165{
 166	if (arg != VFIO_NOIOMMU_IOMMU)
 167		return ERR_PTR(-EINVAL);
 168	if (!capable(CAP_SYS_RAWIO))
 169		return ERR_PTR(-EPERM);
 170
 171	return NULL;
 172}
 173
 174static void vfio_noiommu_release(void *iommu_data)
 175{
 176}
 177
 178static long vfio_noiommu_ioctl(void *iommu_data,
 179			       unsigned int cmd, unsigned long arg)
 180{
 181	if (cmd == VFIO_CHECK_EXTENSION)
 182		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 183
 184	return -ENOTTY;
 185}
 186
 187static int vfio_noiommu_attach_group(void *iommu_data,
 188				     struct iommu_group *iommu_group)
 189{
 190	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 191}
 192
 193static void vfio_noiommu_detach_group(void *iommu_data,
 194				      struct iommu_group *iommu_group)
 195{
 196}
 197
 198static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 199	.name = "vfio-noiommu",
 200	.owner = THIS_MODULE,
 201	.open = vfio_noiommu_open,
 202	.release = vfio_noiommu_release,
 203	.ioctl = vfio_noiommu_ioctl,
 204	.attach_group = vfio_noiommu_attach_group,
 205	.detach_group = vfio_noiommu_detach_group,
 206};
 207#endif
 208
 209
 210/**
 211 * IOMMU driver registration
 212 */
 213int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 214{
 215	struct vfio_iommu_driver *driver, *tmp;
 216
 217	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 218	if (!driver)
 219		return -ENOMEM;
 220
 221	driver->ops = ops;
 222
 223	mutex_lock(&vfio.iommu_drivers_lock);
 224
 225	/* Check for duplicates */
 226	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 227		if (tmp->ops == ops) {
 228			mutex_unlock(&vfio.iommu_drivers_lock);
 229			kfree(driver);
 230			return -EINVAL;
 231		}
 232	}
 233
 234	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 235
 236	mutex_unlock(&vfio.iommu_drivers_lock);
 237
 238	return 0;
 239}
 240EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 241
 242void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 243{
 244	struct vfio_iommu_driver *driver;
 245
 246	mutex_lock(&vfio.iommu_drivers_lock);
 247	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 248		if (driver->ops == ops) {
 249			list_del(&driver->vfio_next);
 250			mutex_unlock(&vfio.iommu_drivers_lock);
 251			kfree(driver);
 252			return;
 253		}
 254	}
 255	mutex_unlock(&vfio.iommu_drivers_lock);
 256}
 257EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
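/*
 * Editor's illustrative sketch, not part of the original file: the shape of
 * an IOMMU backend module registering its ops table, mirroring how
 * vfio_noiommu_ops above is wired up.  All callbacks here are hypothetical
 * stubs; real backends (e.g. vfio_iommu_type1) also implement mapping,
 * pinning and notifier support.
 */
static void *example_iommu_open(unsigned long arg)
{
	return NULL;
}

static void example_iommu_release(void *iommu_data)
{
}

static long example_iommu_ioctl(void *iommu_data,
				unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

static int example_iommu_attach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
	return 0;
}

static void example_iommu_detach_group(void *iommu_data,
				       struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops example_iommu_ops = {
	.name		= "vfio-example",
	.owner		= THIS_MODULE,
	.open		= example_iommu_open,
	.release	= example_iommu_release,
	.ioctl		= example_iommu_ioctl,
	.attach_group	= example_iommu_attach_group,
	.detach_group	= example_iommu_detach_group,
};

static int __init example_iommu_init(void)
{
	return vfio_register_iommu_driver(&example_iommu_ops);
}

static void __exit example_iommu_exit(void)
{
	vfio_unregister_iommu_driver(&example_iommu_ops);
}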
 258
 259/**
 260 * Group minor allocation/free - both called with vfio.group_lock held
 261 */
 262static int vfio_alloc_group_minor(struct vfio_group *group)
 263{
 264	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 265}
 266
 267static void vfio_free_group_minor(int minor)
 268{
 269	idr_remove(&vfio.group_idr, minor);
 270}
 271
 272static int vfio_iommu_group_notifier(struct notifier_block *nb,
 273				     unsigned long action, void *data);
 274static void vfio_group_get(struct vfio_group *group);
 275
 276/**
 277 * Container objects - containers are created when /dev/vfio/vfio is
 278 * opened, but their lifecycle extends until the last user is done, so
 279 * it's freed via kref.  Must support container/group/device being
 280 * closed in any order.
 281 */
 282static void vfio_container_get(struct vfio_container *container)
 283{
 284	kref_get(&container->kref);
 285}
 286
 287static void vfio_container_release(struct kref *kref)
 288{
 289	struct vfio_container *container;
 290	container = container_of(kref, struct vfio_container, kref);
 291
 292	kfree(container);
 293}
 294
 295static void vfio_container_put(struct vfio_container *container)
 296{
 297	kref_put(&container->kref, vfio_container_release);
 298}
 299
 300static void vfio_group_unlock_and_free(struct vfio_group *group)
 301{
 302	mutex_unlock(&vfio.group_lock);
 303	/*
 304	 * Unregister outside of lock.  A spurious callback is harmless now
 305	 * that the group is no longer in vfio.group_list.
 306	 */
 307	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 308	kfree(group);
 309}
 310
 311/**
 312 * Group objects - create, release, get, put, search
 313 */
 314static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 315{
 316	struct vfio_group *group, *tmp;
 317	struct device *dev;
 318	int ret, minor;
 319
 320	group = kzalloc(sizeof(*group), GFP_KERNEL);
 321	if (!group)
 322		return ERR_PTR(-ENOMEM);
 323
 324	kref_init(&group->kref);
 325	INIT_LIST_HEAD(&group->device_list);
 326	mutex_init(&group->device_lock);
 327	INIT_LIST_HEAD(&group->unbound_list);
 328	mutex_init(&group->unbound_lock);
 329	atomic_set(&group->container_users, 0);
 330	atomic_set(&group->opened, 0);
 331	init_waitqueue_head(&group->container_q);
 332	group->iommu_group = iommu_group;
 333#ifdef CONFIG_VFIO_NOIOMMU
 334	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 335#endif
 336	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 337
 338	group->nb.notifier_call = vfio_iommu_group_notifier;
 339
 340	/*
 341	 * blocking notifiers acquire a rwsem around registering and hold
 342	 * it around callback.  Therefore, need to register outside of
 343	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 344	 * do anything unless it can find the group in vfio.group_list, so
 345	 * no harm in registering early.
 346	 */
 347	ret = iommu_group_register_notifier(iommu_group, &group->nb);
 348	if (ret) {
 349		kfree(group);
 350		return ERR_PTR(ret);
 351	}
 352
 353	mutex_lock(&vfio.group_lock);
 354
 355	/* Did we race creating this group? */
 356	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 357		if (tmp->iommu_group == iommu_group) {
 358			vfio_group_get(tmp);
 359			vfio_group_unlock_and_free(group);
 360			return tmp;
 361		}
 362	}
 363
 364	minor = vfio_alloc_group_minor(group);
 365	if (minor < 0) {
 366		vfio_group_unlock_and_free(group);
 367		return ERR_PTR(minor);
 368	}
 369
 370	dev = device_create(vfio.class, NULL,
 371			    MKDEV(MAJOR(vfio.group_devt), minor),
 372			    group, "%s%d", group->noiommu ? "noiommu-" : "",
 373			    iommu_group_id(iommu_group));
 374	if (IS_ERR(dev)) {
 375		vfio_free_group_minor(minor);
 376		vfio_group_unlock_and_free(group);
 377		return ERR_CAST(dev);
 378	}
 379
 380	group->minor = minor;
 381	group->dev = dev;
 382
 383	list_add(&group->vfio_next, &vfio.group_list);
 384
 385	mutex_unlock(&vfio.group_lock);
 386
 387	return group;
 388}
 389
 390/* called with vfio.group_lock held */
 391static void vfio_group_release(struct kref *kref)
 392{
 393	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 394	struct vfio_unbound_dev *unbound, *tmp;
 395	struct iommu_group *iommu_group = group->iommu_group;
 396
 397	WARN_ON(!list_empty(&group->device_list));
 398	WARN_ON(group->notifier.head);
 399
 400	list_for_each_entry_safe(unbound, tmp,
 401				 &group->unbound_list, unbound_next) {
 402		list_del(&unbound->unbound_next);
 403		kfree(unbound);
 404	}
 405
 406	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 407	list_del(&group->vfio_next);
 408	vfio_free_group_minor(group->minor);
 409	vfio_group_unlock_and_free(group);
 410	iommu_group_put(iommu_group);
 411}
 412
 413static void vfio_group_put(struct vfio_group *group)
 414{
 415	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 416}
 417
 418struct vfio_group_put_work {
 419	struct work_struct work;
 420	struct vfio_group *group;
 421};
 422
 423static void vfio_group_put_bg(struct work_struct *work)
 424{
 425	struct vfio_group_put_work *do_work;
 426
 427	do_work = container_of(work, struct vfio_group_put_work, work);
 428
 429	vfio_group_put(do_work->group);
 430	kfree(do_work);
 431}
 432
 433static void vfio_group_schedule_put(struct vfio_group *group)
 434{
 435	struct vfio_group_put_work *do_work;
 436
 437	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 438	if (WARN_ON(!do_work))
 439		return;
 440
 441	INIT_WORK(&do_work->work, vfio_group_put_bg);
 442	do_work->group = group;
 443	schedule_work(&do_work->work);
 444}
 445
 446/* Assume group_lock or group reference is held */
 447static void vfio_group_get(struct vfio_group *group)
 448{
 449	kref_get(&group->kref);
 450}
 451
 452/*
 453 * Not really a try as we will sleep for mutex, but we need to make
 454 * sure the group pointer is valid under lock and get a reference.
 455 */
 456static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 457{
 458	struct vfio_group *target = group;
 459
 460	mutex_lock(&vfio.group_lock);
 461	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 462		if (group == target) {
 463			vfio_group_get(group);
 464			mutex_unlock(&vfio.group_lock);
 465			return group;
 466		}
 467	}
 468	mutex_unlock(&vfio.group_lock);
 469
 470	return NULL;
 471}
 472
 473static
 474struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 475{
 476	struct vfio_group *group;
 477
 478	mutex_lock(&vfio.group_lock);
 479	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 480		if (group->iommu_group == iommu_group) {
 481			vfio_group_get(group);
 482			mutex_unlock(&vfio.group_lock);
 483			return group;
 484		}
 485	}
 486	mutex_unlock(&vfio.group_lock);
 487
 488	return NULL;
 489}
 490
 491static struct vfio_group *vfio_group_get_from_minor(int minor)
 492{
 493	struct vfio_group *group;
 494
 495	mutex_lock(&vfio.group_lock);
 496	group = idr_find(&vfio.group_idr, minor);
 497	if (!group) {
 498		mutex_unlock(&vfio.group_lock);
 499		return NULL;
 500	}
 501	vfio_group_get(group);
 502	mutex_unlock(&vfio.group_lock);
 503
 504	return group;
 505}
 506
 507static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 508{
 509	struct iommu_group *iommu_group;
 510	struct vfio_group *group;
 511
 512	iommu_group = iommu_group_get(dev);
 513	if (!iommu_group)
 514		return NULL;
 515
 516	group = vfio_group_get_from_iommu(iommu_group);
 517	iommu_group_put(iommu_group);
 518
 519	return group;
 520}
 521
 522/**
 523 * Device objects - create, release, get, put, search
 524 */
 525/* Device reference always implies a group reference */
 526void vfio_device_put(struct vfio_device *device)
 527{
 528	if (refcount_dec_and_test(&device->refcount))
 529		complete(&device->comp);
 530}
 531EXPORT_SYMBOL_GPL(vfio_device_put);
 532
 533static bool vfio_device_try_get(struct vfio_device *device)
 534{
 535	return refcount_inc_not_zero(&device->refcount);
 536}
 537
 538static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 539						 struct device *dev)
 540{
 541	struct vfio_device *device;
 542
 543	mutex_lock(&group->device_lock);
 544	list_for_each_entry(device, &group->device_list, group_next) {
 545		if (device->dev == dev && vfio_device_try_get(device)) {
 546			mutex_unlock(&group->device_lock);
 547			return device;
 548		}
 549	}
 550	mutex_unlock(&group->device_lock);
 551	return NULL;
 552}
 553
 554/*
 555 * Some drivers, like pci-stub, are only used to prevent other drivers from
 556 * claiming a device and are therefore perfectly legitimate for a user owned
 557 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 558 * of the device, but it does prevent the user from having direct access to
 559 * the device, which is useful in some circumstances.
 560 *
 561 * We also assume that we can include PCI interconnect devices, ie. bridges.
 562 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 563 * then all of the downstream devices will be part of the same IOMMU group as
 564 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 565 * breaks anything, it only does so for user owned devices downstream.  Note
 566 * that error notification via MSI can be affected for platforms that handle
 567 * MSI within the same IOVA space as DMA.
 568 */
 569static const char * const vfio_driver_allowed[] = { "pci-stub" };
 570
 571static bool vfio_dev_driver_allowed(struct device *dev,
 572				    struct device_driver *drv)
 573{
 574	if (dev_is_pci(dev)) {
 575		struct pci_dev *pdev = to_pci_dev(dev);
 576
 577		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 578			return true;
 579	}
 580
 581	return match_string(vfio_driver_allowed,
 582			    ARRAY_SIZE(vfio_driver_allowed),
 583			    drv->name) >= 0;
 584}
 585
 586/*
 587 * A vfio group is viable for use by userspace if all devices are in
 588 * one of the following states:
 589 *  - driver-less
 590 *  - bound to a vfio driver
 591 *  - bound to an otherwise allowed driver
 592 *  - a PCI interconnect device
 593 *
 594 * We use two methods to determine whether a device is bound to a vfio
 595 * driver.  The first is to test whether the device exists in the vfio
 596 * group.  The second is to test if the device exists on the group
 597 * unbound_list, indicating it's in the middle of transitioning from
 598 * a vfio driver to driver-less.
 599 */
 600static int vfio_dev_viable(struct device *dev, void *data)
 601{
 602	struct vfio_group *group = data;
 603	struct vfio_device *device;
 604	struct device_driver *drv = READ_ONCE(dev->driver);
 605	struct vfio_unbound_dev *unbound;
 606	int ret = -EINVAL;
 607
 608	mutex_lock(&group->unbound_lock);
 609	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 610		if (dev == unbound->dev) {
 611			ret = 0;
 612			break;
 613		}
 614	}
 615	mutex_unlock(&group->unbound_lock);
 616
 617	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
 618		return 0;
 619
 620	device = vfio_group_get_device(group, dev);
 621	if (device) {
 622		vfio_device_put(device);
 623		return 0;
 624	}
 625
 626	return ret;
 627}
 628
 629/**
 630 * Async device support
 631 */
 632static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 633{
 634	struct vfio_device *device;
 635
 636	/* Do we already know about it?  We shouldn't */
 637	device = vfio_group_get_device(group, dev);
 638	if (WARN_ON_ONCE(device)) {
 639		vfio_device_put(device);
 640		return 0;
 641	}
 642
 643	/* Nothing to do for idle groups */
 644	if (!atomic_read(&group->container_users))
 645		return 0;
 646
 647	/* TODO Prevent device auto probing */
 648	dev_WARN(dev, "Device added to live group %d!\n",
 649		 iommu_group_id(group->iommu_group));
 650
 651	return 0;
 652}
 653
 654static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 655{
 656	/* We don't care what happens when the group isn't in use */
 657	if (!atomic_read(&group->container_users))
 658		return 0;
 659
 660	return vfio_dev_viable(dev, group);
 661}
 662
 663static int vfio_iommu_group_notifier(struct notifier_block *nb,
 664				     unsigned long action, void *data)
 665{
 666	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 667	struct device *dev = data;
 668	struct vfio_unbound_dev *unbound;
 669
 670	/*
 671	 * Need to go through a group_lock lookup to get a reference or we
 672	 * risk racing a group being removed.  Ignore spurious notifies.
 673	 */
 674	group = vfio_group_try_get(group);
 675	if (!group)
 676		return NOTIFY_OK;
 677
 678	switch (action) {
 679	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 680		vfio_group_nb_add_dev(group, dev);
 681		break;
 682	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 683		/*
 684		 * Nothing to do here.  If the device is in use, then the
 685		 * vfio sub-driver should block the remove callback until
 686		 * it is unused.  If the device is unused or attached to a
 687		 * stub driver, then it should be released and we don't
 688		 * care that it will be going away.
 689		 */
 690		break;
 691	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 692		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 693			iommu_group_id(group->iommu_group));
 694		break;
 695	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 696		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 697			iommu_group_id(group->iommu_group), dev->driver->name);
 698		BUG_ON(vfio_group_nb_verify(group, dev));
 699		break;
 700	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 701		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 702			__func__, iommu_group_id(group->iommu_group),
 703			dev->driver->name);
 704		break;
 705	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 706		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 707			iommu_group_id(group->iommu_group));
 708		/*
 709		 * XXX An unbound device in a live group is ok, but we'd
 710		 * really like to avoid the above BUG_ON by preventing other
 711		 * drivers from binding to it.  Once that occurs, we have to
 712		 * stop the system to maintain isolation.  At a minimum, we'd
 713		 * want a toggle to disable driver auto probe for this device.
 714		 */
 715
 716		mutex_lock(&group->unbound_lock);
 717		list_for_each_entry(unbound,
 718				    &group->unbound_list, unbound_next) {
 719			if (dev == unbound->dev) {
 720				list_del(&unbound->unbound_next);
 721				kfree(unbound);
 722				break;
 723			}
 724		}
 725		mutex_unlock(&group->unbound_lock);
 726		break;
 727	}
 728
 729	/*
 730	 * If we're the last reference to the group, the group will be
 731	 * released, which includes unregistering the iommu group notifier.
 732	 * We hold a read-lock on that notifier list, unregistering needs
 733	 * a write-lock... deadlock.  Release our reference asynchronously
 734	 * to avoid that situation.
 735	 */
 736	vfio_group_schedule_put(group);
 737	return NOTIFY_OK;
 738}
 739
 740/**
 741 * VFIO driver API
 742 */
 743void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 744			 const struct vfio_device_ops *ops)
 745{
 746	init_completion(&device->comp);
 747	device->dev = dev;
 748	device->ops = ops;
 749}
 750EXPORT_SYMBOL_GPL(vfio_init_group_dev);
 751
 752int vfio_register_group_dev(struct vfio_device *device)
 753{
 754	struct vfio_device *existing_device;
 755	struct iommu_group *iommu_group;
 756	struct vfio_group *group;
 757
 758	iommu_group = iommu_group_get(device->dev);
 759	if (!iommu_group)
 760		return -EINVAL;
 761
 762	group = vfio_group_get_from_iommu(iommu_group);
 763	if (!group) {
 764		group = vfio_create_group(iommu_group);
 765		if (IS_ERR(group)) {
 766			iommu_group_put(iommu_group);
 767			return PTR_ERR(group);
 768		}
 769	} else {
 770		/*
 771		 * A found vfio_group already holds a reference to the
 772		 * iommu_group.  A created vfio_group keeps the reference.
 773		 */
 774		iommu_group_put(iommu_group);
 775	}
 776
 777	existing_device = vfio_group_get_device(group, device->dev);
 778	if (existing_device) {
 779		dev_WARN(device->dev, "Device already exists on group %d\n",
 780			 iommu_group_id(iommu_group));
 781		vfio_device_put(existing_device);
 782		vfio_group_put(group);
 783		return -EBUSY;
 784	}
 785
 786	/* Our reference on group is moved to the device */
 787	device->group = group;
 788
 789	/* Refcounting can't start until the driver calls register */
 790	refcount_set(&device->refcount, 1);
 791
 792	mutex_lock(&group->device_lock);
 793	list_add(&device->group_next, &group->device_list);
 794	group->dev_counter++;
 795	mutex_unlock(&group->device_lock);
 796
 797	return 0;
 798}
 799EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 800
 801/**
 802 * Get a reference to the vfio_device for a device.  Even if the
 803 * caller thinks they own the device, they could be racing with a
 804 * release call path, so we can't trust drvdata for the shortcut.
 805 * Go the long way around, from the iommu_group to the vfio_group
 806 * to the vfio_device.
 807 */
 808struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 809{
 810	struct vfio_group *group;
 811	struct vfio_device *device;
 812
 813	group = vfio_group_get_from_dev(dev);
 814	if (!group)
 815		return NULL;
 816
 817	device = vfio_group_get_device(group, dev);
 818	vfio_group_put(group);
 819
 820	return device;
 821}
 822EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 823
 824static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 825						     char *buf)
 826{
 827	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 828
 829	mutex_lock(&group->device_lock);
 830	list_for_each_entry(it, &group->device_list, group_next) {
 831		int ret;
 832
 833		if (it->ops->match) {
 834			ret = it->ops->match(it, buf);
 835			if (ret < 0) {
 836				device = ERR_PTR(ret);
 837				break;
 838			}
 839		} else {
 840			ret = !strcmp(dev_name(it->dev), buf);
 841		}
 842
 843		if (ret && vfio_device_try_get(it)) {
 844			device = it;
 845			break;
 846		}
 847	}
 848	mutex_unlock(&group->device_lock);
 849
 850	return device;
 851}
 852
 853/*
 854 * Decrement the device reference count and wait for the device to be
 855 * removed.  Open file descriptors for the device... */
 856void vfio_unregister_group_dev(struct vfio_device *device)
 857{
 858	struct vfio_group *group = device->group;
 859	struct vfio_unbound_dev *unbound;
 860	unsigned int i = 0;
 861	bool interrupted = false;
 862	long rc;
 863
 864	/*
 865	 * When the device is removed from the group, the group suddenly
 866	 * becomes non-viable; the device has a driver (until the unbind
 867	 * completes), but it's not present in the group.  This is bad news
 868	 * for any external users that need to re-acquire a group reference
 869	 * in order to match and release their existing reference.  To
 870	 * solve this, we track such devices on the unbound_list to bridge
 871	 * the gap until they're fully unbound.
 872	 */
 873	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 874	if (unbound) {
 875		unbound->dev = device->dev;
 876		mutex_lock(&group->unbound_lock);
 877		list_add(&unbound->unbound_next, &group->unbound_list);
 878		mutex_unlock(&group->unbound_lock);
 879	}
 880	WARN_ON(!unbound);
 881
 882	vfio_device_put(device);
 883	rc = try_wait_for_completion(&device->comp);
 884	while (rc <= 0) {
 885		if (device->ops->request)
 886			device->ops->request(device, i++);
 887
 888		if (interrupted) {
 889			rc = wait_for_completion_timeout(&device->comp,
 890							 HZ * 10);
 891		} else {
 892			rc = wait_for_completion_interruptible_timeout(
 893				&device->comp, HZ * 10);
 894			if (rc < 0) {
 895				interrupted = true;
 896				dev_warn(device->dev,
 897					 "Device is currently in use, task"
 898					 " \"%s\" (%d) "
 899					 "blocked until device is released",
 900					 current->comm, task_pid_nr(current));
 901			}
 902		}
 903	}
 904
 905	mutex_lock(&group->device_lock);
 906	list_del(&device->group_next);
 907	group->dev_counter--;
 908	mutex_unlock(&group->device_lock);
 909
 910	/*
 911	 * In order to support multiple devices per group, devices can be
 912	 * plucked from the group while other devices in the group are still
 913	 * in use.  The container persists with this group and those remaining
 914	 * devices still attached.  If the user creates an isolation violation
 915	 * by binding this device to another driver while the group is still in
 916	 * use, that's their fault.  However, in the case of removing the last,
 917	 * or potentially the only, device in the group there can be no other
 918	 * in-use devices in the group.  The user has done their due diligence
 919	 * and we should lay no claims to those devices.  In order to do that,
 920	 * we need to make sure the group is detached from the container.
 921	 * Without this stall, we're potentially racing with a user process
 922	 * that may attempt to immediately bind this device to another driver.
 923	 */
 924	if (list_empty(&group->device_list))
 925		wait_event(group->container_q, !group->container);
 926
 927	/* Matches the get in vfio_register_group_dev() */
 928	vfio_group_put(group);
 929}
 930EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
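/*
 * Editor's illustrative sketch, not part of the original file: the expected
 * driver-side lifecycle around vfio_init_group_dev(),
 * vfio_register_group_dev() and vfio_unregister_group_dev().  The
 * embedding structure and the ops table passed in are hypothetical.
 */
struct example_vfio_dev {
	struct vfio_device vdev;	/* must stay valid until unregister */
};

static int example_dev_probe(struct device *dev,
			     const struct vfio_device_ops *ops)
{
	struct example_vfio_dev *edev;
	int ret;

	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
	if (!edev)
		return -ENOMEM;

	vfio_init_group_dev(&edev->vdev, dev, ops);
	ret = vfio_register_group_dev(&edev->vdev);
	if (ret) {
		kfree(edev);
		return ret;
	}

	dev_set_drvdata(dev, edev);
	return 0;
}

static void example_dev_remove(struct device *dev)
{
	struct example_vfio_dev *edev = dev_get_drvdata(dev);

	/* blocks until all open device file descriptors are released */
	vfio_unregister_group_dev(&edev->vdev);
	kfree(edev);
}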
 931
 932/**
 933 * VFIO base fd, /dev/vfio/vfio
 934 */
 935static long vfio_ioctl_check_extension(struct vfio_container *container,
 936				       unsigned long arg)
 937{
 938	struct vfio_iommu_driver *driver;
 939	long ret = 0;
 940
 941	down_read(&container->group_lock);
 942
 943	driver = container->iommu_driver;
 944
 945	switch (arg) {
 946		/* No base extensions yet */
 947	default:
 948		/*
 949		 * If no driver is set, poll all registered drivers for
 950		 * extensions and return the first positive result.  If
 951		 * a driver is already set, further queries will be passed
 952		 * only to that driver.
 953		 */
 954		if (!driver) {
 955			mutex_lock(&vfio.iommu_drivers_lock);
 956			list_for_each_entry(driver, &vfio.iommu_drivers_list,
 957					    vfio_next) {
 958
 959#ifdef CONFIG_VFIO_NOIOMMU
 960				if (!list_empty(&container->group_list) &&
 961				    (container->noiommu !=
 962				     (driver->ops == &vfio_noiommu_ops)))
 963					continue;
 964#endif
 965
 966				if (!try_module_get(driver->ops->owner))
 967					continue;
 968
 969				ret = driver->ops->ioctl(NULL,
 970							 VFIO_CHECK_EXTENSION,
 971							 arg);
 972				module_put(driver->ops->owner);
 973				if (ret > 0)
 974					break;
 975			}
 976			mutex_unlock(&vfio.iommu_drivers_lock);
 977		} else
 978			ret = driver->ops->ioctl(container->iommu_data,
 979						 VFIO_CHECK_EXTENSION, arg);
 980	}
 981
 982	up_read(&container->group_lock);
 983
 984	return ret;
 985}
 986
 987/* hold write lock on container->group_lock */
 988static int __vfio_container_attach_groups(struct vfio_container *container,
 989					  struct vfio_iommu_driver *driver,
 990					  void *data)
 991{
 992	struct vfio_group *group;
 993	int ret = -ENODEV;
 994
 995	list_for_each_entry(group, &container->group_list, container_next) {
 996		ret = driver->ops->attach_group(data, group->iommu_group);
 997		if (ret)
 998			goto unwind;
 999	}
1000
1001	return ret;
1002
1003unwind:
1004	list_for_each_entry_continue_reverse(group, &container->group_list,
1005					     container_next) {
1006		driver->ops->detach_group(data, group->iommu_group);
1007	}
1008
1009	return ret;
1010}
1011
1012static long vfio_ioctl_set_iommu(struct vfio_container *container,
1013				 unsigned long arg)
1014{
1015	struct vfio_iommu_driver *driver;
1016	long ret = -ENODEV;
1017
1018	down_write(&container->group_lock);
1019
1020	/*
1021	 * The container is designed to be an unprivileged interface while
1022	 * the group can be assigned to specific users.  Therefore, only by
1023	 * adding a group to a container does the user get the privilege of
1024	 * enabling the iommu, which may allocate finite resources.  There
1025	 * is no unset_iommu, but by removing all the groups from a container,
1026	 * the container is deprivileged and returns to an unset state.
1027	 */
1028	if (list_empty(&container->group_list) || container->iommu_driver) {
1029		up_write(&container->group_lock);
1030		return -EINVAL;
1031	}
1032
1033	mutex_lock(&vfio.iommu_drivers_lock);
1034	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1035		void *data;
1036
1037#ifdef CONFIG_VFIO_NOIOMMU
1038		/*
1039		 * Only noiommu containers can use vfio-noiommu and noiommu
1040		 * containers can only use vfio-noiommu.
1041		 */
1042		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1043			continue;
1044#endif
1045
1046		if (!try_module_get(driver->ops->owner))
1047			continue;
1048
1049		/*
1050		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1051		 * so test which iommu driver reported support for this
1052		 * extension and call open on them.  We also pass them the
1053		 * magic, allowing a single driver to support multiple
1054		 * interfaces if they'd like.
1055		 */
1056		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1057			module_put(driver->ops->owner);
1058			continue;
1059		}
1060
1061		data = driver->ops->open(arg);
1062		if (IS_ERR(data)) {
1063			ret = PTR_ERR(data);
1064			module_put(driver->ops->owner);
1065			continue;
1066		}
1067
1068		ret = __vfio_container_attach_groups(container, driver, data);
1069		if (ret) {
1070			driver->ops->release(data);
1071			module_put(driver->ops->owner);
1072			continue;
1073		}
1074
1075		container->iommu_driver = driver;
1076		container->iommu_data = data;
1077		break;
1078	}
1079
1080	mutex_unlock(&vfio.iommu_drivers_lock);
1081	up_write(&container->group_lock);
1082
1083	return ret;
1084}
1085
1086static long vfio_fops_unl_ioctl(struct file *filep,
1087				unsigned int cmd, unsigned long arg)
1088{
1089	struct vfio_container *container = filep->private_data;
1090	struct vfio_iommu_driver *driver;
1091	void *data;
1092	long ret = -EINVAL;
1093
1094	if (!container)
1095		return ret;
1096
1097	switch (cmd) {
1098	case VFIO_GET_API_VERSION:
1099		ret = VFIO_API_VERSION;
1100		break;
1101	case VFIO_CHECK_EXTENSION:
1102		ret = vfio_ioctl_check_extension(container, arg);
1103		break;
1104	case VFIO_SET_IOMMU:
1105		ret = vfio_ioctl_set_iommu(container, arg);
1106		break;
1107	default:
1108		driver = container->iommu_driver;
1109		data = container->iommu_data;
1110
1111		if (driver) /* passthrough all unrecognized ioctls */
1112			ret = driver->ops->ioctl(data, cmd, arg);
1113	}
1114
1115	return ret;
1116}
1117
1118static int vfio_fops_open(struct inode *inode, struct file *filep)
1119{
1120	struct vfio_container *container;
1121
1122	container = kzalloc(sizeof(*container), GFP_KERNEL);
1123	if (!container)
1124		return -ENOMEM;
1125
1126	INIT_LIST_HEAD(&container->group_list);
1127	init_rwsem(&container->group_lock);
1128	kref_init(&container->kref);
1129
1130	filep->private_data = container;
1131
1132	return 0;
1133}
1134
1135static int vfio_fops_release(struct inode *inode, struct file *filep)
1136{
1137	struct vfio_container *container = filep->private_data;
1138	struct vfio_iommu_driver *driver = container->iommu_driver;
1139
1140	if (driver && driver->ops->notify)
1141		driver->ops->notify(container->iommu_data,
1142				    VFIO_IOMMU_CONTAINER_CLOSE);
1143
1144	filep->private_data = NULL;
1145
1146	vfio_container_put(container);
1147
1148	return 0;
1149}
1150
1151/*
1152 * Once an iommu driver is set, we optionally pass read/write/mmap
1153 * on to the driver, allowing management interfaces beyond ioctl.
1154 */
1155static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1156			      size_t count, loff_t *ppos)
1157{
1158	struct vfio_container *container = filep->private_data;
1159	struct vfio_iommu_driver *driver;
1160	ssize_t ret = -EINVAL;
1161
1162	driver = container->iommu_driver;
1163	if (likely(driver && driver->ops->read))
1164		ret = driver->ops->read(container->iommu_data,
1165					buf, count, ppos);
1166
1167	return ret;
1168}
1169
1170static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1171			       size_t count, loff_t *ppos)
1172{
1173	struct vfio_container *container = filep->private_data;
1174	struct vfio_iommu_driver *driver;
1175	ssize_t ret = -EINVAL;
1176
1177	driver = container->iommu_driver;
1178	if (likely(driver && driver->ops->write))
1179		ret = driver->ops->write(container->iommu_data,
1180					 buf, count, ppos);
1181
1182	return ret;
1183}
1184
1185static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1186{
1187	struct vfio_container *container = filep->private_data;
1188	struct vfio_iommu_driver *driver;
1189	int ret = -EINVAL;
1190
1191	driver = container->iommu_driver;
1192	if (likely(driver && driver->ops->mmap))
1193		ret = driver->ops->mmap(container->iommu_data, vma);
1194
1195	return ret;
1196}
1197
1198static const struct file_operations vfio_fops = {
1199	.owner		= THIS_MODULE,
1200	.open		= vfio_fops_open,
1201	.release	= vfio_fops_release,
1202	.read		= vfio_fops_read,
1203	.write		= vfio_fops_write,
1204	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1205	.compat_ioctl	= compat_ptr_ioctl,
1206	.mmap		= vfio_fops_mmap,
1207};
1208
1209/**
1210 * VFIO Group fd, /dev/vfio/$GROUP
1211 */
1212static void __vfio_group_unset_container(struct vfio_group *group)
1213{
1214	struct vfio_container *container = group->container;
1215	struct vfio_iommu_driver *driver;
1216
1217	down_write(&container->group_lock);
1218
1219	driver = container->iommu_driver;
1220	if (driver)
1221		driver->ops->detach_group(container->iommu_data,
1222					  group->iommu_group);
1223
1224	group->container = NULL;
1225	wake_up(&group->container_q);
1226	list_del(&group->container_next);
1227
1228	/* Detaching the last group deprivileges a container, remove iommu */
1229	if (driver && list_empty(&container->group_list)) {
1230		driver->ops->release(container->iommu_data);
1231		module_put(driver->ops->owner);
1232		container->iommu_driver = NULL;
1233		container->iommu_data = NULL;
1234	}
1235
1236	up_write(&container->group_lock);
1237
1238	vfio_container_put(container);
1239}
1240
1241/*
1242 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1243 * if there was no container to unset.  Since the ioctl is called on
 1244 * the group, we know it still exists, therefore the only valid
1245 * transition here is 1->0.
1246 */
1247static int vfio_group_unset_container(struct vfio_group *group)
1248{
1249	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1250
1251	if (!users)
1252		return -EINVAL;
1253	if (users != 1)
1254		return -EBUSY;
1255
1256	__vfio_group_unset_container(group);
1257
1258	return 0;
1259}
1260
1261/*
1262 * When removing container users, anything that removes the last user
1263 * implicitly removes the group from the container.  That is, if the
1264 * group file descriptor is closed, as well as any device file descriptors,
1265 * the group is free.
1266 */
1267static void vfio_group_try_dissolve_container(struct vfio_group *group)
1268{
1269	if (0 == atomic_dec_if_positive(&group->container_users))
1270		__vfio_group_unset_container(group);
1271}
1272
1273static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1274{
1275	struct fd f;
1276	struct vfio_container *container;
1277	struct vfio_iommu_driver *driver;
1278	int ret = 0;
1279
1280	if (atomic_read(&group->container_users))
1281		return -EINVAL;
1282
1283	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1284		return -EPERM;
1285
1286	f = fdget(container_fd);
1287	if (!f.file)
1288		return -EBADF;
1289
1290	/* Sanity check, is this really our fd? */
1291	if (f.file->f_op != &vfio_fops) {
1292		fdput(f);
1293		return -EINVAL;
1294	}
1295
1296	container = f.file->private_data;
1297	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1298
1299	down_write(&container->group_lock);
1300
1301	/* Real groups and fake groups cannot mix */
1302	if (!list_empty(&container->group_list) &&
1303	    container->noiommu != group->noiommu) {
1304		ret = -EPERM;
1305		goto unlock_out;
1306	}
1307
1308	driver = container->iommu_driver;
1309	if (driver) {
1310		ret = driver->ops->attach_group(container->iommu_data,
1311						group->iommu_group);
1312		if (ret)
1313			goto unlock_out;
1314	}
1315
1316	group->container = container;
1317	container->noiommu = group->noiommu;
1318	list_add(&group->container_next, &container->group_list);
1319
1320	/* Get a reference on the container and mark a user within the group */
1321	vfio_container_get(container);
1322	atomic_inc(&group->container_users);
1323
1324unlock_out:
1325	up_write(&container->group_lock);
1326	fdput(f);
1327	return ret;
1328}
1329
1330static bool vfio_group_viable(struct vfio_group *group)
1331{
1332	return (iommu_group_for_each_dev(group->iommu_group,
1333					 group, vfio_dev_viable) == 0);
1334}
1335
1336static int vfio_group_add_container_user(struct vfio_group *group)
1337{
1338	if (!atomic_inc_not_zero(&group->container_users))
1339		return -EINVAL;
1340
1341	if (group->noiommu) {
1342		atomic_dec(&group->container_users);
1343		return -EPERM;
1344	}
1345	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1346		atomic_dec(&group->container_users);
1347		return -EINVAL;
1348	}
1349
1350	return 0;
1351}
1352
1353static const struct file_operations vfio_device_fops;
1354
1355static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1356{
1357	struct vfio_device *device;
1358	struct file *filep;
1359	int ret;
1360
1361	if (0 == atomic_read(&group->container_users) ||
1362	    !group->container->iommu_driver || !vfio_group_viable(group))
1363		return -EINVAL;
1364
1365	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1366		return -EPERM;
1367
1368	device = vfio_device_get_from_name(group, buf);
1369	if (IS_ERR(device))
1370		return PTR_ERR(device);
1371
1372	if (!try_module_get(device->dev->driver->owner)) {
1373		vfio_device_put(device);
1374		return -ENODEV;
1375	}
1376
1377	ret = device->ops->open(device);
1378	if (ret) {
1379		module_put(device->dev->driver->owner);
1380		vfio_device_put(device);
1381		return ret;
1382	}
1383
1384	/*
1385	 * We can't use anon_inode_getfd() because we need to modify
1386	 * the f_mode flags directly to allow more than just ioctls
1387	 */
1388	ret = get_unused_fd_flags(O_CLOEXEC);
1389	if (ret < 0) {
1390		device->ops->release(device);
1391		module_put(device->dev->driver->owner);
1392		vfio_device_put(device);
1393		return ret;
1394	}
1395
1396	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1397				   device, O_RDWR);
1398	if (IS_ERR(filep)) {
1399		put_unused_fd(ret);
1400		ret = PTR_ERR(filep);
1401		device->ops->release(device);
1402		module_put(device->dev->driver->owner);
1403		vfio_device_put(device);
1404		return ret;
1405	}
1406
1407	/*
1408	 * TODO: add an anon_inode interface to do this.
1409	 * Appears to be missing by lack of need rather than
1410	 * explicitly prevented.  Now there's need.
1411	 */
1412	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1413
1414	atomic_inc(&group->container_users);
1415
1416	fd_install(ret, filep);
1417
1418	if (group->noiommu)
1419		dev_warn(device->dev, "vfio-noiommu device opened by user "
1420			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1421
1422	return ret;
1423}
1424
1425static long vfio_group_fops_unl_ioctl(struct file *filep,
1426				      unsigned int cmd, unsigned long arg)
1427{
1428	struct vfio_group *group = filep->private_data;
1429	long ret = -ENOTTY;
1430
1431	switch (cmd) {
1432	case VFIO_GROUP_GET_STATUS:
1433	{
1434		struct vfio_group_status status;
1435		unsigned long minsz;
1436
1437		minsz = offsetofend(struct vfio_group_status, flags);
1438
1439		if (copy_from_user(&status, (void __user *)arg, minsz))
1440			return -EFAULT;
1441
1442		if (status.argsz < minsz)
1443			return -EINVAL;
1444
1445		status.flags = 0;
1446
1447		if (vfio_group_viable(group))
1448			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1449
1450		if (group->container)
1451			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1452
1453		if (copy_to_user((void __user *)arg, &status, minsz))
1454			return -EFAULT;
1455
1456		ret = 0;
1457		break;
1458	}
1459	case VFIO_GROUP_SET_CONTAINER:
1460	{
1461		int fd;
1462
1463		if (get_user(fd, (int __user *)arg))
1464			return -EFAULT;
1465
1466		if (fd < 0)
1467			return -EINVAL;
1468
1469		ret = vfio_group_set_container(group, fd);
1470		break;
1471	}
1472	case VFIO_GROUP_UNSET_CONTAINER:
1473		ret = vfio_group_unset_container(group);
1474		break;
1475	case VFIO_GROUP_GET_DEVICE_FD:
1476	{
1477		char *buf;
1478
1479		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1480		if (IS_ERR(buf))
1481			return PTR_ERR(buf);
1482
1483		ret = vfio_group_get_device_fd(group, buf);
1484		kfree(buf);
1485		break;
1486	}
1487	}
1488
1489	return ret;
1490}
1491
1492static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1493{
1494	struct vfio_group *group;
1495	int opened;
1496
1497	group = vfio_group_get_from_minor(iminor(inode));
1498	if (!group)
1499		return -ENODEV;
1500
1501	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1502		vfio_group_put(group);
1503		return -EPERM;
1504	}
1505
1506	/* Do we need multiple instances of the group open?  Seems not. */
1507	opened = atomic_cmpxchg(&group->opened, 0, 1);
1508	if (opened) {
1509		vfio_group_put(group);
1510		return -EBUSY;
1511	}
1512
1513	/* Is something still in use from a previous open? */
1514	if (group->container) {
1515		atomic_dec(&group->opened);
1516		vfio_group_put(group);
1517		return -EBUSY;
1518	}
1519
 1520	/* Warn if the previous user didn't clean up; re-init to drop stale entries */
1521	if (WARN_ON(group->notifier.head))
1522		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1523
1524	filep->private_data = group;
1525
1526	return 0;
1527}
1528
1529static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1530{
1531	struct vfio_group *group = filep->private_data;
1532
1533	filep->private_data = NULL;
1534
1535	vfio_group_try_dissolve_container(group);
1536
1537	atomic_dec(&group->opened);
1538
1539	vfio_group_put(group);
1540
1541	return 0;
1542}
1543
1544static const struct file_operations vfio_group_fops = {
1545	.owner		= THIS_MODULE,
1546	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1547	.compat_ioctl	= compat_ptr_ioctl,
1548	.open		= vfio_group_fops_open,
1549	.release	= vfio_group_fops_release,
1550};
1551
1552/**
1553 * VFIO Device fd
1554 */
1555static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1556{
1557	struct vfio_device *device = filep->private_data;
1558
1559	device->ops->release(device);
1560
1561	module_put(device->dev->driver->owner);
1562
1563	vfio_group_try_dissolve_container(device->group);
1564
1565	vfio_device_put(device);
1566
1567	return 0;
1568}
1569
1570static long vfio_device_fops_unl_ioctl(struct file *filep,
1571				       unsigned int cmd, unsigned long arg)
1572{
1573	struct vfio_device *device = filep->private_data;
1574
1575	if (unlikely(!device->ops->ioctl))
1576		return -EINVAL;
1577
1578	return device->ops->ioctl(device, cmd, arg);
1579}
1580
1581static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1582				     size_t count, loff_t *ppos)
1583{
1584	struct vfio_device *device = filep->private_data;
1585
1586	if (unlikely(!device->ops->read))
1587		return -EINVAL;
1588
1589	return device->ops->read(device, buf, count, ppos);
1590}
1591
1592static ssize_t vfio_device_fops_write(struct file *filep,
1593				      const char __user *buf,
1594				      size_t count, loff_t *ppos)
1595{
1596	struct vfio_device *device = filep->private_data;
1597
1598	if (unlikely(!device->ops->write))
1599		return -EINVAL;
1600
1601	return device->ops->write(device, buf, count, ppos);
1602}
1603
1604static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1605{
1606	struct vfio_device *device = filep->private_data;
1607
1608	if (unlikely(!device->ops->mmap))
1609		return -EINVAL;
1610
1611	return device->ops->mmap(device, vma);
1612}
1613
1614static const struct file_operations vfio_device_fops = {
1615	.owner		= THIS_MODULE,
1616	.release	= vfio_device_fops_release,
1617	.read		= vfio_device_fops_read,
1618	.write		= vfio_device_fops_write,
1619	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1620	.compat_ioctl	= compat_ptr_ioctl,
1621	.mmap		= vfio_device_fops_mmap,
1622};
1623
1624/**
1625 * External user API, exported as symbols to be linked dynamically.
1626 *
1627 * The protocol includes:
1628 *  1. do normal VFIO init operation:
1629 *	- opening a new container;
1630 *	- attaching group(s) to it;
1631 *	- setting an IOMMU driver for a container.
1632 * When IOMMU is set for a container, all groups in it are
1633 * considered ready to use by an external user.
1634 *
1635 * 2. User space passes a group fd to an external user.
1636 * The external user calls vfio_group_get_external_user()
1637 * to verify that:
1638 *	- the group is initialized;
1639 *	- IOMMU is set for it.
1640 * If both checks pass, vfio_group_get_external_user()
1641 * increments the container user counter to prevent
1642 * the VFIO group from disposal before KVM exits.
1643 *
1644 * 3. The external user calls vfio_external_user_iommu_id()
1645 * to know an IOMMU ID.
1646 *
1647 * 4. When the external KVM finishes, it calls
1648 * vfio_group_put_external_user() to release the VFIO group.
1649 * This call decrements the container user counter.
1650 */
1651struct vfio_group *vfio_group_get_external_user(struct file *filep)
1652{
1653	struct vfio_group *group = filep->private_data;
1654	int ret;
1655
1656	if (filep->f_op != &vfio_group_fops)
1657		return ERR_PTR(-EINVAL);
1658
1659	ret = vfio_group_add_container_user(group);
1660	if (ret)
1661		return ERR_PTR(ret);
1662
1663	vfio_group_get(group);
1664
1665	return group;
1666}
1667EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
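
/*
 * Usage sketch (illustration only, not part of this file): a hypothetical
 * external consumer, in the spirit of the KVM-VFIO device, following the
 * protocol described above.  All example_* names are made up.
 */
static int example_attach_group_fd(int group_fd, struct vfio_group **out)
{
	struct fd f;
	struct vfio_group *group;
	int iommu_id;

	f = fdget(group_fd);
	if (!f.file)
		return -EBADF;

	/* Step 2: verify the group fd and take a container-user reference. */
	group = vfio_group_get_external_user(f.file);
	fdput(f);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* Step 3: learn the IOMMU group ID, e.g. for arch-specific setup. */
	iommu_id = vfio_external_user_iommu_id(group);
	pr_debug("attached to IOMMU group %d\n", iommu_id);

	*out = group;
	return 0;
}

static void example_detach_group(struct vfio_group *group)
{
	/* Step 4: drop the container-user reference when done. */
	vfio_group_put_external_user(group);
}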
1668
1669/**
1670 * External user API, exported as symbols to be linked dynamically.
1671 * The external user passes in a device pointer
1672 * to verify that:
1673 *	- A VFIO group is associated with the device;
1674 *	- IOMMU is set for the group.
1675 * If both checks pass, vfio_group_get_external_user_from_dev()
1676 * increments the container user counter to prevent the VFIO group
1677 * from disposal before the external user exits, and returns the pointer
1678 * to the VFIO group.
1679 *
1680 * When the external user finishes using the VFIO group, it calls
1681 * vfio_group_put_external_user() to release the VFIO group and
1682 * decrement the container user counter.
1683 *
1684 * @dev [in]	: device
1685 * Return error PTR or pointer to VFIO group.
1686 */
1687
1688struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1689{
1690	struct vfio_group *group;
1691	int ret;
1692
1693	group = vfio_group_get_from_dev(dev);
1694	if (!group)
1695		return ERR_PTR(-ENODEV);
1696
1697	ret = vfio_group_add_container_user(group);
1698	if (ret) {
1699		vfio_group_put(group);
1700		return ERR_PTR(ret);
1701	}
1702
1703	return group;
1704}
1705EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
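
/*
 * Usage sketch (illustration only): a hypothetical vendor driver that
 * already holds a struct device * for a VFIO-backed device and wants the
 * group reference without going through a group fd.  The example_* name
 * is made up.
 */
static struct vfio_group *example_get_group_from_dev(struct device *dev)
{
	struct vfio_group *group;

	group = vfio_group_get_external_user_from_dev(dev);
	if (IS_ERR(group))
		return group;

	/* The container is known to be set; its backend can be queried. */
	if (vfio_external_check_extension(group, VFIO_TYPE1_IOMMU) <= 0)
		pr_debug("container does not use the type1 IOMMU backend\n");

	return group;
}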
1706
1707void vfio_group_put_external_user(struct vfio_group *group)
1708{
1709	vfio_group_try_dissolve_container(group);
1710	vfio_group_put(group);
1711}
1712EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1713
1714bool vfio_external_group_match_file(struct vfio_group *test_group,
1715				    struct file *filep)
1716{
1717	struct vfio_group *group = filep->private_data;
1718
1719	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1720}
1721EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1722
1723int vfio_external_user_iommu_id(struct vfio_group *group)
1724{
1725	return iommu_group_id(group->iommu_group);
1726}
1727EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1728
1729long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1730{
1731	return vfio_ioctl_check_extension(group->container, arg);
1732}
1733EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1734
1735/**
1736 * Sub-module support
1737 */
1738/*
1739 * Helper for managing a buffer of info chain capabilities: allocate or
1740 * reallocate the buffer with additional @size, filling in @id and @version
1741 * of the new capability.  A pointer to the new capability is returned.
1742 *
1743 * NB. The chain is based at the head of the buffer, so new entries are
1744 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1745 * next offsets prior to copying to the user buffer.
1746 */
1747struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1748					       size_t size, u16 id, u16 version)
1749{
1750	void *buf;
1751	struct vfio_info_cap_header *header, *tmp;
1752
1753	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1754	if (!buf) {
1755		kfree(caps->buf);
1756		caps->size = 0;
1757		return ERR_PTR(-ENOMEM);
1758	}
1759
1760	caps->buf = buf;
1761	header = buf + caps->size;
1762
1763	/* Eventually copied to user buffer, zero */
1764	memset(header, 0, size);
1765
1766	header->id = id;
1767	header->version = version;
1768
1769	/* Add to the end of the capability chain */
1770	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1771		; /* nothing */
1772
1773	tmp->next = caps->size;
1774	caps->size += size;
1775
1776	return header;
1777}
1778EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1779
1780void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1781{
1782	struct vfio_info_cap_header *tmp;
1783	void *buf = (void *)caps->buf;
1784
1785	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1786		tmp->next += offset;
1787}
1788EXPORT_SYMBOL(vfio_info_cap_shift);
1789
1790int vfio_info_add_capability(struct vfio_info_cap *caps,
1791			     struct vfio_info_cap_header *cap, size_t size)
1792{
1793	struct vfio_info_cap_header *header;
1794
1795	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1796	if (IS_ERR(header))
1797		return PTR_ERR(header);
1798
1799	memcpy(header + 1, cap + 1, size - sizeof(*header));
1800
1801	return 0;
1802}
1803EXPORT_SYMBOL(vfio_info_add_capability);
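
/*
 * Usage sketch (illustration only): how a bus driver's *_GET_INFO ioctl
 * handler might build a capability chain with the helpers above and copy
 * it out behind the fixed-size info structure.  The example_* name and
 * the capability contents are made up for this sketch.
 */
static int example_fill_region_caps(struct vfio_region_info *info,
				    void __user *arg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_type cap_type = {
		.header.id = VFIO_REGION_INFO_CAP_TYPE,
		.header.version = 1,
		.type = 0x1,		/* made-up values */
		.subtype = 0x1,
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap_type.header,
				       sizeof(cap_type));
	if (ret)
		return ret;

	if (caps.size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info->argsz < sizeof(*info) + caps.size) {
			/* Tell userspace how much room the chain needs. */
			info->argsz = sizeof(*info) + caps.size;
			info->cap_offset = 0;
		} else {
			/* The chain lands right after the info struct. */
			vfio_info_cap_shift(&caps, sizeof(*info));
			if (copy_to_user(arg + sizeof(*info),
					 caps.buf, caps.size))
				ret = -EFAULT;
			else
				info->cap_offset = sizeof(*info);
		}
		kfree(caps.buf);
	}

	return ret;
}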
1804
1805int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1806				       int max_irq_type, size_t *data_size)
1807{
1808	unsigned long minsz;
1809	size_t size;
1810
1811	minsz = offsetofend(struct vfio_irq_set, count);
1812
1813	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1814	    (hdr->count >= (U32_MAX - hdr->start)) ||
1815	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1816				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1817		return -EINVAL;
1818
1819	if (data_size)
1820		*data_size = 0;
1821
1822	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1823		return -EINVAL;
1824
1825	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1826	case VFIO_IRQ_SET_DATA_NONE:
1827		size = 0;
1828		break;
1829	case VFIO_IRQ_SET_DATA_BOOL:
1830		size = sizeof(uint8_t);
1831		break;
1832	case VFIO_IRQ_SET_DATA_EVENTFD:
1833		size = sizeof(int32_t);
1834		break;
1835	default:
1836		return -EINVAL;
1837	}
1838
1839	if (size) {
1840		if (hdr->argsz - minsz < hdr->count * size)
1841			return -EINVAL;
1842
1843		if (!data_size)
1844			return -EINVAL;
1845
1846		*data_size = hdr->count * size;
1847	}
1848
1849	return 0;
1850}
1851EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1852
1853/*
1854 * Pin a set of guest PFNs and return their associated host PFNs for local
1855 * domain only.
1856 * @dev [in]     : device
1857 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1858 * @npage [in]   : count of elements in user_pfn array.  This count should not
1859 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1860 * @prot [in]    : protection flags
1861 * @phys_pfn[out]: array of host PFNs
1862 * Return error or number of pages pinned.
1863 */
1864int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1865		   int prot, unsigned long *phys_pfn)
1866{
1867	struct vfio_container *container;
1868	struct vfio_group *group;
1869	struct vfio_iommu_driver *driver;
1870	int ret;
1871
1872	if (!dev || !user_pfn || !phys_pfn || !npage)
1873		return -EINVAL;
1874
1875	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1876		return -E2BIG;
1877
1878	group = vfio_group_get_from_dev(dev);
1879	if (!group)
1880		return -ENODEV;
1881
1882	if (group->dev_counter > 1) {
1883		ret = -EINVAL;
1884		goto err_pin_pages;
1885	}
1886
1887	ret = vfio_group_add_container_user(group);
1888	if (ret)
1889		goto err_pin_pages;
1890
1891	container = group->container;
1892	driver = container->iommu_driver;
1893	if (likely(driver && driver->ops->pin_pages))
1894		ret = driver->ops->pin_pages(container->iommu_data,
1895					     group->iommu_group, user_pfn,
1896					     npage, prot, phys_pfn);
1897	else
1898		ret = -ENOTTY;
1899
1900	vfio_group_try_dissolve_container(group);
1901
1902err_pin_pages:
1903	vfio_group_put(group);
1904	return ret;
1905}
1906EXPORT_SYMBOL(vfio_pin_pages);
1907
1908/*
1909 * Unpin a set of host PFNs for the local domain only.
1910 * @dev [in]     : device
1911 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1912 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1913 * @npage [in]   : count of elements in user_pfn array.  This count should not
1914 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1915 * Return error or number of pages unpinned.
1916 */
1917int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1918{
1919	struct vfio_container *container;
1920	struct vfio_group *group;
1921	struct vfio_iommu_driver *driver;
1922	int ret;
1923
1924	if (!dev || !user_pfn || !npage)
1925		return -EINVAL;
1926
1927	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1928		return -E2BIG;
1929
1930	group = vfio_group_get_from_dev(dev);
1931	if (!group)
1932		return -ENODEV;
1933
1934	ret = vfio_group_add_container_user(group);
1935	if (ret)
1936		goto err_unpin_pages;
1937
1938	container = group->container;
1939	driver = container->iommu_driver;
1940	if (likely(driver && driver->ops->unpin_pages))
1941		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1942					       npage);
1943	else
1944		ret = -ENOTTY;
1945
1946	vfio_group_try_dissolve_container(group);
1947
1948err_unpin_pages:
1949	vfio_group_put(group);
1950	return ret;
1951}
1952EXPORT_SYMBOL(vfio_unpin_pages);
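
/*
 * Usage sketch (illustration only): a hypothetical mdev vendor driver
 * pinning a handful of guest PFNs before programming its hardware with
 * the returned host PFNs, unwinding on partial failure.  The example_*
 * name is made up.
 */
static int example_pin_guest_pages(struct device *dev,
				   unsigned long *gfns, int count,
				   unsigned long *hpfns)
{
	int ret;

	ret = vfio_pin_pages(dev, gfns, count, IOMMU_READ | IOMMU_WRITE,
			     hpfns);
	if (ret != count) {
		/* Fewer pages pinned than requested, or an error code. */
		if (ret > 0)
			vfio_unpin_pages(dev, gfns, ret);
		return ret < 0 ? ret : -EFAULT;
	}

	return 0;
}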
1953
1954/*
1955 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
1956 * VFIO group.
1957 *
1958 * The caller needs to call vfio_group_get_external_user() or
1959 * vfio_group_get_external_user_from_dev() prior to calling this interface,
1960 * so as to prevent the VFIO group from disposal in the middle of the call.
1961 * But it can keep the reference to the VFIO group for several calls into
1962 * this interface.
1963 * After finishing using the VFIO group, the caller needs to release the
1964 * VFIO group by calling vfio_group_put_external_user().
1965 *
1966 * @group [in]		: VFIO group
1967 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
1968 * @npage [in]		: count of elements in user_iova_pfn array.
1969 *			  This count should not be greater than
1970 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
1971 * @prot [in]		: protection flags
1972 * @phys_pfn [out]	: array of host PFNs
1973 * Return error or number of pages pinned.
1974 */
1975int vfio_group_pin_pages(struct vfio_group *group,
1976			 unsigned long *user_iova_pfn, int npage,
1977			 int prot, unsigned long *phys_pfn)
1978{
1979	struct vfio_container *container;
1980	struct vfio_iommu_driver *driver;
1981	int ret;
1982
1983	if (!group || !user_iova_pfn || !phys_pfn || !npage)
1984		return -EINVAL;
1985
1986	if (group->dev_counter > 1)
1987		return -EINVAL;
1988
1989	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1990		return -E2BIG;
1991
1992	container = group->container;
1993	driver = container->iommu_driver;
1994	if (likely(driver && driver->ops->pin_pages))
1995		ret = driver->ops->pin_pages(container->iommu_data,
1996					     group->iommu_group, user_iova_pfn,
1997					     npage, prot, phys_pfn);
1998	else
1999		ret = -ENOTTY;
2000
2001	return ret;
2002}
2003EXPORT_SYMBOL(vfio_group_pin_pages);
2004
2005/*
2006 * Unpin a set of guest IOVA PFNs for a VFIO group.
2007 *
2008 * The caller needs to call vfio_group_get_external_user() or
2009 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2010 * so as to prevent the VFIO group from disposal in the middle of the call.
2011 * But it can keep the reference to the VFIO group for several calls into
2012 * this interface.
2013 * After finishing using the VFIO group, the caller needs to release the
2014 * VFIO group by calling vfio_group_put_external_user().
2015 *
2016 * @group [in]		: vfio group
2017 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
2018 * @npage [in]		: count of elements in user_iova_pfn array.
2019 *			  This count should not be greater than
2020 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2021 * Return error or number of pages unpinned.
2022 */
2023int vfio_group_unpin_pages(struct vfio_group *group,
2024			   unsigned long *user_iova_pfn, int npage)
2025{
2026	struct vfio_container *container;
2027	struct vfio_iommu_driver *driver;
2028	int ret;
2029
2030	if (!group || !user_iova_pfn || !npage)
2031		return -EINVAL;
2032
2033	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2034		return -E2BIG;
2035
2036	container = group->container;
2037	driver = container->iommu_driver;
2038	if (likely(driver && driver->ops->unpin_pages))
2039		ret = driver->ops->unpin_pages(container->iommu_data,
2040					       user_iova_pfn, npage);
2041	else
2042		ret = -ENOTTY;
2043
2044	return ret;
2045}
2046EXPORT_SYMBOL(vfio_group_unpin_pages);
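
/*
 * Usage sketch (illustration only): pinning through a held group
 * reference rather than a device pointer, honoring the requirement above
 * that an external-user reference be taken first.  The example_* name is
 * made up.
 */
static int example_group_pin_once(struct file *group_file,
				  unsigned long *iova_pfns, int count,
				  unsigned long *hpfns)
{
	struct vfio_group *group;
	int ret;

	/* Hold the group so the container cannot go away underneath us. */
	group = vfio_group_get_external_user(group_file);
	if (IS_ERR(group))
		return PTR_ERR(group);

	ret = vfio_group_pin_pages(group, iova_pfns, count,
				   IOMMU_READ | IOMMU_WRITE, hpfns);
	if (ret > 0) {
		/* ... use the host PFNs here ... */
		vfio_group_unpin_pages(group, iova_pfns, ret);
	}

	vfio_group_put_external_user(group);
	return ret < 0 ? ret : 0;
}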
2047
2048
2049/*
2050 * This interface allows the CPUs to perform some sort of virtual DMA on
2051 * behalf of the device.
2052 *
2053 * CPUs read from or write into a range of IOVAs pointing to user space
2054 * memory, using a kernel buffer as the destination or source.
2055 *
2056 * As the read/write of user space memory is conducted via the CPUs and is
2057 * not a real device DMA, it is not necessary to pin the user space memory.
2058 *
2059 * The caller needs to call vfio_group_get_external_user() or
2060 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2061 * so as to prevent the VFIO group from disposal in the middle of the call.
2062 * But it can keep the reference to the VFIO group for several calls into
2063 * this interface.
2064 * After finishing using the VFIO group, the caller needs to release the
2065 * VFIO group by calling vfio_group_put_external_user().
2066 *
2067 * @group [in]		: VFIO group
2068 * @user_iova [in]	: base IOVA of a user space buffer
2069 * @data [in]		: pointer to kernel buffer
2070 * @len [in]		: kernel buffer length
2071 * @write [in]	: indicate read or write
2072 * Return error code on failure or 0 on success.
2073 */
2074int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2075		void *data, size_t len, bool write)
2076{
2077	struct vfio_container *container;
2078	struct vfio_iommu_driver *driver;
2079	int ret = 0;
2080
2081	if (!group || !data || len <= 0)
2082		return -EINVAL;
2083
2084	container = group->container;
2085	driver = container->iommu_driver;
2086
2087	if (likely(driver && driver->ops->dma_rw))
2088		ret = driver->ops->dma_rw(container->iommu_data,
2089					  user_iova, data, len, write);
2090	else
2091		ret = -ENOTTY;
2092
2093	return ret;
2094}
2095EXPORT_SYMBOL(vfio_dma_rw);
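
/*
 * Usage sketch (illustration only): reading a small guest structure
 * through the container's mappings without pinning, as described above.
 * The caller is assumed to already hold an external-user group reference;
 * the example_* name is made up.
 */
static int example_read_guest_u64(struct vfio_group *group,
				  dma_addr_t iova, u64 *val)
{
	return vfio_dma_rw(group, iova, val, sizeof(*val), false);
}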
2096
2097static int vfio_register_iommu_notifier(struct vfio_group *group,
2098					unsigned long *events,
2099					struct notifier_block *nb)
2100{
2101	struct vfio_container *container;
2102	struct vfio_iommu_driver *driver;
2103	int ret;
2104
2105	ret = vfio_group_add_container_user(group);
2106	if (ret)
2107		return -EINVAL;
2108
2109	container = group->container;
2110	driver = container->iommu_driver;
2111	if (likely(driver && driver->ops->register_notifier))
2112		ret = driver->ops->register_notifier(container->iommu_data,
2113						     events, nb);
2114	else
2115		ret = -ENOTTY;
2116
2117	vfio_group_try_dissolve_container(group);
2118
2119	return ret;
2120}
2121
2122static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2123					  struct notifier_block *nb)
2124{
2125	struct vfio_container *container;
2126	struct vfio_iommu_driver *driver;
2127	int ret;
2128
2129	ret = vfio_group_add_container_user(group);
2130	if (ret)
2131		return -EINVAL;
2132
2133	container = group->container;
2134	driver = container->iommu_driver;
2135	if (likely(driver && driver->ops->unregister_notifier))
2136		ret = driver->ops->unregister_notifier(container->iommu_data,
2137						       nb);
2138	else
2139		ret = -ENOTTY;
2140
2141	vfio_group_try_dissolve_container(group);
2142
2143	return ret;
2144}
2145
2146void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2147{
2148	group->kvm = kvm;
2149	blocking_notifier_call_chain(&group->notifier,
2150				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2151}
2152EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2153
2154static int vfio_register_group_notifier(struct vfio_group *group,
2155					unsigned long *events,
2156					struct notifier_block *nb)
2157{
2158	int ret;
2159	bool set_kvm = false;
2160
2161	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2162		set_kvm = true;
2163
2164	/* clear known events */
2165	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2166
2167	/* refuse to continue if any unknown events remain */
2168	if (*events)
2169		return -EINVAL;
2170
2171	ret = vfio_group_add_container_user(group);
2172	if (ret)
2173		return -EINVAL;
2174
2175	ret = blocking_notifier_chain_register(&group->notifier, nb);
2176
2177	/*
2178	 * The attaching of kvm and vfio_group might have already happened, so
2179	 * here we replay it once upon registration.
2180	 */
2181	if (!ret && set_kvm && group->kvm)
2182		blocking_notifier_call_chain(&group->notifier,
2183					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2184
2185	vfio_group_try_dissolve_container(group);
2186
2187	return ret;
2188}
2189
2190static int vfio_unregister_group_notifier(struct vfio_group *group,
2191					 struct notifier_block *nb)
2192{
2193	int ret;
2194
2195	ret = vfio_group_add_container_user(group);
2196	if (ret)
2197		return -EINVAL;
2198
2199	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2200
2201	vfio_group_try_dissolve_container(group);
2202
2203	return ret;
2204}
2205
2206int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2207			   unsigned long *events, struct notifier_block *nb)
2208{
2209	struct vfio_group *group;
2210	int ret;
2211
2212	if (!dev || !nb || !events || (*events == 0))
2213		return -EINVAL;
2214
2215	group = vfio_group_get_from_dev(dev);
2216	if (!group)
2217		return -ENODEV;
2218
2219	switch (type) {
2220	case VFIO_IOMMU_NOTIFY:
2221		ret = vfio_register_iommu_notifier(group, events, nb);
2222		break;
2223	case VFIO_GROUP_NOTIFY:
2224		ret = vfio_register_group_notifier(group, events, nb);
2225		break;
2226	default:
2227		ret = -EINVAL;
2228	}
2229
2230	vfio_group_put(group);
2231	return ret;
2232}
2233EXPORT_SYMBOL(vfio_register_notifier);
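
/*
 * Usage sketch (illustration only): a hypothetical vendor driver watching
 * for DMA unmaps so it can drop any pinnings covering the unmapped range.
 * The example_* names are made up; the notifier data for
 * VFIO_IOMMU_NOTIFY_DMA_UNMAP is a struct vfio_iommu_type1_dma_unmap.
 */
static int example_iommu_notify(struct notifier_block *nb,
				unsigned long action, void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		pr_debug("unmap iova 0x%llx size 0x%llx\n",
			 (unsigned long long)unmap->iova,
			 (unsigned long long)unmap->size);
		/* ... unpin any pages within [iova, iova + size) ... */
	}

	return NOTIFY_OK;
}

static int example_register_unmap_notifier(struct device *dev,
					    struct notifier_block *nb)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	nb->notifier_call = example_iommu_notify;
	return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, nb);
}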
2234
2235int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2236			     struct notifier_block *nb)
2237{
2238	struct vfio_group *group;
2239	int ret;
2240
2241	if (!dev || !nb)
2242		return -EINVAL;
2243
2244	group = vfio_group_get_from_dev(dev);
2245	if (!group)
2246		return -ENODEV;
2247
2248	switch (type) {
2249	case VFIO_IOMMU_NOTIFY:
2250		ret = vfio_unregister_iommu_notifier(group, nb);
2251		break;
2252	case VFIO_GROUP_NOTIFY:
2253		ret = vfio_unregister_group_notifier(group, nb);
2254		break;
2255	default:
2256		ret = -EINVAL;
2257	}
2258
2259	vfio_group_put(group);
2260	return ret;
2261}
2262EXPORT_SYMBOL(vfio_unregister_notifier);
2263
2264struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2265{
2266	struct vfio_container *container;
2267	struct vfio_iommu_driver *driver;
2268
2269	if (!group)
2270		return ERR_PTR(-EINVAL);
2271
2272	container = group->container;
2273	driver = container->iommu_driver;
2274	if (likely(driver && driver->ops->group_iommu_domain))
2275		return driver->ops->group_iommu_domain(container->iommu_data,
2276						       group->iommu_group);
2277
2278	return ERR_PTR(-ENOTTY);
2279}
2280EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2281
2282/**
2283 * Module/class support
2284 */
2285static char *vfio_devnode(struct device *dev, umode_t *mode)
2286{
2287	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2288}
2289
2290static struct miscdevice vfio_dev = {
2291	.minor = VFIO_MINOR,
2292	.name = "vfio",
2293	.fops = &vfio_fops,
2294	.nodename = "vfio/vfio",
2295	.mode = S_IRUGO | S_IWUGO,
2296};
2297
2298static int __init vfio_init(void)
2299{
2300	int ret;
2301
2302	idr_init(&vfio.group_idr);
2303	mutex_init(&vfio.group_lock);
2304	mutex_init(&vfio.iommu_drivers_lock);
2305	INIT_LIST_HEAD(&vfio.group_list);
2306	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2307
2308	ret = misc_register(&vfio_dev);
2309	if (ret) {
2310		pr_err("vfio: misc device register failed\n");
2311		return ret;
2312	}
2313
2314	/* /dev/vfio/$GROUP */
2315	vfio.class = class_create(THIS_MODULE, "vfio");
2316	if (IS_ERR(vfio.class)) {
2317		ret = PTR_ERR(vfio.class);
2318		goto err_class;
2319	}
2320
2321	vfio.class->devnode = vfio_devnode;
2322
2323	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2324	if (ret)
2325		goto err_alloc_chrdev;
2326
2327	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2328	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2329	if (ret)
2330		goto err_cdev_add;
2331
2332	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2333
2334#ifdef CONFIG_VFIO_NOIOMMU
2335	vfio_register_iommu_driver(&vfio_noiommu_ops);
2336#endif
2337	return 0;
2338
2339err_cdev_add:
2340	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2341err_alloc_chrdev:
2342	class_destroy(vfio.class);
2343	vfio.class = NULL;
2344err_class:
2345	misc_deregister(&vfio_dev);
2346	return ret;
2347}
2348
2349static void __exit vfio_cleanup(void)
2350{
2351	WARN_ON(!list_empty(&vfio.group_list));
2352
2353#ifdef CONFIG_VFIO_NOIOMMU
2354	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2355#endif
2356	idr_destroy(&vfio.group_idr);
2357	cdev_del(&vfio.group_cdev);
2358	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2359	class_destroy(vfio.class);
2360	vfio.class = NULL;
2361	misc_deregister(&vfio_dev);
2362}
2363
2364module_init(vfio_init);
2365module_exit(vfio_cleanup);
2366
2367MODULE_VERSION(DRIVER_VERSION);
2368MODULE_LICENSE("GPL v2");
2369MODULE_AUTHOR(DRIVER_AUTHOR);
2370MODULE_DESCRIPTION(DRIVER_DESC);
2371MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2372MODULE_ALIAS("devname:vfio/vfio");
2373MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");