Loading...
1/*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16#include <linux/cdev.h>
17#include <linux/compat.h>
18#include <linux/device.h>
19#include <linux/file.h>
20#include <linux/anon_inodes.h>
21#include <linux/fs.h>
22#include <linux/idr.h>
23#include <linux/iommu.h>
24#include <linux/list.h>
25#include <linux/miscdevice.h>
26#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/pci.h>
29#include <linux/rwsem.h>
30#include <linux/sched.h>
31#include <linux/slab.h>
32#include <linux/stat.h>
33#include <linux/string.h>
34#include <linux/uaccess.h>
35#include <linux/vfio.h>
36#include <linux/wait.h>
37
38#define DRIVER_VERSION "0.3"
39#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40#define DRIVER_DESC "VFIO - User Level meta-driver"
41
42static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52} vfio;
53
54struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
57};
58
59struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
66};
67
68struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
71};
72
73struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 bool noiommu;
89};
90
91struct vfio_device {
92 struct kref kref;
93 struct device *dev;
94 const struct vfio_device_ops *ops;
95 struct vfio_group *group;
96 struct list_head group_next;
97 void *device_data;
98};
99
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Runtime switch for no-IOMMU mode.  Its address also serves as the
 * iommudata tag that identifies vfio-noiommu placeholder groups.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode,
		 "Enable UNSAFE, no-IOMMU mode. "
		 "This mode provides no device isolation, no DMA translation, "
		 "no host kernel protection, cannot be used for device assignment "
		 "to virtual machines, requires RAWIO permissions, and will taint "
		 "the kernel. If you do not know what this is for, step away. "
		 "(default: false)");
#endif
106
107/*
108 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
109 * and remove functions, any use cases other than acquiring the first
110 * reference for the purpose of calling vfio_add_group_dev() or removing
111 * that symmetric reference after vfio_del_group_dev() should use the raw
112 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
113 * removes the device from the dummy group and cannot be nested.
114 */
115struct iommu_group *vfio_iommu_group_get(struct device *dev)
116{
117 struct iommu_group *group;
118 int __maybe_unused ret;
119
120 group = iommu_group_get(dev);
121
122#ifdef CONFIG_VFIO_NOIOMMU
123 /*
124 * With noiommu enabled, an IOMMU group will be created for a device
125 * that doesn't already have one and doesn't have an iommu_ops on their
126 * bus. We set iommudata simply to be able to identify these groups
127 * as special use and for reclamation later.
128 */
129 if (group || !noiommu || iommu_present(dev->bus))
130 return group;
131
132 group = iommu_group_alloc();
133 if (IS_ERR(group))
134 return NULL;
135
136 iommu_group_set_name(group, "vfio-noiommu");
137 iommu_group_set_iommudata(group, &noiommu, NULL);
138 ret = iommu_group_add_device(group, dev);
139 iommu_group_put(group);
140 if (ret)
141 return NULL;
142
143 /*
144 * Where to taint? At this point we've added an IOMMU group for a
145 * device that is not backed by iommu_ops, therefore any iommu_
146 * callback using iommu_ops can legitimately Oops. So, while we may
147 * be about to give a DMA capable device to a user without IOMMU
148 * protection, which is clearly taint-worthy, let's go ahead and do
149 * it here.
150 */
151 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
152 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
153#endif
154
155 return group;
156}
157EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
158
/*
 * Release a group reference obtained from vfio_iommu_group_get().  For a
 * vfio-noiommu placeholder group (recognized by its iommudata tag) the
 * device is removed from the group first.
 */
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	bool placeholder = (iommu_group_get_iommudata(group) == &noiommu);

	if (placeholder)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
169
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * open() for the no-IOMMU backend.  Only the VFIO_NOIOMMU_IOMMU type is
 * accepted and the caller needs CAP_SYS_RAWIO.  There is no per-container
 * state, hence the NULL cookie on success.
 */
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);

	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

/* Nothing was allocated in open(), so there is nothing to release. */
static void vfio_noiommu_release(void *iommu_data)
{
}

/*
 * The only ioctl understood is VFIO_CHECK_EXTENSION, which reports 1 for
 * VFIO_NOIOMMU_IOMMU while the noiommu module parameter is set and 0
 * otherwise.  Everything else yields -ENOTTY.
 */
static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd != VFIO_CHECK_EXTENSION)
		return -ENOTTY;

	if (noiommu && arg == VFIO_NOIOMMU_IOMMU)
		return 1;

	return 0;
}

/* Accept only groups carrying the vfio-noiommu iommudata tag. */
static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	if (iommu_group_get_iommudata(iommu_group) != &noiommu)
		return -EINVAL;

	return 0;
}

/* Attaching keeps no state, so detaching has nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

/* Callback table of the built-in no-IOMMU backend. */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name		= "vfio-noiommu",
	.owner		= THIS_MODULE,
	.open		= vfio_noiommu_open,
	.release	= vfio_noiommu_release,
	.ioctl		= vfio_noiommu_ioctl,
	.attach_group	= vfio_noiommu_attach_group,
	.detach_group	= vfio_noiommu_detach_group,
};
#endif
215
216
217/**
218 * IOMMU driver registration
219 */
220int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
221{
222 struct vfio_iommu_driver *driver, *tmp;
223
224 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
225 if (!driver)
226 return -ENOMEM;
227
228 driver->ops = ops;
229
230 mutex_lock(&vfio.iommu_drivers_lock);
231
232 /* Check for duplicates */
233 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
234 if (tmp->ops == ops) {
235 mutex_unlock(&vfio.iommu_drivers_lock);
236 kfree(driver);
237 return -EINVAL;
238 }
239 }
240
241 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
242
243 mutex_unlock(&vfio.iommu_drivers_lock);
244
245 return 0;
246}
247EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
248
249void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
250{
251 struct vfio_iommu_driver *driver;
252
253 mutex_lock(&vfio.iommu_drivers_lock);
254 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
255 if (driver->ops == ops) {
256 list_del(&driver->vfio_next);
257 mutex_unlock(&vfio.iommu_drivers_lock);
258 kfree(driver);
259 return;
260 }
261 }
262 mutex_unlock(&vfio.iommu_drivers_lock);
263}
264EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
265
266/**
267 * Group minor allocation/free - both called with vfio.group_lock held
268 */
269static int vfio_alloc_group_minor(struct vfio_group *group)
270{
271 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
272}
273
274static void vfio_free_group_minor(int minor)
275{
276 idr_remove(&vfio.group_idr, minor);
277}
278
/* Forward declarations: both are needed by vfio_create_group() below. */
static void vfio_group_get(struct vfio_group *group);
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
282
283/**
284 * Container objects - containers are created when /dev/vfio/vfio is
285 * opened, but their lifecycle extends until the last user is done, so
286 * it's freed via kref. Must support container/group/device being
287 * closed in any order.
288 */
289static void vfio_container_get(struct vfio_container *container)
290{
291 kref_get(&container->kref);
292}
293
294static void vfio_container_release(struct kref *kref)
295{
296 struct vfio_container *container;
297 container = container_of(kref, struct vfio_container, kref);
298
299 kfree(container);
300}
301
302static void vfio_container_put(struct vfio_container *container)
303{
304 kref_put(&container->kref, vfio_container_release);
305}
306
307static void vfio_group_unlock_and_free(struct vfio_group *group)
308{
309 mutex_unlock(&vfio.group_lock);
310 /*
311 * Unregister outside of lock. A spurious callback is harmless now
312 * that the group is no longer in vfio.group_list.
313 */
314 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
315 kfree(group);
316}
317
318/**
319 * Group objects - create, release, get, put, search
320 */
321static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
322{
323 struct vfio_group *group, *tmp;
324 struct device *dev;
325 int ret, minor;
326
327 group = kzalloc(sizeof(*group), GFP_KERNEL);
328 if (!group)
329 return ERR_PTR(-ENOMEM);
330
331 kref_init(&group->kref);
332 INIT_LIST_HEAD(&group->device_list);
333 mutex_init(&group->device_lock);
334 INIT_LIST_HEAD(&group->unbound_list);
335 mutex_init(&group->unbound_lock);
336 atomic_set(&group->container_users, 0);
337 atomic_set(&group->opened, 0);
338 group->iommu_group = iommu_group;
339#ifdef CONFIG_VFIO_NOIOMMU
340 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
341#endif
342
343 group->nb.notifier_call = vfio_iommu_group_notifier;
344
345 /*
346 * blocking notifiers acquire a rwsem around registering and hold
347 * it around callback. Therefore, need to register outside of
348 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
349 * do anything unless it can find the group in vfio.group_list, so
350 * no harm in registering early.
351 */
352 ret = iommu_group_register_notifier(iommu_group, &group->nb);
353 if (ret) {
354 kfree(group);
355 return ERR_PTR(ret);
356 }
357
358 mutex_lock(&vfio.group_lock);
359
360 /* Did we race creating this group? */
361 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
362 if (tmp->iommu_group == iommu_group) {
363 vfio_group_get(tmp);
364 vfio_group_unlock_and_free(group);
365 return tmp;
366 }
367 }
368
369 minor = vfio_alloc_group_minor(group);
370 if (minor < 0) {
371 vfio_group_unlock_and_free(group);
372 return ERR_PTR(minor);
373 }
374
375 dev = device_create(vfio.class, NULL,
376 MKDEV(MAJOR(vfio.group_devt), minor),
377 group, "%s%d", group->noiommu ? "noiommu-" : "",
378 iommu_group_id(iommu_group));
379 if (IS_ERR(dev)) {
380 vfio_free_group_minor(minor);
381 vfio_group_unlock_and_free(group);
382 return (struct vfio_group *)dev; /* ERR_PTR */
383 }
384
385 group->minor = minor;
386 group->dev = dev;
387
388 list_add(&group->vfio_next, &vfio.group_list);
389
390 mutex_unlock(&vfio.group_lock);
391
392 return group;
393}
394
395/* called with vfio.group_lock held */
396static void vfio_group_release(struct kref *kref)
397{
398 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
399 struct vfio_unbound_dev *unbound, *tmp;
400 struct iommu_group *iommu_group = group->iommu_group;
401
402 WARN_ON(!list_empty(&group->device_list));
403
404 list_for_each_entry_safe(unbound, tmp,
405 &group->unbound_list, unbound_next) {
406 list_del(&unbound->unbound_next);
407 kfree(unbound);
408 }
409
410 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
411 list_del(&group->vfio_next);
412 vfio_free_group_minor(group->minor);
413 vfio_group_unlock_and_free(group);
414 iommu_group_put(iommu_group);
415}
416
417static void vfio_group_put(struct vfio_group *group)
418{
419 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
420}
421
422/* Assume group_lock or group reference is held */
423static void vfio_group_get(struct vfio_group *group)
424{
425 kref_get(&group->kref);
426}
427
428/*
429 * Not really a try as we will sleep for mutex, but we need to make
430 * sure the group pointer is valid under lock and get a reference.
431 */
432static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
433{
434 struct vfio_group *target = group;
435
436 mutex_lock(&vfio.group_lock);
437 list_for_each_entry(group, &vfio.group_list, vfio_next) {
438 if (group == target) {
439 vfio_group_get(group);
440 mutex_unlock(&vfio.group_lock);
441 return group;
442 }
443 }
444 mutex_unlock(&vfio.group_lock);
445
446 return NULL;
447}
448
449static
450struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
451{
452 struct vfio_group *group;
453
454 mutex_lock(&vfio.group_lock);
455 list_for_each_entry(group, &vfio.group_list, vfio_next) {
456 if (group->iommu_group == iommu_group) {
457 vfio_group_get(group);
458 mutex_unlock(&vfio.group_lock);
459 return group;
460 }
461 }
462 mutex_unlock(&vfio.group_lock);
463
464 return NULL;
465}
466
467static struct vfio_group *vfio_group_get_from_minor(int minor)
468{
469 struct vfio_group *group;
470
471 mutex_lock(&vfio.group_lock);
472 group = idr_find(&vfio.group_idr, minor);
473 if (!group) {
474 mutex_unlock(&vfio.group_lock);
475 return NULL;
476 }
477 vfio_group_get(group);
478 mutex_unlock(&vfio.group_lock);
479
480 return group;
481}
482
483/**
484 * Device objects - create, release, get, put, search
485 */
486static
487struct vfio_device *vfio_group_create_device(struct vfio_group *group,
488 struct device *dev,
489 const struct vfio_device_ops *ops,
490 void *device_data)
491{
492 struct vfio_device *device;
493
494 device = kzalloc(sizeof(*device), GFP_KERNEL);
495 if (!device)
496 return ERR_PTR(-ENOMEM);
497
498 kref_init(&device->kref);
499 device->dev = dev;
500 device->group = group;
501 device->ops = ops;
502 device->device_data = device_data;
503 dev_set_drvdata(dev, device);
504
505 /* No need to get group_lock, caller has group reference */
506 vfio_group_get(group);
507
508 mutex_lock(&group->device_lock);
509 list_add(&device->group_next, &group->device_list);
510 mutex_unlock(&group->device_lock);
511
512 return device;
513}
514
515static void vfio_device_release(struct kref *kref)
516{
517 struct vfio_device *device = container_of(kref,
518 struct vfio_device, kref);
519 struct vfio_group *group = device->group;
520
521 list_del(&device->group_next);
522 mutex_unlock(&group->device_lock);
523
524 dev_set_drvdata(device->dev, NULL);
525
526 kfree(device);
527
528 /* vfio_del_group_dev may be waiting for this device */
529 wake_up(&vfio.release_q);
530}
531
532/* Device reference always implies a group reference */
533void vfio_device_put(struct vfio_device *device)
534{
535 struct vfio_group *group = device->group;
536 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
537 vfio_group_put(group);
538}
539EXPORT_SYMBOL_GPL(vfio_device_put);
540
541static void vfio_device_get(struct vfio_device *device)
542{
543 vfio_group_get(device->group);
544 kref_get(&device->kref);
545}
546
547static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
548 struct device *dev)
549{
550 struct vfio_device *device;
551
552 mutex_lock(&group->device_lock);
553 list_for_each_entry(device, &group->device_list, group_next) {
554 if (device->dev == dev) {
555 vfio_device_get(device);
556 mutex_unlock(&group->device_lock);
557 return device;
558 }
559 }
560 mutex_unlock(&group->device_lock);
561 return NULL;
562}
563
564/*
565 * Some drivers, like pci-stub, are only used to prevent other drivers from
566 * claiming a device and are therefore perfectly legitimate for a user owned
567 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
568 * of the device, but it does prevent the user from having direct access to
569 * the device, which is useful in some circumstances.
570 *
571 * We also assume that we can include PCI interconnect devices, ie. bridges.
572 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
573 * then all of the downstream devices will be part of the same IOMMU group as
574 * the bridge. Thus, if placing the bridge into the user owned IOVA space
575 * breaks anything, it only does so for user owned devices downstream. Note
576 * that error notification via MSI can be affected for platforms that handle
577 * MSI within the same IOVA space as DMA.
578 */
579static const char * const vfio_driver_whitelist[] = { "pci-stub" };
580
581static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
582{
583 int i;
584
585 if (dev_is_pci(dev)) {
586 struct pci_dev *pdev = to_pci_dev(dev);
587
588 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
589 return true;
590 }
591
592 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
593 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
594 return true;
595 }
596
597 return false;
598}
599
600/*
601 * A vfio group is viable for use by userspace if all devices are in
602 * one of the following states:
603 * - driver-less
604 * - bound to a vfio driver
605 * - bound to a whitelisted driver
606 * - a PCI interconnect device
607 *
608 * We use two methods to determine whether a device is bound to a vfio
609 * driver. The first is to test whether the device exists in the vfio
610 * group. The second is to test if the device exists on the group
611 * unbound_list, indicating it's in the middle of transitioning from
612 * a vfio driver to driver-less.
613 */
614static int vfio_dev_viable(struct device *dev, void *data)
615{
616 struct vfio_group *group = data;
617 struct vfio_device *device;
618 struct device_driver *drv = ACCESS_ONCE(dev->driver);
619 struct vfio_unbound_dev *unbound;
620 int ret = -EINVAL;
621
622 mutex_lock(&group->unbound_lock);
623 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
624 if (dev == unbound->dev) {
625 ret = 0;
626 break;
627 }
628 }
629 mutex_unlock(&group->unbound_lock);
630
631 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
632 return 0;
633
634 device = vfio_group_get_device(group, dev);
635 if (device) {
636 vfio_device_put(device);
637 return 0;
638 }
639
640 return ret;
641}
642
643/**
644 * Async device support
645 */
646static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
647{
648 struct vfio_device *device;
649
650 /* Do we already know about it? We shouldn't */
651 device = vfio_group_get_device(group, dev);
652 if (WARN_ON_ONCE(device)) {
653 vfio_device_put(device);
654 return 0;
655 }
656
657 /* Nothing to do for idle groups */
658 if (!atomic_read(&group->container_users))
659 return 0;
660
661 /* TODO Prevent device auto probing */
662 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
663 iommu_group_id(group->iommu_group));
664
665 return 0;
666}
667
668static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
669{
670 /* We don't care what happens when the group isn't in use */
671 if (!atomic_read(&group->container_users))
672 return 0;
673
674 return vfio_dev_viable(dev, group);
675}
676
677static int vfio_iommu_group_notifier(struct notifier_block *nb,
678 unsigned long action, void *data)
679{
680 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
681 struct device *dev = data;
682 struct vfio_unbound_dev *unbound;
683
684 /*
685 * Need to go through a group_lock lookup to get a reference or we
686 * risk racing a group being removed. Ignore spurious notifies.
687 */
688 group = vfio_group_try_get(group);
689 if (!group)
690 return NOTIFY_OK;
691
692 switch (action) {
693 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
694 vfio_group_nb_add_dev(group, dev);
695 break;
696 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
697 /*
698 * Nothing to do here. If the device is in use, then the
699 * vfio sub-driver should block the remove callback until
700 * it is unused. If the device is unused or attached to a
701 * stub driver, then it should be released and we don't
702 * care that it will be going away.
703 */
704 break;
705 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
706 pr_debug("%s: Device %s, group %d binding to driver\n",
707 __func__, dev_name(dev),
708 iommu_group_id(group->iommu_group));
709 break;
710 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
711 pr_debug("%s: Device %s, group %d bound to driver %s\n",
712 __func__, dev_name(dev),
713 iommu_group_id(group->iommu_group), dev->driver->name);
714 BUG_ON(vfio_group_nb_verify(group, dev));
715 break;
716 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
717 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
718 __func__, dev_name(dev),
719 iommu_group_id(group->iommu_group), dev->driver->name);
720 break;
721 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
722 pr_debug("%s: Device %s, group %d unbound from driver\n",
723 __func__, dev_name(dev),
724 iommu_group_id(group->iommu_group));
725 /*
726 * XXX An unbound device in a live group is ok, but we'd
727 * really like to avoid the above BUG_ON by preventing other
728 * drivers from binding to it. Once that occurs, we have to
729 * stop the system to maintain isolation. At a minimum, we'd
730 * want a toggle to disable driver auto probe for this device.
731 */
732
733 mutex_lock(&group->unbound_lock);
734 list_for_each_entry(unbound,
735 &group->unbound_list, unbound_next) {
736 if (dev == unbound->dev) {
737 list_del(&unbound->unbound_next);
738 kfree(unbound);
739 break;
740 }
741 }
742 mutex_unlock(&group->unbound_lock);
743 break;
744 }
745
746 vfio_group_put(group);
747 return NOTIFY_OK;
748}
749
750/**
751 * VFIO driver API
752 */
753int vfio_add_group_dev(struct device *dev,
754 const struct vfio_device_ops *ops, void *device_data)
755{
756 struct iommu_group *iommu_group;
757 struct vfio_group *group;
758 struct vfio_device *device;
759
760 iommu_group = iommu_group_get(dev);
761 if (!iommu_group)
762 return -EINVAL;
763
764 group = vfio_group_get_from_iommu(iommu_group);
765 if (!group) {
766 group = vfio_create_group(iommu_group);
767 if (IS_ERR(group)) {
768 iommu_group_put(iommu_group);
769 return PTR_ERR(group);
770 }
771 } else {
772 /*
773 * A found vfio_group already holds a reference to the
774 * iommu_group. A created vfio_group keeps the reference.
775 */
776 iommu_group_put(iommu_group);
777 }
778
779 device = vfio_group_get_device(group, dev);
780 if (device) {
781 WARN(1, "Device %s already exists on group %d\n",
782 dev_name(dev), iommu_group_id(iommu_group));
783 vfio_device_put(device);
784 vfio_group_put(group);
785 return -EBUSY;
786 }
787
788 device = vfio_group_create_device(group, dev, ops, device_data);
789 if (IS_ERR(device)) {
790 vfio_group_put(group);
791 return PTR_ERR(device);
792 }
793
794 /*
795 * Drop all but the vfio_device reference. The vfio_device holds
796 * a reference to the vfio_group, which holds a reference to the
797 * iommu_group.
798 */
799 vfio_group_put(group);
800
801 return 0;
802}
803EXPORT_SYMBOL_GPL(vfio_add_group_dev);
804
805/**
806 * Get a reference to the vfio_device for a device. Even if the
807 * caller thinks they own the device, they could be racing with a
808 * release call path, so we can't trust drvdata for the shortcut.
809 * Go the long way around, from the iommu_group to the vfio_group
810 * to the vfio_device.
811 */
812struct vfio_device *vfio_device_get_from_dev(struct device *dev)
813{
814 struct iommu_group *iommu_group;
815 struct vfio_group *group;
816 struct vfio_device *device;
817
818 iommu_group = iommu_group_get(dev);
819 if (!iommu_group)
820 return NULL;
821
822 group = vfio_group_get_from_iommu(iommu_group);
823 iommu_group_put(iommu_group);
824 if (!group)
825 return NULL;
826
827 device = vfio_group_get_device(group, dev);
828 vfio_group_put(group);
829
830 return device;
831}
832EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
833
834static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
835 char *buf)
836{
837 struct vfio_device *it, *device = NULL;
838
839 mutex_lock(&group->device_lock);
840 list_for_each_entry(it, &group->device_list, group_next) {
841 if (!strcmp(dev_name(it->dev), buf)) {
842 device = it;
843 vfio_device_get(device);
844 break;
845 }
846 }
847 mutex_unlock(&group->device_lock);
848
849 return device;
850}
851
852/*
853 * Caller must hold a reference to the vfio_device
854 */
855void *vfio_device_data(struct vfio_device *device)
856{
857 return device->device_data;
858}
859EXPORT_SYMBOL_GPL(vfio_device_data);
860
861/* Given a referenced group, check if it contains the device */
862static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
863{
864 struct vfio_device *device;
865
866 device = vfio_group_get_device(group, dev);
867 if (!device)
868 return false;
869
870 vfio_device_put(device);
871 return true;
872}
873
874/*
875 * Decrement the device reference count and wait for the device to be
876 * removed. Open file descriptors for the device... */
877void *vfio_del_group_dev(struct device *dev)
878{
879 struct vfio_device *device = dev_get_drvdata(dev);
880 struct vfio_group *group = device->group;
881 void *device_data = device->device_data;
882 struct vfio_unbound_dev *unbound;
883 unsigned int i = 0;
884 long ret;
885 bool interrupted = false;
886
887 /*
888 * The group exists so long as we have a device reference. Get
889 * a group reference and use it to scan for the device going away.
890 */
891 vfio_group_get(group);
892
893 /*
894 * When the device is removed from the group, the group suddenly
895 * becomes non-viable; the device has a driver (until the unbind
896 * completes), but it's not present in the group. This is bad news
897 * for any external users that need to re-acquire a group reference
898 * in order to match and release their existing reference. To
899 * solve this, we track such devices on the unbound_list to bridge
900 * the gap until they're fully unbound.
901 */
902 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
903 if (unbound) {
904 unbound->dev = dev;
905 mutex_lock(&group->unbound_lock);
906 list_add(&unbound->unbound_next, &group->unbound_list);
907 mutex_unlock(&group->unbound_lock);
908 }
909 WARN_ON(!unbound);
910
911 vfio_device_put(device);
912
913 /*
914 * If the device is still present in the group after the above
915 * 'put', then it is in use and we need to request it from the
916 * bus driver. The driver may in turn need to request the
917 * device from the user. We send the request on an arbitrary
918 * interval with counter to allow the driver to take escalating
919 * measures to release the device if it has the ability to do so.
920 */
921 do {
922 device = vfio_group_get_device(group, dev);
923 if (!device)
924 break;
925
926 if (device->ops->request)
927 device->ops->request(device_data, i++);
928
929 vfio_device_put(device);
930
931 if (interrupted) {
932 ret = wait_event_timeout(vfio.release_q,
933 !vfio_dev_present(group, dev), HZ * 10);
934 } else {
935 ret = wait_event_interruptible_timeout(vfio.release_q,
936 !vfio_dev_present(group, dev), HZ * 10);
937 if (ret == -ERESTARTSYS) {
938 interrupted = true;
939 dev_warn(dev,
940 "Device is currently in use, task"
941 " \"%s\" (%d) "
942 "blocked until device is released",
943 current->comm, task_pid_nr(current));
944 }
945 }
946 } while (ret <= 0);
947
948 vfio_group_put(group);
949
950 return device_data;
951}
952EXPORT_SYMBOL_GPL(vfio_del_group_dev);
953
954/**
955 * VFIO base fd, /dev/vfio/vfio
956 */
957static long vfio_ioctl_check_extension(struct vfio_container *container,
958 unsigned long arg)
959{
960 struct vfio_iommu_driver *driver;
961 long ret = 0;
962
963 down_read(&container->group_lock);
964
965 driver = container->iommu_driver;
966
967 switch (arg) {
968 /* No base extensions yet */
969 default:
970 /*
971 * If no driver is set, poll all registered drivers for
972 * extensions and return the first positive result. If
973 * a driver is already set, further queries will be passed
974 * only to that driver.
975 */
976 if (!driver) {
977 mutex_lock(&vfio.iommu_drivers_lock);
978 list_for_each_entry(driver, &vfio.iommu_drivers_list,
979 vfio_next) {
980
981#ifdef CONFIG_VFIO_NOIOMMU
982 if (!list_empty(&container->group_list) &&
983 (container->noiommu !=
984 (driver->ops == &vfio_noiommu_ops)))
985 continue;
986#endif
987
988 if (!try_module_get(driver->ops->owner))
989 continue;
990
991 ret = driver->ops->ioctl(NULL,
992 VFIO_CHECK_EXTENSION,
993 arg);
994 module_put(driver->ops->owner);
995 if (ret > 0)
996 break;
997 }
998 mutex_unlock(&vfio.iommu_drivers_lock);
999 } else
1000 ret = driver->ops->ioctl(container->iommu_data,
1001 VFIO_CHECK_EXTENSION, arg);
1002 }
1003
1004 up_read(&container->group_lock);
1005
1006 return ret;
1007}
1008
1009/* hold write lock on container->group_lock */
1010static int __vfio_container_attach_groups(struct vfio_container *container,
1011 struct vfio_iommu_driver *driver,
1012 void *data)
1013{
1014 struct vfio_group *group;
1015 int ret = -ENODEV;
1016
1017 list_for_each_entry(group, &container->group_list, container_next) {
1018 ret = driver->ops->attach_group(data, group->iommu_group);
1019 if (ret)
1020 goto unwind;
1021 }
1022
1023 return ret;
1024
1025unwind:
1026 list_for_each_entry_continue_reverse(group, &container->group_list,
1027 container_next) {
1028 driver->ops->detach_group(data, group->iommu_group);
1029 }
1030
1031 return ret;
1032}
1033
1034static long vfio_ioctl_set_iommu(struct vfio_container *container,
1035 unsigned long arg)
1036{
1037 struct vfio_iommu_driver *driver;
1038 long ret = -ENODEV;
1039
1040 down_write(&container->group_lock);
1041
1042 /*
1043 * The container is designed to be an unprivileged interface while
1044 * the group can be assigned to specific users. Therefore, only by
1045 * adding a group to a container does the user get the privilege of
1046 * enabling the iommu, which may allocate finite resources. There
1047 * is no unset_iommu, but by removing all the groups from a container,
1048 * the container is deprivileged and returns to an unset state.
1049 */
1050 if (list_empty(&container->group_list) || container->iommu_driver) {
1051 up_write(&container->group_lock);
1052 return -EINVAL;
1053 }
1054
1055 mutex_lock(&vfio.iommu_drivers_lock);
1056 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1057 void *data;
1058
1059#ifdef CONFIG_VFIO_NOIOMMU
1060 /*
1061 * Only noiommu containers can use vfio-noiommu and noiommu
1062 * containers can only use vfio-noiommu.
1063 */
1064 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1065 continue;
1066#endif
1067
1068 if (!try_module_get(driver->ops->owner))
1069 continue;
1070
1071 /*
1072 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1073 * so test which iommu driver reported support for this
1074 * extension and call open on them. We also pass them the
1075 * magic, allowing a single driver to support multiple
1076 * interfaces if they'd like.
1077 */
1078 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1079 module_put(driver->ops->owner);
1080 continue;
1081 }
1082
1083 data = driver->ops->open(arg);
1084 if (IS_ERR(data)) {
1085 ret = PTR_ERR(data);
1086 module_put(driver->ops->owner);
1087 continue;
1088 }
1089
1090 ret = __vfio_container_attach_groups(container, driver, data);
1091 if (ret) {
1092 driver->ops->release(data);
1093 module_put(driver->ops->owner);
1094 continue;
1095 }
1096
1097 container->iommu_driver = driver;
1098 container->iommu_data = data;
1099 break;
1100 }
1101
1102 mutex_unlock(&vfio.iommu_drivers_lock);
1103 up_write(&container->group_lock);
1104
1105 return ret;
1106}
1107
1108static long vfio_fops_unl_ioctl(struct file *filep,
1109 unsigned int cmd, unsigned long arg)
1110{
1111 struct vfio_container *container = filep->private_data;
1112 struct vfio_iommu_driver *driver;
1113 void *data;
1114 long ret = -EINVAL;
1115
1116 if (!container)
1117 return ret;
1118
1119 switch (cmd) {
1120 case VFIO_GET_API_VERSION:
1121 ret = VFIO_API_VERSION;
1122 break;
1123 case VFIO_CHECK_EXTENSION:
1124 ret = vfio_ioctl_check_extension(container, arg);
1125 break;
1126 case VFIO_SET_IOMMU:
1127 ret = vfio_ioctl_set_iommu(container, arg);
1128 break;
1129 default:
1130 down_read(&container->group_lock);
1131
1132 driver = container->iommu_driver;
1133 data = container->iommu_data;
1134
1135 if (driver) /* passthrough all unrecognized ioctls */
1136 ret = driver->ops->ioctl(data, cmd, arg);
1137
1138 up_read(&container->group_lock);
1139 }
1140
1141 return ret;
1142}
1143
#ifdef CONFIG_COMPAT
/* Compat ioctl: run the argument through compat_ptr() and reuse the native path. */
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif /* CONFIG_COMPAT */
1152
1153static int vfio_fops_open(struct inode *inode, struct file *filep)
1154{
1155 struct vfio_container *container;
1156
1157 container = kzalloc(sizeof(*container), GFP_KERNEL);
1158 if (!container)
1159 return -ENOMEM;
1160
1161 INIT_LIST_HEAD(&container->group_list);
1162 init_rwsem(&container->group_lock);
1163 kref_init(&container->kref);
1164
1165 filep->private_data = container;
1166
1167 return 0;
1168}
1169
1170static int vfio_fops_release(struct inode *inode, struct file *filep)
1171{
1172 struct vfio_container *container = filep->private_data;
1173
1174 filep->private_data = NULL;
1175
1176 vfio_container_put(container);
1177
1178 return 0;
1179}
1180
1181/*
1182 * Once an iommu driver is set, we optionally pass read/write/mmap
1183 * on to the driver, allowing management interfaces beyond ioctl.
1184 */
1185static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1186 size_t count, loff_t *ppos)
1187{
1188 struct vfio_container *container = filep->private_data;
1189 struct vfio_iommu_driver *driver;
1190 ssize_t ret = -EINVAL;
1191
1192 down_read(&container->group_lock);
1193
1194 driver = container->iommu_driver;
1195 if (likely(driver && driver->ops->read))
1196 ret = driver->ops->read(container->iommu_data,
1197 buf, count, ppos);
1198
1199 up_read(&container->group_lock);
1200
1201 return ret;
1202}
1203
1204static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1205 size_t count, loff_t *ppos)
1206{
1207 struct vfio_container *container = filep->private_data;
1208 struct vfio_iommu_driver *driver;
1209 ssize_t ret = -EINVAL;
1210
1211 down_read(&container->group_lock);
1212
1213 driver = container->iommu_driver;
1214 if (likely(driver && driver->ops->write))
1215 ret = driver->ops->write(container->iommu_data,
1216 buf, count, ppos);
1217
1218 up_read(&container->group_lock);
1219
1220 return ret;
1221}
1222
1223static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1224{
1225 struct vfio_container *container = filep->private_data;
1226 struct vfio_iommu_driver *driver;
1227 int ret = -EINVAL;
1228
1229 down_read(&container->group_lock);
1230
1231 driver = container->iommu_driver;
1232 if (likely(driver && driver->ops->mmap))
1233 ret = driver->ops->mmap(container->iommu_data, vma);
1234
1235 up_read(&container->group_lock);
1236
1237 return ret;
1238}
1239
1240static const struct file_operations vfio_fops = {
1241 .owner = THIS_MODULE,
1242 .open = vfio_fops_open,
1243 .release = vfio_fops_release,
1244 .read = vfio_fops_read,
1245 .write = vfio_fops_write,
1246 .unlocked_ioctl = vfio_fops_unl_ioctl,
1247#ifdef CONFIG_COMPAT
1248 .compat_ioctl = vfio_fops_compat_ioctl,
1249#endif
1250 .mmap = vfio_fops_mmap,
1251};
1252
1253/**
1254 * VFIO Group fd, /dev/vfio/$GROUP
1255 */
1256static void __vfio_group_unset_container(struct vfio_group *group)
1257{
1258 struct vfio_container *container = group->container;
1259 struct vfio_iommu_driver *driver;
1260
1261 down_write(&container->group_lock);
1262
1263 driver = container->iommu_driver;
1264 if (driver)
1265 driver->ops->detach_group(container->iommu_data,
1266 group->iommu_group);
1267
1268 group->container = NULL;
1269 list_del(&group->container_next);
1270
1271 /* Detaching the last group deprivileges a container, remove iommu */
1272 if (driver && list_empty(&container->group_list)) {
1273 driver->ops->release(container->iommu_data);
1274 module_put(driver->ops->owner);
1275 container->iommu_driver = NULL;
1276 container->iommu_data = NULL;
1277 }
1278
1279 up_write(&container->group_lock);
1280
1281 vfio_container_put(container);
1282}
1283
1284/*
1285 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1286 * if there was no container to unset. Since the ioctl is called on
1287 * the group, we know that still exists, therefore the only valid
1288 * transition here is 1->0.
1289 */
1290static int vfio_group_unset_container(struct vfio_group *group)
1291{
1292 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1293
1294 if (!users)
1295 return -EINVAL;
1296 if (users != 1)
1297 return -EBUSY;
1298
1299 __vfio_group_unset_container(group);
1300
1301 return 0;
1302}
1303
1304/*
1305 * When removing container users, anything that removes the last user
1306 * implicitly removes the group from the container. That is, if the
1307 * group file descriptor is closed, as well as any device file descriptors,
1308 * the group is free.
1309 */
1310static void vfio_group_try_dissolve_container(struct vfio_group *group)
1311{
1312 if (0 == atomic_dec_if_positive(&group->container_users))
1313 __vfio_group_unset_container(group);
1314}
1315
1316static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1317{
1318 struct fd f;
1319 struct vfio_container *container;
1320 struct vfio_iommu_driver *driver;
1321 int ret = 0;
1322
1323 if (atomic_read(&group->container_users))
1324 return -EINVAL;
1325
1326 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1327 return -EPERM;
1328
1329 f = fdget(container_fd);
1330 if (!f.file)
1331 return -EBADF;
1332
1333 /* Sanity check, is this really our fd? */
1334 if (f.file->f_op != &vfio_fops) {
1335 fdput(f);
1336 return -EINVAL;
1337 }
1338
1339 container = f.file->private_data;
1340 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1341
1342 down_write(&container->group_lock);
1343
1344 /* Real groups and fake groups cannot mix */
1345 if (!list_empty(&container->group_list) &&
1346 container->noiommu != group->noiommu) {
1347 ret = -EPERM;
1348 goto unlock_out;
1349 }
1350
1351 driver = container->iommu_driver;
1352 if (driver) {
1353 ret = driver->ops->attach_group(container->iommu_data,
1354 group->iommu_group);
1355 if (ret)
1356 goto unlock_out;
1357 }
1358
1359 group->container = container;
1360 container->noiommu = group->noiommu;
1361 list_add(&group->container_next, &container->group_list);
1362
1363 /* Get a reference on the container and mark a user within the group */
1364 vfio_container_get(container);
1365 atomic_inc(&group->container_users);
1366
1367unlock_out:
1368 up_write(&container->group_lock);
1369 fdput(f);
1370 return ret;
1371}
1372
1373static bool vfio_group_viable(struct vfio_group *group)
1374{
1375 return (iommu_group_for_each_dev(group->iommu_group,
1376 group, vfio_dev_viable) == 0);
1377}
1378
1379static const struct file_operations vfio_device_fops;
1380
1381static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1382{
1383 struct vfio_device *device;
1384 struct file *filep;
1385 int ret;
1386
1387 if (0 == atomic_read(&group->container_users) ||
1388 !group->container->iommu_driver || !vfio_group_viable(group))
1389 return -EINVAL;
1390
1391 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1392 return -EPERM;
1393
1394 device = vfio_device_get_from_name(group, buf);
1395 if (!device)
1396 return -ENODEV;
1397
1398 ret = device->ops->open(device->device_data);
1399 if (ret) {
1400 vfio_device_put(device);
1401 return ret;
1402 }
1403
1404 /*
1405 * We can't use anon_inode_getfd() because we need to modify
1406 * the f_mode flags directly to allow more than just ioctls
1407 */
1408 ret = get_unused_fd_flags(O_CLOEXEC);
1409 if (ret < 0) {
1410 device->ops->release(device->device_data);
1411 vfio_device_put(device);
1412 return ret;
1413 }
1414
1415 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1416 device, O_RDWR);
1417 if (IS_ERR(filep)) {
1418 put_unused_fd(ret);
1419 ret = PTR_ERR(filep);
1420 device->ops->release(device->device_data);
1421 vfio_device_put(device);
1422 return ret;
1423 }
1424
1425 /*
1426 * TODO: add an anon_inode interface to do this.
1427 * Appears to be missing by lack of need rather than
1428 * explicitly prevented. Now there's need.
1429 */
1430 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1431
1432 atomic_inc(&group->container_users);
1433
1434 fd_install(ret, filep);
1435
1436 if (group->noiommu)
1437 dev_warn(device->dev, "vfio-noiommu device opened by user "
1438 "(%s:%d)\n", current->comm, task_pid_nr(current));
1439
1440 return ret;
1441}
1442
1443static long vfio_group_fops_unl_ioctl(struct file *filep,
1444 unsigned int cmd, unsigned long arg)
1445{
1446 struct vfio_group *group = filep->private_data;
1447 long ret = -ENOTTY;
1448
1449 switch (cmd) {
1450 case VFIO_GROUP_GET_STATUS:
1451 {
1452 struct vfio_group_status status;
1453 unsigned long minsz;
1454
1455 minsz = offsetofend(struct vfio_group_status, flags);
1456
1457 if (copy_from_user(&status, (void __user *)arg, minsz))
1458 return -EFAULT;
1459
1460 if (status.argsz < minsz)
1461 return -EINVAL;
1462
1463 status.flags = 0;
1464
1465 if (vfio_group_viable(group))
1466 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1467
1468 if (group->container)
1469 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1470
1471 if (copy_to_user((void __user *)arg, &status, minsz))
1472 return -EFAULT;
1473
1474 ret = 0;
1475 break;
1476 }
1477 case VFIO_GROUP_SET_CONTAINER:
1478 {
1479 int fd;
1480
1481 if (get_user(fd, (int __user *)arg))
1482 return -EFAULT;
1483
1484 if (fd < 0)
1485 return -EINVAL;
1486
1487 ret = vfio_group_set_container(group, fd);
1488 break;
1489 }
1490 case VFIO_GROUP_UNSET_CONTAINER:
1491 ret = vfio_group_unset_container(group);
1492 break;
1493 case VFIO_GROUP_GET_DEVICE_FD:
1494 {
1495 char *buf;
1496
1497 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1498 if (IS_ERR(buf))
1499 return PTR_ERR(buf);
1500
1501 ret = vfio_group_get_device_fd(group, buf);
1502 kfree(buf);
1503 break;
1504 }
1505 }
1506
1507 return ret;
1508}
1509
#ifdef CONFIG_COMPAT
/* Compat ioctl: run the argument through compat_ptr() and reuse the native path. */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_group_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1518
1519static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1520{
1521 struct vfio_group *group;
1522 int opened;
1523
1524 group = vfio_group_get_from_minor(iminor(inode));
1525 if (!group)
1526 return -ENODEV;
1527
1528 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1529 vfio_group_put(group);
1530 return -EPERM;
1531 }
1532
1533 /* Do we need multiple instances of the group open? Seems not. */
1534 opened = atomic_cmpxchg(&group->opened, 0, 1);
1535 if (opened) {
1536 vfio_group_put(group);
1537 return -EBUSY;
1538 }
1539
1540 /* Is something still in use from a previous open? */
1541 if (group->container) {
1542 atomic_dec(&group->opened);
1543 vfio_group_put(group);
1544 return -EBUSY;
1545 }
1546
1547 filep->private_data = group;
1548
1549 return 0;
1550}
1551
1552static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1553{
1554 struct vfio_group *group = filep->private_data;
1555
1556 filep->private_data = NULL;
1557
1558 vfio_group_try_dissolve_container(group);
1559
1560 atomic_dec(&group->opened);
1561
1562 vfio_group_put(group);
1563
1564 return 0;
1565}
1566
1567static const struct file_operations vfio_group_fops = {
1568 .owner = THIS_MODULE,
1569 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1570#ifdef CONFIG_COMPAT
1571 .compat_ioctl = vfio_group_fops_compat_ioctl,
1572#endif
1573 .open = vfio_group_fops_open,
1574 .release = vfio_group_fops_release,
1575};
1576
1577/**
1578 * VFIO Device fd
1579 */
1580static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1581{
1582 struct vfio_device *device = filep->private_data;
1583
1584 device->ops->release(device->device_data);
1585
1586 vfio_group_try_dissolve_container(device->group);
1587
1588 vfio_device_put(device);
1589
1590 return 0;
1591}
1592
1593static long vfio_device_fops_unl_ioctl(struct file *filep,
1594 unsigned int cmd, unsigned long arg)
1595{
1596 struct vfio_device *device = filep->private_data;
1597
1598 if (unlikely(!device->ops->ioctl))
1599 return -EINVAL;
1600
1601 return device->ops->ioctl(device->device_data, cmd, arg);
1602}
1603
1604static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1605 size_t count, loff_t *ppos)
1606{
1607 struct vfio_device *device = filep->private_data;
1608
1609 if (unlikely(!device->ops->read))
1610 return -EINVAL;
1611
1612 return device->ops->read(device->device_data, buf, count, ppos);
1613}
1614
1615static ssize_t vfio_device_fops_write(struct file *filep,
1616 const char __user *buf,
1617 size_t count, loff_t *ppos)
1618{
1619 struct vfio_device *device = filep->private_data;
1620
1621 if (unlikely(!device->ops->write))
1622 return -EINVAL;
1623
1624 return device->ops->write(device->device_data, buf, count, ppos);
1625}
1626
1627static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1628{
1629 struct vfio_device *device = filep->private_data;
1630
1631 if (unlikely(!device->ops->mmap))
1632 return -EINVAL;
1633
1634 return device->ops->mmap(device->device_data, vma);
1635}
1636
#ifdef CONFIG_COMPAT
/* Compat ioctl: run the argument through compat_ptr() and reuse the native path. */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_device_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1645
1646static const struct file_operations vfio_device_fops = {
1647 .owner = THIS_MODULE,
1648 .release = vfio_device_fops_release,
1649 .read = vfio_device_fops_read,
1650 .write = vfio_device_fops_write,
1651 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1652#ifdef CONFIG_COMPAT
1653 .compat_ioctl = vfio_device_fops_compat_ioctl,
1654#endif
1655 .mmap = vfio_device_fops_mmap,
1656};
1657
1658/**
1659 * External user API, exported by symbols to be linked dynamically.
1660 *
1661 * The protocol includes:
1662 * 1. do normal VFIO init operation:
1663 * - opening a new container;
1664 * - attaching group(s) to it;
1665 * - setting an IOMMU driver for a container.
1666 * When IOMMU is set for a container, all groups in it are
1667 * considered ready to use by an external user.
1668 *
1669 * 2. User space passes a group fd to an external user.
1670 * The external user calls vfio_group_get_external_user()
1671 * to verify that:
1672 * - the group is initialized;
1673 * - IOMMU is set for it.
1674 * If both checks passed, vfio_group_get_external_user()
1675 * increments the container user counter to prevent
1676 * the VFIO group from disposal before KVM exits.
1677 *
1678 * 3. The external user calls vfio_external_user_iommu_id()
1679 * to know an IOMMU ID.
1680 *
1681 * 4. When the external KVM finishes, it calls
1682 * vfio_group_put_external_user() to release the VFIO group.
1683 * This call decrements the container user counter.
1684 */
1685struct vfio_group *vfio_group_get_external_user(struct file *filep)
1686{
1687 struct vfio_group *group = filep->private_data;
1688
1689 if (filep->f_op != &vfio_group_fops)
1690 return ERR_PTR(-EINVAL);
1691
1692 if (!atomic_inc_not_zero(&group->container_users))
1693 return ERR_PTR(-EINVAL);
1694
1695 if (group->noiommu) {
1696 atomic_dec(&group->container_users);
1697 return ERR_PTR(-EPERM);
1698 }
1699
1700 if (!group->container->iommu_driver ||
1701 !vfio_group_viable(group)) {
1702 atomic_dec(&group->container_users);
1703 return ERR_PTR(-EINVAL);
1704 }
1705
1706 vfio_group_get(group);
1707
1708 return group;
1709}
1710EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1711
/*
 * Release a group obtained from vfio_group_get_external_user().
 *
 * The container user is given up first, while the caller's group reference
 * still keeps @group alive; only afterwards is that reference dropped.
 * In the opposite order vfio_group_put() could drop the last reference and
 * vfio_group_try_dissolve_container() would then operate on a released
 * group.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1718
1719int vfio_external_user_iommu_id(struct vfio_group *group)
1720{
1721 return iommu_group_id(group->iommu_group);
1722}
1723EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1724
1725long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1726{
1727 return vfio_ioctl_check_extension(group->container, arg);
1728}
1729EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1730
1731/**
1732 * Sub-module support
1733 */
1734/*
1735 * Helper for managing a buffer of info chain capabilities, allocate or
1736 * reallocate a buffer with additional @size, filling in @id and @version
1737 * of the capability. A pointer to the new capability is returned.
1738 *
1739 * NB. The chain is based at the head of the buffer, so new entries are
1740 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1741 * next offsets prior to copying to the user buffer.
1742 */
1743struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1744 size_t size, u16 id, u16 version)
1745{
1746 void *buf;
1747 struct vfio_info_cap_header *header, *tmp;
1748
1749 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1750 if (!buf) {
1751 kfree(caps->buf);
1752 caps->size = 0;
1753 return ERR_PTR(-ENOMEM);
1754 }
1755
1756 caps->buf = buf;
1757 header = buf + caps->size;
1758
1759 /* Eventually copied to user buffer, zero */
1760 memset(header, 0, size);
1761
1762 header->id = id;
1763 header->version = version;
1764
1765 /* Add to the end of the capability chain */
1766 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next)
1767 ; /* nothing */
1768
1769 tmp->next = caps->size;
1770 caps->size += size;
1771
1772 return header;
1773}
1774EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1775
1776void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1777{
1778 struct vfio_info_cap_header *tmp;
1779
1780 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset)
1781 tmp->next += offset;
1782}
1783EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
1784
1785/**
1786 * Module/class support
1787 */
1788static char *vfio_devnode(struct device *dev, umode_t *mode)
1789{
1790 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1791}
1792
1793static struct miscdevice vfio_dev = {
1794 .minor = VFIO_MINOR,
1795 .name = "vfio",
1796 .fops = &vfio_fops,
1797 .nodename = "vfio/vfio",
1798 .mode = S_IRUGO | S_IWUGO,
1799};
1800
1801static int __init vfio_init(void)
1802{
1803 int ret;
1804
1805 idr_init(&vfio.group_idr);
1806 mutex_init(&vfio.group_lock);
1807 mutex_init(&vfio.iommu_drivers_lock);
1808 INIT_LIST_HEAD(&vfio.group_list);
1809 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1810 init_waitqueue_head(&vfio.release_q);
1811
1812 ret = misc_register(&vfio_dev);
1813 if (ret) {
1814 pr_err("vfio: misc device register failed\n");
1815 return ret;
1816 }
1817
1818 /* /dev/vfio/$GROUP */
1819 vfio.class = class_create(THIS_MODULE, "vfio");
1820 if (IS_ERR(vfio.class)) {
1821 ret = PTR_ERR(vfio.class);
1822 goto err_class;
1823 }
1824
1825 vfio.class->devnode = vfio_devnode;
1826
1827 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
1828 if (ret)
1829 goto err_alloc_chrdev;
1830
1831 cdev_init(&vfio.group_cdev, &vfio_group_fops);
1832 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
1833 if (ret)
1834 goto err_cdev_add;
1835
1836 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1837
1838 /*
1839 * Attempt to load known iommu-drivers. This gives us a working
1840 * environment without the user needing to explicitly load iommu
1841 * drivers.
1842 */
1843 request_module_nowait("vfio_iommu_type1");
1844 request_module_nowait("vfio_iommu_spapr_tce");
1845
1846#ifdef CONFIG_VFIO_NOIOMMU
1847 vfio_register_iommu_driver(&vfio_noiommu_ops);
1848#endif
1849 return 0;
1850
1851err_cdev_add:
1852 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1853err_alloc_chrdev:
1854 class_destroy(vfio.class);
1855 vfio.class = NULL;
1856err_class:
1857 misc_deregister(&vfio_dev);
1858 return ret;
1859}
1860
1861static void __exit vfio_cleanup(void)
1862{
1863 WARN_ON(!list_empty(&vfio.group_list));
1864
1865#ifdef CONFIG_VFIO_NOIOMMU
1866 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
1867#endif
1868 idr_destroy(&vfio.group_idr);
1869 cdev_del(&vfio.group_cdev);
1870 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1871 class_destroy(vfio.class);
1872 vfio.class = NULL;
1873 misc_deregister(&vfio_dev);
1874}
1875
1876module_init(vfio_init);
1877module_exit(vfio_cleanup);
1878
1879MODULE_VERSION(DRIVER_VERSION);
1880MODULE_LICENSE("GPL v2");
1881MODULE_AUTHOR(DRIVER_AUTHOR);
1882MODULE_DESCRIPTION(DRIVER_DESC);
1883MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1884MODULE_ALIAS("devname:vfio/vfio");
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13#include <linux/cdev.h>
14#include <linux/compat.h>
15#include <linux/device.h>
16#include <linux/file.h>
17#include <linux/anon_inodes.h>
18#include <linux/fs.h>
19#include <linux/idr.h>
20#include <linux/iommu.h>
21#include <linux/list.h>
22#include <linux/miscdevice.h>
23#include <linux/module.h>
24#include <linux/mutex.h>
25#include <linux/pci.h>
26#include <linux/rwsem.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/stat.h>
30#include <linux/string.h>
31#include <linux/uaccess.h>
32#include <linux/vfio.h>
33#include <linux/wait.h>
34#include <linux/sched/signal.h>
35
/* Strings reported through the MODULE_* metadata and the load banner. */
#define DRIVER_DESC	"VFIO - User Level meta-driver"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_VERSION	"0.3"
39
40static struct vfio {
41 struct class *class;
42 struct list_head iommu_drivers_list;
43 struct mutex iommu_drivers_lock;
44 struct list_head group_list;
45 struct idr group_idr;
46 struct mutex group_lock;
47 struct cdev group_cdev;
48 dev_t group_devt;
49 wait_queue_head_t release_q;
50} vfio;
51
52struct vfio_iommu_driver {
53 const struct vfio_iommu_driver_ops *ops;
54 struct list_head vfio_next;
55};
56
57struct vfio_container {
58 struct kref kref;
59 struct list_head group_list;
60 struct rw_semaphore group_lock;
61 struct vfio_iommu_driver *iommu_driver;
62 void *iommu_data;
63 bool noiommu;
64};
65
66struct vfio_unbound_dev {
67 struct device *dev;
68 struct list_head unbound_next;
69};
70
71struct vfio_group {
72 struct kref kref;
73 int minor;
74 atomic_t container_users;
75 struct iommu_group *iommu_group;
76 struct vfio_container *container;
77 struct list_head device_list;
78 struct mutex device_lock;
79 struct device *dev;
80 struct notifier_block nb;
81 struct list_head vfio_next;
82 struct list_head container_next;
83 struct list_head unbound_list;
84 struct mutex unbound_lock;
85 atomic_t opened;
86 wait_queue_head_t container_q;
87 bool noiommu;
88 unsigned int dev_counter;
89 struct kvm *kvm;
90 struct blocking_notifier_head notifier;
91};
92
93struct vfio_device {
94 struct kref kref;
95 struct device *dev;
96 const struct vfio_device_ops *ops;
97 struct vfio_group *group;
98 struct list_head group_next;
99 void *device_data;
100};
101
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Switch for the unsafe no-IOMMU mode, exposed as the
 * "enable_unsafe_noiommu_mode" module parameter.  The address of this
 * variable also serves as the iommudata tag of vfio-noiommu groups.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
108
/*
 * vfio_iommu_group_{get,put} exist for the probe and remove functions of
 * VFIO bus drivers only: to take the first reference before calling
 * vfio_add_group_dev() and to drop that same reference after
 * vfio_del_group_dev().  Every other user should call the raw
 * iommu_group_{get,put} functions.  Note in particular that
 * vfio_iommu_group_put() removes the device from the dummy group and
 * therefore cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);

	if (iommu_group_add_device(group, dev)) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
161
/*
 * Counterpart of vfio_iommu_group_get(): for a vfio-noiommu group the
 * device is removed from the group first, then the group reference is
 * dropped.
 */
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	void *tag = iommu_group_get_iommudata(group);

	if (tag == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
172
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Built-in "vfio-noiommu" backend.  It keeps no per-container state (open
 * returns NULL as its data) and only accepts groups tagged as noiommu.
 */

/* Open for VFIO_NOIOMMU_IOMMU only, and only with CAP_SYS_RAWIO. */
static void *vfio_noiommu_open(unsigned long arg)
{
	long err = 0;

	if (arg != VFIO_NOIOMMU_IOMMU)
		err = -EINVAL;
	else if (!capable(CAP_SYS_RAWIO))
		err = -EPERM;

	return err ? ERR_PTR(err) : NULL;
}

/* Nothing was allocated in open, so there is nothing to release. */
static void vfio_noiommu_release(void *iommu_data)
{
}

/*
 * Only VFIO_CHECK_EXTENSION is understood: it reports 1 for
 * VFIO_NOIOMMU_IOMMU while the noiommu mode is enabled, 0 otherwise.
 */
static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd != VFIO_CHECK_EXTENSION)
		return -ENOTTY;

	return (noiommu && arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
}

/* Accept only iommu_groups carrying the noiommu tag as iommudata. */
static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	if (iommu_group_get_iommudata(iommu_group) == &noiommu)
		return 0;

	return -EINVAL;
}

/* Attach kept no state, so detach has nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name		= "vfio-noiommu",
	.owner		= THIS_MODULE,
	.open		= vfio_noiommu_open,
	.release	= vfio_noiommu_release,
	.ioctl		= vfio_noiommu_ioctl,
	.attach_group	= vfio_noiommu_attach_group,
	.detach_group	= vfio_noiommu_detach_group,
};
#endif
218
219
220/**
221 * IOMMU driver registration
222 */
223int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
224{
225 struct vfio_iommu_driver *driver, *tmp;
226
227 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
228 if (!driver)
229 return -ENOMEM;
230
231 driver->ops = ops;
232
233 mutex_lock(&vfio.iommu_drivers_lock);
234
235 /* Check for duplicates */
236 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
237 if (tmp->ops == ops) {
238 mutex_unlock(&vfio.iommu_drivers_lock);
239 kfree(driver);
240 return -EINVAL;
241 }
242 }
243
244 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
245
246 mutex_unlock(&vfio.iommu_drivers_lock);
247
248 return 0;
249}
250EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
251
252void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
253{
254 struct vfio_iommu_driver *driver;
255
256 mutex_lock(&vfio.iommu_drivers_lock);
257 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
258 if (driver->ops == ops) {
259 list_del(&driver->vfio_next);
260 mutex_unlock(&vfio.iommu_drivers_lock);
261 kfree(driver);
262 return;
263 }
264 }
265 mutex_unlock(&vfio.iommu_drivers_lock);
266}
267EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
268
269/**
270 * Group minor allocation/free - both called with vfio.group_lock held
271 */
272static int vfio_alloc_group_minor(struct vfio_group *group)
273{
274 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
275}
276
277static void vfio_free_group_minor(int minor)
278{
279 idr_remove(&vfio.group_idr, minor);
280}
281
/* Forward declarations for functions used before their definition. */
static void vfio_group_get(struct vfio_group *group);
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
285
286/**
287 * Container objects - containers are created when /dev/vfio/vfio is
288 * opened, but their lifecycle extends until the last user is done, so
289 * it's freed via kref. Must support container/group/device being
290 * closed in any order.
291 */
292static void vfio_container_get(struct vfio_container *container)
293{
294 kref_get(&container->kref);
295}
296
297static void vfio_container_release(struct kref *kref)
298{
299 struct vfio_container *container;
300 container = container_of(kref, struct vfio_container, kref);
301
302 kfree(container);
303}
304
305static void vfio_container_put(struct vfio_container *container)
306{
307 kref_put(&container->kref, vfio_container_release);
308}
309
310static void vfio_group_unlock_and_free(struct vfio_group *group)
311{
312 mutex_unlock(&vfio.group_lock);
313 /*
314 * Unregister outside of lock. A spurious callback is harmless now
315 * that the group is no longer in vfio.group_list.
316 */
317 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
318 kfree(group);
319}
320
321/**
322 * Group objects - create, release, get, put, search
323 */
324static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
325{
326 struct vfio_group *group, *tmp;
327 struct device *dev;
328 int ret, minor;
329
330 group = kzalloc(sizeof(*group), GFP_KERNEL);
331 if (!group)
332 return ERR_PTR(-ENOMEM);
333
334 kref_init(&group->kref);
335 INIT_LIST_HEAD(&group->device_list);
336 mutex_init(&group->device_lock);
337 INIT_LIST_HEAD(&group->unbound_list);
338 mutex_init(&group->unbound_lock);
339 atomic_set(&group->container_users, 0);
340 atomic_set(&group->opened, 0);
341 init_waitqueue_head(&group->container_q);
342 group->iommu_group = iommu_group;
343#ifdef CONFIG_VFIO_NOIOMMU
344 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
345#endif
346 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
347
348 group->nb.notifier_call = vfio_iommu_group_notifier;
349
350 /*
351 * blocking notifiers acquire a rwsem around registering and hold
352 * it around callback. Therefore, need to register outside of
353 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
354 * do anything unless it can find the group in vfio.group_list, so
355 * no harm in registering early.
356 */
357 ret = iommu_group_register_notifier(iommu_group, &group->nb);
358 if (ret) {
359 kfree(group);
360 return ERR_PTR(ret);
361 }
362
363 mutex_lock(&vfio.group_lock);
364
365 /* Did we race creating this group? */
366 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
367 if (tmp->iommu_group == iommu_group) {
368 vfio_group_get(tmp);
369 vfio_group_unlock_and_free(group);
370 return tmp;
371 }
372 }
373
374 minor = vfio_alloc_group_minor(group);
375 if (minor < 0) {
376 vfio_group_unlock_and_free(group);
377 return ERR_PTR(minor);
378 }
379
380 dev = device_create(vfio.class, NULL,
381 MKDEV(MAJOR(vfio.group_devt), minor),
382 group, "%s%d", group->noiommu ? "noiommu-" : "",
383 iommu_group_id(iommu_group));
384 if (IS_ERR(dev)) {
385 vfio_free_group_minor(minor);
386 vfio_group_unlock_and_free(group);
387 return ERR_CAST(dev);
388 }
389
390 group->minor = minor;
391 group->dev = dev;
392
393 list_add(&group->vfio_next, &vfio.group_list);
394
395 mutex_unlock(&vfio.group_lock);
396
397 return group;
398}
399
400/* called with vfio.group_lock held */
401static void vfio_group_release(struct kref *kref)
402{
403 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
404 struct vfio_unbound_dev *unbound, *tmp;
405 struct iommu_group *iommu_group = group->iommu_group;
406
407 WARN_ON(!list_empty(&group->device_list));
408 WARN_ON(group->notifier.head);
409
410 list_for_each_entry_safe(unbound, tmp,
411 &group->unbound_list, unbound_next) {
412 list_del(&unbound->unbound_next);
413 kfree(unbound);
414 }
415
416 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
417 list_del(&group->vfio_next);
418 vfio_free_group_minor(group->minor);
419 vfio_group_unlock_and_free(group);
420 iommu_group_put(iommu_group);
421}
422
423static void vfio_group_put(struct vfio_group *group)
424{
425 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
426}
427
428struct vfio_group_put_work {
429 struct work_struct work;
430 struct vfio_group *group;
431};
432
433static void vfio_group_put_bg(struct work_struct *work)
434{
435 struct vfio_group_put_work *do_work;
436
437 do_work = container_of(work, struct vfio_group_put_work, work);
438
439 vfio_group_put(do_work->group);
440 kfree(do_work);
441}
442
443static void vfio_group_schedule_put(struct vfio_group *group)
444{
445 struct vfio_group_put_work *do_work;
446
447 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
448 if (WARN_ON(!do_work))
449 return;
450
451 INIT_WORK(&do_work->work, vfio_group_put_bg);
452 do_work->group = group;
453 schedule_work(&do_work->work);
454}
455
456/* Assume group_lock or group reference is held */
457static void vfio_group_get(struct vfio_group *group)
458{
459 kref_get(&group->kref);
460}
461
462/*
463 * Not really a try as we will sleep for mutex, but we need to make
464 * sure the group pointer is valid under lock and get a reference.
465 */
466static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
467{
468 struct vfio_group *target = group;
469
470 mutex_lock(&vfio.group_lock);
471 list_for_each_entry(group, &vfio.group_list, vfio_next) {
472 if (group == target) {
473 vfio_group_get(group);
474 mutex_unlock(&vfio.group_lock);
475 return group;
476 }
477 }
478 mutex_unlock(&vfio.group_lock);
479
480 return NULL;
481}
482
483static
484struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
485{
486 struct vfio_group *group;
487
488 mutex_lock(&vfio.group_lock);
489 list_for_each_entry(group, &vfio.group_list, vfio_next) {
490 if (group->iommu_group == iommu_group) {
491 vfio_group_get(group);
492 mutex_unlock(&vfio.group_lock);
493 return group;
494 }
495 }
496 mutex_unlock(&vfio.group_lock);
497
498 return NULL;
499}
500
501static struct vfio_group *vfio_group_get_from_minor(int minor)
502{
503 struct vfio_group *group;
504
505 mutex_lock(&vfio.group_lock);
506 group = idr_find(&vfio.group_idr, minor);
507 if (!group) {
508 mutex_unlock(&vfio.group_lock);
509 return NULL;
510 }
511 vfio_group_get(group);
512 mutex_unlock(&vfio.group_lock);
513
514 return group;
515}
516
517static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
518{
519 struct iommu_group *iommu_group;
520 struct vfio_group *group;
521
522 iommu_group = iommu_group_get(dev);
523 if (!iommu_group)
524 return NULL;
525
526 group = vfio_group_get_from_iommu(iommu_group);
527 iommu_group_put(iommu_group);
528
529 return group;
530}
531
532/**
533 * Device objects - create, release, get, put, search
534 */
535static
536struct vfio_device *vfio_group_create_device(struct vfio_group *group,
537 struct device *dev,
538 const struct vfio_device_ops *ops,
539 void *device_data)
540{
541 struct vfio_device *device;
542
543 device = kzalloc(sizeof(*device), GFP_KERNEL);
544 if (!device)
545 return ERR_PTR(-ENOMEM);
546
547 kref_init(&device->kref);
548 device->dev = dev;
549 device->group = group;
550 device->ops = ops;
551 device->device_data = device_data;
552 dev_set_drvdata(dev, device);
553
554 /* No need to get group_lock, caller has group reference */
555 vfio_group_get(group);
556
557 mutex_lock(&group->device_lock);
558 list_add(&device->group_next, &group->device_list);
559 group->dev_counter++;
560 mutex_unlock(&group->device_lock);
561
562 return device;
563}
564
565static void vfio_device_release(struct kref *kref)
566{
567 struct vfio_device *device = container_of(kref,
568 struct vfio_device, kref);
569 struct vfio_group *group = device->group;
570
571 list_del(&device->group_next);
572 group->dev_counter--;
573 mutex_unlock(&group->device_lock);
574
575 dev_set_drvdata(device->dev, NULL);
576
577 kfree(device);
578
579 /* vfio_del_group_dev may be waiting for this device */
580 wake_up(&vfio.release_q);
581}
582
583/* Device reference always implies a group reference */
584void vfio_device_put(struct vfio_device *device)
585{
586 struct vfio_group *group = device->group;
587 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
588 vfio_group_put(group);
589}
590EXPORT_SYMBOL_GPL(vfio_device_put);
591
592static void vfio_device_get(struct vfio_device *device)
593{
594 vfio_group_get(device->group);
595 kref_get(&device->kref);
596}
597
598static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
599 struct device *dev)
600{
601 struct vfio_device *device;
602
603 mutex_lock(&group->device_lock);
604 list_for_each_entry(device, &group->device_list, group_next) {
605 if (device->dev == dev) {
606 vfio_device_get(device);
607 mutex_unlock(&group->device_lock);
608 return device;
609 }
610 }
611 mutex_unlock(&group->device_lock);
612 return NULL;
613}
614
615/*
616 * Some drivers, like pci-stub, are only used to prevent other drivers from
617 * claiming a device and are therefore perfectly legitimate for a user owned
618 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
619 * of the device, but it does prevent the user from having direct access to
620 * the device, which is useful in some circumstances.
621 *
622 * We also assume that we can include PCI interconnect devices, ie. bridges.
623 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
624 * then all of the downstream devices will be part of the same IOMMU group as
625 * the bridge. Thus, if placing the bridge into the user owned IOVA space
626 * breaks anything, it only does so for user owned devices downstream. Note
627 * that error notification via MSI can be affected for platforms that handle
628 * MSI within the same IOVA space as DMA.
629 */
630static const char * const vfio_driver_allowed[] = { "pci-stub" };
631
632static bool vfio_dev_driver_allowed(struct device *dev,
633 struct device_driver *drv)
634{
635 if (dev_is_pci(dev)) {
636 struct pci_dev *pdev = to_pci_dev(dev);
637
638 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
639 return true;
640 }
641
642 return match_string(vfio_driver_allowed,
643 ARRAY_SIZE(vfio_driver_allowed),
644 drv->name) >= 0;
645}
646
647/*
648 * A vfio group is viable for use by userspace if all devices are in
649 * one of the following states:
650 * - driver-less
651 * - bound to a vfio driver
652 * - bound to an otherwise allowed driver
653 * - a PCI interconnect device
654 *
655 * We use two methods to determine whether a device is bound to a vfio
656 * driver. The first is to test whether the device exists in the vfio
657 * group. The second is to test if the device exists on the group
658 * unbound_list, indicating it's in the middle of transitioning from
659 * a vfio driver to driver-less.
660 */
661static int vfio_dev_viable(struct device *dev, void *data)
662{
663 struct vfio_group *group = data;
664 struct vfio_device *device;
665 struct device_driver *drv = READ_ONCE(dev->driver);
666 struct vfio_unbound_dev *unbound;
667 int ret = -EINVAL;
668
669 mutex_lock(&group->unbound_lock);
670 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
671 if (dev == unbound->dev) {
672 ret = 0;
673 break;
674 }
675 }
676 mutex_unlock(&group->unbound_lock);
677
678 if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
679 return 0;
680
681 device = vfio_group_get_device(group, dev);
682 if (device) {
683 vfio_device_put(device);
684 return 0;
685 }
686
687 return ret;
688}
689
690/**
691 * Async device support
692 */
693static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
694{
695 struct vfio_device *device;
696
697 /* Do we already know about it? We shouldn't */
698 device = vfio_group_get_device(group, dev);
699 if (WARN_ON_ONCE(device)) {
700 vfio_device_put(device);
701 return 0;
702 }
703
704 /* Nothing to do for idle groups */
705 if (!atomic_read(&group->container_users))
706 return 0;
707
708 /* TODO Prevent device auto probing */
709 dev_WARN(dev, "Device added to live group %d!\n",
710 iommu_group_id(group->iommu_group));
711
712 return 0;
713}
714
715static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
716{
717 /* We don't care what happens when the group isn't in use */
718 if (!atomic_read(&group->container_users))
719 return 0;
720
721 return vfio_dev_viable(dev, group);
722}
723
724static int vfio_iommu_group_notifier(struct notifier_block *nb,
725 unsigned long action, void *data)
726{
727 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
728 struct device *dev = data;
729 struct vfio_unbound_dev *unbound;
730
731 /*
732 * Need to go through a group_lock lookup to get a reference or we
733 * risk racing a group being removed. Ignore spurious notifies.
734 */
735 group = vfio_group_try_get(group);
736 if (!group)
737 return NOTIFY_OK;
738
739 switch (action) {
740 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
741 vfio_group_nb_add_dev(group, dev);
742 break;
743 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
744 /*
745 * Nothing to do here. If the device is in use, then the
746 * vfio sub-driver should block the remove callback until
747 * it is unused. If the device is unused or attached to a
748 * stub driver, then it should be released and we don't
749 * care that it will be going away.
750 */
751 break;
752 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
753 dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
754 iommu_group_id(group->iommu_group));
755 break;
756 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
757 dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
758 iommu_group_id(group->iommu_group), dev->driver->name);
759 BUG_ON(vfio_group_nb_verify(group, dev));
760 break;
761 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
762 dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
763 __func__, iommu_group_id(group->iommu_group),
764 dev->driver->name);
765 break;
766 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
767 dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
768 iommu_group_id(group->iommu_group));
769 /*
770 * XXX An unbound device in a live group is ok, but we'd
771 * really like to avoid the above BUG_ON by preventing other
772 * drivers from binding to it. Once that occurs, we have to
773 * stop the system to maintain isolation. At a minimum, we'd
774 * want a toggle to disable driver auto probe for this device.
775 */
776
777 mutex_lock(&group->unbound_lock);
778 list_for_each_entry(unbound,
779 &group->unbound_list, unbound_next) {
780 if (dev == unbound->dev) {
781 list_del(&unbound->unbound_next);
782 kfree(unbound);
783 break;
784 }
785 }
786 mutex_unlock(&group->unbound_lock);
787 break;
788 }
789
790 /*
791 * If we're the last reference to the group, the group will be
792 * released, which includes unregistering the iommu group notifier.
793 * We hold a read-lock on that notifier list, unregistering needs
794 * a write-lock... deadlock. Release our reference asynchronously
795 * to avoid that situation.
796 */
797 vfio_group_schedule_put(group);
798 return NOTIFY_OK;
799}
800
801/**
802 * VFIO driver API
803 */
804int vfio_add_group_dev(struct device *dev,
805 const struct vfio_device_ops *ops, void *device_data)
806{
807 struct iommu_group *iommu_group;
808 struct vfio_group *group;
809 struct vfio_device *device;
810
811 iommu_group = iommu_group_get(dev);
812 if (!iommu_group)
813 return -EINVAL;
814
815 group = vfio_group_get_from_iommu(iommu_group);
816 if (!group) {
817 group = vfio_create_group(iommu_group);
818 if (IS_ERR(group)) {
819 iommu_group_put(iommu_group);
820 return PTR_ERR(group);
821 }
822 } else {
823 /*
824 * A found vfio_group already holds a reference to the
825 * iommu_group. A created vfio_group keeps the reference.
826 */
827 iommu_group_put(iommu_group);
828 }
829
830 device = vfio_group_get_device(group, dev);
831 if (device) {
832 dev_WARN(dev, "Device already exists on group %d\n",
833 iommu_group_id(iommu_group));
834 vfio_device_put(device);
835 vfio_group_put(group);
836 return -EBUSY;
837 }
838
839 device = vfio_group_create_device(group, dev, ops, device_data);
840 if (IS_ERR(device)) {
841 vfio_group_put(group);
842 return PTR_ERR(device);
843 }
844
845 /*
846 * Drop all but the vfio_device reference. The vfio_device holds
847 * a reference to the vfio_group, which holds a reference to the
848 * iommu_group.
849 */
850 vfio_group_put(group);
851
852 return 0;
853}
854EXPORT_SYMBOL_GPL(vfio_add_group_dev);
855
856/**
857 * Get a reference to the vfio_device for a device. Even if the
858 * caller thinks they own the device, they could be racing with a
859 * release call path, so we can't trust drvdata for the shortcut.
860 * Go the long way around, from the iommu_group to the vfio_group
861 * to the vfio_device.
862 */
863struct vfio_device *vfio_device_get_from_dev(struct device *dev)
864{
865 struct vfio_group *group;
866 struct vfio_device *device;
867
868 group = vfio_group_get_from_dev(dev);
869 if (!group)
870 return NULL;
871
872 device = vfio_group_get_device(group, dev);
873 vfio_group_put(group);
874
875 return device;
876}
877EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
878
879static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
880 char *buf)
881{
882 struct vfio_device *it, *device = ERR_PTR(-ENODEV);
883
884 mutex_lock(&group->device_lock);
885 list_for_each_entry(it, &group->device_list, group_next) {
886 int ret;
887
888 if (it->ops->match) {
889 ret = it->ops->match(it->device_data, buf);
890 if (ret < 0) {
891 device = ERR_PTR(ret);
892 break;
893 }
894 } else {
895 ret = !strcmp(dev_name(it->dev), buf);
896 }
897
898 if (ret) {
899 device = it;
900 vfio_device_get(device);
901 break;
902 }
903 }
904 mutex_unlock(&group->device_lock);
905
906 return device;
907}
908
909/*
910 * Caller must hold a reference to the vfio_device
911 */
912void *vfio_device_data(struct vfio_device *device)
913{
914 return device->device_data;
915}
916EXPORT_SYMBOL_GPL(vfio_device_data);
917
918/*
919 * Decrement the device reference count and wait for the device to be
920 * removed. Open file descriptors for the device... */
921void *vfio_del_group_dev(struct device *dev)
922{
923 DEFINE_WAIT_FUNC(wait, woken_wake_function);
924 struct vfio_device *device = dev_get_drvdata(dev);
925 struct vfio_group *group = device->group;
926 void *device_data = device->device_data;
927 struct vfio_unbound_dev *unbound;
928 unsigned int i = 0;
929 bool interrupted = false;
930
931 /*
932 * The group exists so long as we have a device reference. Get
933 * a group reference and use it to scan for the device going away.
934 */
935 vfio_group_get(group);
936
937 /*
938 * When the device is removed from the group, the group suddenly
939 * becomes non-viable; the device has a driver (until the unbind
940 * completes), but it's not present in the group. This is bad news
941 * for any external users that need to re-acquire a group reference
942 * in order to match and release their existing reference. To
943 * solve this, we track such devices on the unbound_list to bridge
944 * the gap until they're fully unbound.
945 */
946 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
947 if (unbound) {
948 unbound->dev = dev;
949 mutex_lock(&group->unbound_lock);
950 list_add(&unbound->unbound_next, &group->unbound_list);
951 mutex_unlock(&group->unbound_lock);
952 }
953 WARN_ON(!unbound);
954
955 vfio_device_put(device);
956
957 /*
958 * If the device is still present in the group after the above
959 * 'put', then it is in use and we need to request it from the
960 * bus driver. The driver may in turn need to request the
961 * device from the user. We send the request on an arbitrary
962 * interval with counter to allow the driver to take escalating
963 * measures to release the device if it has the ability to do so.
964 */
965 add_wait_queue(&vfio.release_q, &wait);
966
967 do {
968 device = vfio_group_get_device(group, dev);
969 if (!device)
970 break;
971
972 if (device->ops->request)
973 device->ops->request(device_data, i++);
974
975 vfio_device_put(device);
976
977 if (interrupted) {
978 wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
979 } else {
980 wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
981 if (signal_pending(current)) {
982 interrupted = true;
983 dev_warn(dev,
984 "Device is currently in use, task"
985 " \"%s\" (%d) "
986 "blocked until device is released",
987 current->comm, task_pid_nr(current));
988 }
989 }
990
991 } while (1);
992
993 remove_wait_queue(&vfio.release_q, &wait);
994 /*
995 * In order to support multiple devices per group, devices can be
996 * plucked from the group while other devices in the group are still
997 * in use. The container persists with this group and those remaining
998 * devices still attached. If the user creates an isolation violation
999 * by binding this device to another driver while the group is still in
1000 * use, that's their fault. However, in the case of removing the last,
1001 * or potentially the only, device in the group there can be no other
1002 * in-use devices in the group. The user has done their due diligence
1003 * and we should lay no claims to those devices. In order to do that,
1004 * we need to make sure the group is detached from the container.
1005 * Without this stall, we're potentially racing with a user process
1006 * that may attempt to immediately bind this device to another driver.
1007 */
1008 if (list_empty(&group->device_list))
1009 wait_event(group->container_q, !group->container);
1010
1011 vfio_group_put(group);
1012
1013 return device_data;
1014}
1015EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1016
1017/**
1018 * VFIO base fd, /dev/vfio/vfio
1019 */
1020static long vfio_ioctl_check_extension(struct vfio_container *container,
1021 unsigned long arg)
1022{
1023 struct vfio_iommu_driver *driver;
1024 long ret = 0;
1025
1026 down_read(&container->group_lock);
1027
1028 driver = container->iommu_driver;
1029
1030 switch (arg) {
1031 /* No base extensions yet */
1032 default:
1033 /*
1034 * If no driver is set, poll all registered drivers for
1035 * extensions and return the first positive result. If
1036 * a driver is already set, further queries will be passed
1037 * only to that driver.
1038 */
1039 if (!driver) {
1040 mutex_lock(&vfio.iommu_drivers_lock);
1041 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1042 vfio_next) {
1043
1044#ifdef CONFIG_VFIO_NOIOMMU
1045 if (!list_empty(&container->group_list) &&
1046 (container->noiommu !=
1047 (driver->ops == &vfio_noiommu_ops)))
1048 continue;
1049#endif
1050
1051 if (!try_module_get(driver->ops->owner))
1052 continue;
1053
1054 ret = driver->ops->ioctl(NULL,
1055 VFIO_CHECK_EXTENSION,
1056 arg);
1057 module_put(driver->ops->owner);
1058 if (ret > 0)
1059 break;
1060 }
1061 mutex_unlock(&vfio.iommu_drivers_lock);
1062 } else
1063 ret = driver->ops->ioctl(container->iommu_data,
1064 VFIO_CHECK_EXTENSION, arg);
1065 }
1066
1067 up_read(&container->group_lock);
1068
1069 return ret;
1070}
1071
1072/* hold write lock on container->group_lock */
1073static int __vfio_container_attach_groups(struct vfio_container *container,
1074 struct vfio_iommu_driver *driver,
1075 void *data)
1076{
1077 struct vfio_group *group;
1078 int ret = -ENODEV;
1079
1080 list_for_each_entry(group, &container->group_list, container_next) {
1081 ret = driver->ops->attach_group(data, group->iommu_group);
1082 if (ret)
1083 goto unwind;
1084 }
1085
1086 return ret;
1087
1088unwind:
1089 list_for_each_entry_continue_reverse(group, &container->group_list,
1090 container_next) {
1091 driver->ops->detach_group(data, group->iommu_group);
1092 }
1093
1094 return ret;
1095}
1096
1097static long vfio_ioctl_set_iommu(struct vfio_container *container,
1098 unsigned long arg)
1099{
1100 struct vfio_iommu_driver *driver;
1101 long ret = -ENODEV;
1102
1103 down_write(&container->group_lock);
1104
1105 /*
1106 * The container is designed to be an unprivileged interface while
1107 * the group can be assigned to specific users. Therefore, only by
1108 * adding a group to a container does the user get the privilege of
1109 * enabling the iommu, which may allocate finite resources. There
1110 * is no unset_iommu, but by removing all the groups from a container,
1111 * the container is deprivileged and returns to an unset state.
1112 */
1113 if (list_empty(&container->group_list) || container->iommu_driver) {
1114 up_write(&container->group_lock);
1115 return -EINVAL;
1116 }
1117
1118 mutex_lock(&vfio.iommu_drivers_lock);
1119 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1120 void *data;
1121
1122#ifdef CONFIG_VFIO_NOIOMMU
1123 /*
1124 * Only noiommu containers can use vfio-noiommu and noiommu
1125 * containers can only use vfio-noiommu.
1126 */
1127 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1128 continue;
1129#endif
1130
1131 if (!try_module_get(driver->ops->owner))
1132 continue;
1133
1134 /*
1135 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1136 * so test which iommu driver reported support for this
1137 * extension and call open on them. We also pass them the
1138 * magic, allowing a single driver to support multiple
1139 * interfaces if they'd like.
1140 */
1141 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1142 module_put(driver->ops->owner);
1143 continue;
1144 }
1145
1146 data = driver->ops->open(arg);
1147 if (IS_ERR(data)) {
1148 ret = PTR_ERR(data);
1149 module_put(driver->ops->owner);
1150 continue;
1151 }
1152
1153 ret = __vfio_container_attach_groups(container, driver, data);
1154 if (ret) {
1155 driver->ops->release(data);
1156 module_put(driver->ops->owner);
1157 continue;
1158 }
1159
1160 container->iommu_driver = driver;
1161 container->iommu_data = data;
1162 break;
1163 }
1164
1165 mutex_unlock(&vfio.iommu_drivers_lock);
1166 up_write(&container->group_lock);
1167
1168 return ret;
1169}
1170
1171static long vfio_fops_unl_ioctl(struct file *filep,
1172 unsigned int cmd, unsigned long arg)
1173{
1174 struct vfio_container *container = filep->private_data;
1175 struct vfio_iommu_driver *driver;
1176 void *data;
1177 long ret = -EINVAL;
1178
1179 if (!container)
1180 return ret;
1181
1182 switch (cmd) {
1183 case VFIO_GET_API_VERSION:
1184 ret = VFIO_API_VERSION;
1185 break;
1186 case VFIO_CHECK_EXTENSION:
1187 ret = vfio_ioctl_check_extension(container, arg);
1188 break;
1189 case VFIO_SET_IOMMU:
1190 ret = vfio_ioctl_set_iommu(container, arg);
1191 break;
1192 default:
1193 driver = container->iommu_driver;
1194 data = container->iommu_data;
1195
1196 if (driver) /* passthrough all unrecognized ioctls */
1197 ret = driver->ops->ioctl(data, cmd, arg);
1198 }
1199
1200 return ret;
1201}
1202
1203static int vfio_fops_open(struct inode *inode, struct file *filep)
1204{
1205 struct vfio_container *container;
1206
1207 container = kzalloc(sizeof(*container), GFP_KERNEL);
1208 if (!container)
1209 return -ENOMEM;
1210
1211 INIT_LIST_HEAD(&container->group_list);
1212 init_rwsem(&container->group_lock);
1213 kref_init(&container->kref);
1214
1215 filep->private_data = container;
1216
1217 return 0;
1218}
1219
1220static int vfio_fops_release(struct inode *inode, struct file *filep)
1221{
1222 struct vfio_container *container = filep->private_data;
1223
1224 filep->private_data = NULL;
1225
1226 vfio_container_put(container);
1227
1228 return 0;
1229}
1230
1231/*
1232 * Once an iommu driver is set, we optionally pass read/write/mmap
1233 * on to the driver, allowing management interfaces beyond ioctl.
1234 */
1235static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1236 size_t count, loff_t *ppos)
1237{
1238 struct vfio_container *container = filep->private_data;
1239 struct vfio_iommu_driver *driver;
1240 ssize_t ret = -EINVAL;
1241
1242 driver = container->iommu_driver;
1243 if (likely(driver && driver->ops->read))
1244 ret = driver->ops->read(container->iommu_data,
1245 buf, count, ppos);
1246
1247 return ret;
1248}
1249
1250static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1251 size_t count, loff_t *ppos)
1252{
1253 struct vfio_container *container = filep->private_data;
1254 struct vfio_iommu_driver *driver;
1255 ssize_t ret = -EINVAL;
1256
1257 driver = container->iommu_driver;
1258 if (likely(driver && driver->ops->write))
1259 ret = driver->ops->write(container->iommu_data,
1260 buf, count, ppos);
1261
1262 return ret;
1263}
1264
1265static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1266{
1267 struct vfio_container *container = filep->private_data;
1268 struct vfio_iommu_driver *driver;
1269 int ret = -EINVAL;
1270
1271 driver = container->iommu_driver;
1272 if (likely(driver && driver->ops->mmap))
1273 ret = driver->ops->mmap(container->iommu_data, vma);
1274
1275 return ret;
1276}
1277
1278static const struct file_operations vfio_fops = {
1279 .owner = THIS_MODULE,
1280 .open = vfio_fops_open,
1281 .release = vfio_fops_release,
1282 .read = vfio_fops_read,
1283 .write = vfio_fops_write,
1284 .unlocked_ioctl = vfio_fops_unl_ioctl,
1285 .compat_ioctl = compat_ptr_ioctl,
1286 .mmap = vfio_fops_mmap,
1287};
1288
1289/**
1290 * VFIO Group fd, /dev/vfio/$GROUP
1291 */
1292static void __vfio_group_unset_container(struct vfio_group *group)
1293{
1294 struct vfio_container *container = group->container;
1295 struct vfio_iommu_driver *driver;
1296
1297 down_write(&container->group_lock);
1298
1299 driver = container->iommu_driver;
1300 if (driver)
1301 driver->ops->detach_group(container->iommu_data,
1302 group->iommu_group);
1303
1304 group->container = NULL;
1305 wake_up(&group->container_q);
1306 list_del(&group->container_next);
1307
1308 /* Detaching the last group deprivileges a container, remove iommu */
1309 if (driver && list_empty(&container->group_list)) {
1310 driver->ops->release(container->iommu_data);
1311 module_put(driver->ops->owner);
1312 container->iommu_driver = NULL;
1313 container->iommu_data = NULL;
1314 }
1315
1316 up_write(&container->group_lock);
1317
1318 vfio_container_put(container);
1319}
1320
1321/*
1322 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1323 * if there was no container to unset. Since the ioctl is called on
1324 * the group, we know that still exists, therefore the only valid
1325 * transition here is 1->0.
1326 */
1327static int vfio_group_unset_container(struct vfio_group *group)
1328{
1329 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1330
1331 if (!users)
1332 return -EINVAL;
1333 if (users != 1)
1334 return -EBUSY;
1335
1336 __vfio_group_unset_container(group);
1337
1338 return 0;
1339}
1340
1341/*
1342 * When removing container users, anything that removes the last user
1343 * implicitly removes the group from the container. That is, if the
1344 * group file descriptor is closed, as well as any device file descriptors,
1345 * the group is free.
1346 */
1347static void vfio_group_try_dissolve_container(struct vfio_group *group)
1348{
1349 if (0 == atomic_dec_if_positive(&group->container_users))
1350 __vfio_group_unset_container(group);
1351}
1352
1353static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1354{
1355 struct fd f;
1356 struct vfio_container *container;
1357 struct vfio_iommu_driver *driver;
1358 int ret = 0;
1359
1360 if (atomic_read(&group->container_users))
1361 return -EINVAL;
1362
1363 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1364 return -EPERM;
1365
1366 f = fdget(container_fd);
1367 if (!f.file)
1368 return -EBADF;
1369
1370 /* Sanity check, is this really our fd? */
1371 if (f.file->f_op != &vfio_fops) {
1372 fdput(f);
1373 return -EINVAL;
1374 }
1375
1376 container = f.file->private_data;
1377 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1378
1379 down_write(&container->group_lock);
1380
1381 /* Real groups and fake groups cannot mix */
1382 if (!list_empty(&container->group_list) &&
1383 container->noiommu != group->noiommu) {
1384 ret = -EPERM;
1385 goto unlock_out;
1386 }
1387
1388 driver = container->iommu_driver;
1389 if (driver) {
1390 ret = driver->ops->attach_group(container->iommu_data,
1391 group->iommu_group);
1392 if (ret)
1393 goto unlock_out;
1394 }
1395
1396 group->container = container;
1397 container->noiommu = group->noiommu;
1398 list_add(&group->container_next, &container->group_list);
1399
1400 /* Get a reference on the container and mark a user within the group */
1401 vfio_container_get(container);
1402 atomic_inc(&group->container_users);
1403
1404unlock_out:
1405 up_write(&container->group_lock);
1406 fdput(f);
1407 return ret;
1408}
1409
1410static bool vfio_group_viable(struct vfio_group *group)
1411{
1412 return (iommu_group_for_each_dev(group->iommu_group,
1413 group, vfio_dev_viable) == 0);
1414}
1415
1416static int vfio_group_add_container_user(struct vfio_group *group)
1417{
1418 if (!atomic_inc_not_zero(&group->container_users))
1419 return -EINVAL;
1420
1421 if (group->noiommu) {
1422 atomic_dec(&group->container_users);
1423 return -EPERM;
1424 }
1425 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1426 atomic_dec(&group->container_users);
1427 return -EINVAL;
1428 }
1429
1430 return 0;
1431}
1432
1433static const struct file_operations vfio_device_fops;
1434
1435static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1436{
1437 struct vfio_device *device;
1438 struct file *filep;
1439 int ret;
1440
1441 if (0 == atomic_read(&group->container_users) ||
1442 !group->container->iommu_driver || !vfio_group_viable(group))
1443 return -EINVAL;
1444
1445 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1446 return -EPERM;
1447
1448 device = vfio_device_get_from_name(group, buf);
1449 if (IS_ERR(device))
1450 return PTR_ERR(device);
1451
1452 ret = device->ops->open(device->device_data);
1453 if (ret) {
1454 vfio_device_put(device);
1455 return ret;
1456 }
1457
1458 /*
1459 * We can't use anon_inode_getfd() because we need to modify
1460 * the f_mode flags directly to allow more than just ioctls
1461 */
1462 ret = get_unused_fd_flags(O_CLOEXEC);
1463 if (ret < 0) {
1464 device->ops->release(device->device_data);
1465 vfio_device_put(device);
1466 return ret;
1467 }
1468
1469 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1470 device, O_RDWR);
1471 if (IS_ERR(filep)) {
1472 put_unused_fd(ret);
1473 ret = PTR_ERR(filep);
1474 device->ops->release(device->device_data);
1475 vfio_device_put(device);
1476 return ret;
1477 }
1478
1479 /*
1480 * TODO: add an anon_inode interface to do this.
1481 * Appears to be missing by lack of need rather than
1482 * explicitly prevented. Now there's need.
1483 */
1484 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1485
1486 atomic_inc(&group->container_users);
1487
1488 fd_install(ret, filep);
1489
1490 if (group->noiommu)
1491 dev_warn(device->dev, "vfio-noiommu device opened by user "
1492 "(%s:%d)\n", current->comm, task_pid_nr(current));
1493
1494 return ret;
1495}
1496
1497static long vfio_group_fops_unl_ioctl(struct file *filep,
1498 unsigned int cmd, unsigned long arg)
1499{
1500 struct vfio_group *group = filep->private_data;
1501 long ret = -ENOTTY;
1502
1503 switch (cmd) {
1504 case VFIO_GROUP_GET_STATUS:
1505 {
1506 struct vfio_group_status status;
1507 unsigned long minsz;
1508
1509 minsz = offsetofend(struct vfio_group_status, flags);
1510
1511 if (copy_from_user(&status, (void __user *)arg, minsz))
1512 return -EFAULT;
1513
1514 if (status.argsz < minsz)
1515 return -EINVAL;
1516
1517 status.flags = 0;
1518
1519 if (vfio_group_viable(group))
1520 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1521
1522 if (group->container)
1523 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1524
1525 if (copy_to_user((void __user *)arg, &status, minsz))
1526 return -EFAULT;
1527
1528 ret = 0;
1529 break;
1530 }
1531 case VFIO_GROUP_SET_CONTAINER:
1532 {
1533 int fd;
1534
1535 if (get_user(fd, (int __user *)arg))
1536 return -EFAULT;
1537
1538 if (fd < 0)
1539 return -EINVAL;
1540
1541 ret = vfio_group_set_container(group, fd);
1542 break;
1543 }
1544 case VFIO_GROUP_UNSET_CONTAINER:
1545 ret = vfio_group_unset_container(group);
1546 break;
1547 case VFIO_GROUP_GET_DEVICE_FD:
1548 {
1549 char *buf;
1550
1551 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1552 if (IS_ERR(buf))
1553 return PTR_ERR(buf);
1554
1555 ret = vfio_group_get_device_fd(group, buf);
1556 kfree(buf);
1557 break;
1558 }
1559 }
1560
1561 return ret;
1562}
1563
1564static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1565{
1566 struct vfio_group *group;
1567 int opened;
1568
1569 group = vfio_group_get_from_minor(iminor(inode));
1570 if (!group)
1571 return -ENODEV;
1572
1573 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1574 vfio_group_put(group);
1575 return -EPERM;
1576 }
1577
1578 /* Do we need multiple instances of the group open? Seems not. */
1579 opened = atomic_cmpxchg(&group->opened, 0, 1);
1580 if (opened) {
1581 vfio_group_put(group);
1582 return -EBUSY;
1583 }
1584
1585 /* Is something still in use from a previous open? */
1586 if (group->container) {
1587 atomic_dec(&group->opened);
1588 vfio_group_put(group);
1589 return -EBUSY;
1590 }
1591
1592 /* Warn if previous user didn't cleanup and re-init to drop them */
1593 if (WARN_ON(group->notifier.head))
1594 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1595
1596 filep->private_data = group;
1597
1598 return 0;
1599}
1600
1601static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1602{
1603 struct vfio_group *group = filep->private_data;
1604
1605 filep->private_data = NULL;
1606
1607 vfio_group_try_dissolve_container(group);
1608
1609 atomic_dec(&group->opened);
1610
1611 vfio_group_put(group);
1612
1613 return 0;
1614}
1615
1616static const struct file_operations vfio_group_fops = {
1617 .owner = THIS_MODULE,
1618 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1619 .compat_ioctl = compat_ptr_ioctl,
1620 .open = vfio_group_fops_open,
1621 .release = vfio_group_fops_release,
1622};
1623
1624/**
1625 * VFIO Device fd
1626 */
1627static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1628{
1629 struct vfio_device *device = filep->private_data;
1630
1631 device->ops->release(device->device_data);
1632
1633 vfio_group_try_dissolve_container(device->group);
1634
1635 vfio_device_put(device);
1636
1637 return 0;
1638}
1639
1640static long vfio_device_fops_unl_ioctl(struct file *filep,
1641 unsigned int cmd, unsigned long arg)
1642{
1643 struct vfio_device *device = filep->private_data;
1644
1645 if (unlikely(!device->ops->ioctl))
1646 return -EINVAL;
1647
1648 return device->ops->ioctl(device->device_data, cmd, arg);
1649}
1650
1651static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1652 size_t count, loff_t *ppos)
1653{
1654 struct vfio_device *device = filep->private_data;
1655
1656 if (unlikely(!device->ops->read))
1657 return -EINVAL;
1658
1659 return device->ops->read(device->device_data, buf, count, ppos);
1660}
1661
1662static ssize_t vfio_device_fops_write(struct file *filep,
1663 const char __user *buf,
1664 size_t count, loff_t *ppos)
1665{
1666 struct vfio_device *device = filep->private_data;
1667
1668 if (unlikely(!device->ops->write))
1669 return -EINVAL;
1670
1671 return device->ops->write(device->device_data, buf, count, ppos);
1672}
1673
1674static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1675{
1676 struct vfio_device *device = filep->private_data;
1677
1678 if (unlikely(!device->ops->mmap))
1679 return -EINVAL;
1680
1681 return device->ops->mmap(device->device_data, vma);
1682}
1683
1684static const struct file_operations vfio_device_fops = {
1685 .owner = THIS_MODULE,
1686 .release = vfio_device_fops_release,
1687 .read = vfio_device_fops_read,
1688 .write = vfio_device_fops_write,
1689 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1690 .compat_ioctl = compat_ptr_ioctl,
1691 .mmap = vfio_device_fops_mmap,
1692};
1693
1694/**
1695 * External user API, exported by symbols to be linked dynamically.
1696 *
1697 * The protocol includes:
1698 * 1. do normal VFIO init operation:
1699 * - opening a new container;
1700 * - attaching group(s) to it;
1701 * - setting an IOMMU driver for a container.
1702 * When IOMMU is set for a container, all groups in it are
1703 * considered ready to use by an external user.
1704 *
1705 * 2. User space passes a group fd to an external user.
1706 * The external user calls vfio_group_get_external_user()
1707 * to verify that:
1708 * - the group is initialized;
1709 * - IOMMU is set for it.
1710 * If both checks passed, vfio_group_get_external_user()
1711 * increments the container user counter to prevent
1712 * the VFIO group from disposal before KVM exits.
1713 *
1714 * 3. The external user calls vfio_external_user_iommu_id()
1715 * to know an IOMMU ID.
1716 *
1717 * 4. When the external KVM finishes, it calls
1718 * vfio_group_put_external_user() to release the VFIO group.
1719 * This call decrements the container user counter.
1720 */
1721struct vfio_group *vfio_group_get_external_user(struct file *filep)
1722{
1723 struct vfio_group *group = filep->private_data;
1724 int ret;
1725
1726 if (filep->f_op != &vfio_group_fops)
1727 return ERR_PTR(-EINVAL);
1728
1729 ret = vfio_group_add_container_user(group);
1730 if (ret)
1731 return ERR_PTR(ret);
1732
1733 vfio_group_get(group);
1734
1735 return group;
1736}
1737EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1738
1739/**
1740 * External user API, exported by symbols to be linked dynamically.
1741 * The external user passes in a device pointer
1742 * to verify that:
1743 * - A VFIO group is assiciated with the device;
1744 * - IOMMU is set for the group.
1745 * If both checks passed, vfio_group_get_external_user_from_dev()
1746 * increments the container user counter to prevent the VFIO group
1747 * from disposal before external user exits and returns the pointer
1748 * to the VFIO group.
1749 *
1750 * When the external user finishes using the VFIO group, it calls
1751 * vfio_group_put_external_user() to release the VFIO group and
1752 * decrement the container user counter.
1753 *
1754 * @dev [in] : device
1755 * Return error PTR or pointer to VFIO group.
1756 */
1757
1758struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1759{
1760 struct vfio_group *group;
1761 int ret;
1762
1763 group = vfio_group_get_from_dev(dev);
1764 if (!group)
1765 return ERR_PTR(-ENODEV);
1766
1767 ret = vfio_group_add_container_user(group);
1768 if (ret) {
1769 vfio_group_put(group);
1770 return ERR_PTR(ret);
1771 }
1772
1773 return group;
1774}
1775EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1776
/*
 * Release a group obtained through vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev(): try to dissolve the group's
 * container, then drop the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1783
1784bool vfio_external_group_match_file(struct vfio_group *test_group,
1785 struct file *filep)
1786{
1787 struct vfio_group *group = filep->private_data;
1788
1789 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1790}
1791EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1792
1793int vfio_external_user_iommu_id(struct vfio_group *group)
1794{
1795 return iommu_group_id(group->iommu_group);
1796}
1797EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1798
1799long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1800{
1801 return vfio_ioctl_check_extension(group->container, arg);
1802}
1803EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1804
1805/**
1806 * Sub-module support
1807 */
1808/*
1809 * Helper for managing a buffer of info chain capabilities, allocate or
1810 * reallocate a buffer with additional @size, filling in @id and @version
1811 * of the capability. A pointer to the new capability is returned.
1812 *
1813 * NB. The chain is based at the head of the buffer, so new entries are
1814 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1815 * next offsets prior to copying to the user buffer.
1816 */
1817struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1818 size_t size, u16 id, u16 version)
1819{
1820 void *buf;
1821 struct vfio_info_cap_header *header, *tmp;
1822
1823 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1824 if (!buf) {
1825 kfree(caps->buf);
1826 caps->size = 0;
1827 return ERR_PTR(-ENOMEM);
1828 }
1829
1830 caps->buf = buf;
1831 header = buf + caps->size;
1832
1833 /* Eventually copied to user buffer, zero */
1834 memset(header, 0, size);
1835
1836 header->id = id;
1837 header->version = version;
1838
1839 /* Add to the end of the capability chain */
1840 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1841 ; /* nothing */
1842
1843 tmp->next = caps->size;
1844 caps->size += size;
1845
1846 return header;
1847}
1848EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1849
1850void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1851{
1852 struct vfio_info_cap_header *tmp;
1853 void *buf = (void *)caps->buf;
1854
1855 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1856 tmp->next += offset;
1857}
1858EXPORT_SYMBOL(vfio_info_cap_shift);
1859
1860int vfio_info_add_capability(struct vfio_info_cap *caps,
1861 struct vfio_info_cap_header *cap, size_t size)
1862{
1863 struct vfio_info_cap_header *header;
1864
1865 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1866 if (IS_ERR(header))
1867 return PTR_ERR(header);
1868
1869 memcpy(header + 1, cap + 1, size - sizeof(*header));
1870
1871 return 0;
1872}
1873EXPORT_SYMBOL(vfio_info_add_capability);
1874
1875int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1876 int max_irq_type, size_t *data_size)
1877{
1878 unsigned long minsz;
1879 size_t size;
1880
1881 minsz = offsetofend(struct vfio_irq_set, count);
1882
1883 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1884 (hdr->count >= (U32_MAX - hdr->start)) ||
1885 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1886 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1887 return -EINVAL;
1888
1889 if (data_size)
1890 *data_size = 0;
1891
1892 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1893 return -EINVAL;
1894
1895 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1896 case VFIO_IRQ_SET_DATA_NONE:
1897 size = 0;
1898 break;
1899 case VFIO_IRQ_SET_DATA_BOOL:
1900 size = sizeof(uint8_t);
1901 break;
1902 case VFIO_IRQ_SET_DATA_EVENTFD:
1903 size = sizeof(int32_t);
1904 break;
1905 default:
1906 return -EINVAL;
1907 }
1908
1909 if (size) {
1910 if (hdr->argsz - minsz < hdr->count * size)
1911 return -EINVAL;
1912
1913 if (!data_size)
1914 return -EINVAL;
1915
1916 *data_size = hdr->count * size;
1917 }
1918
1919 return 0;
1920}
1921EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1922
1923/*
1924 * Pin a set of guest PFNs and return their associated host PFNs for local
1925 * domain only.
1926 * @dev [in] : device
1927 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1928 * @npage [in] : count of elements in user_pfn array. This count should not
1929 * be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1930 * @prot [in] : protection flags
1931 * @phys_pfn[out]: array of host PFNs
1932 * Return error or number of pages pinned.
1933 */
1934int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1935 int prot, unsigned long *phys_pfn)
1936{
1937 struct vfio_container *container;
1938 struct vfio_group *group;
1939 struct vfio_iommu_driver *driver;
1940 int ret;
1941
1942 if (!dev || !user_pfn || !phys_pfn || !npage)
1943 return -EINVAL;
1944
1945 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1946 return -E2BIG;
1947
1948 group = vfio_group_get_from_dev(dev);
1949 if (!group)
1950 return -ENODEV;
1951
1952 if (group->dev_counter > 1)
1953 return -EINVAL;
1954
1955 ret = vfio_group_add_container_user(group);
1956 if (ret)
1957 goto err_pin_pages;
1958
1959 container = group->container;
1960 driver = container->iommu_driver;
1961 if (likely(driver && driver->ops->pin_pages))
1962 ret = driver->ops->pin_pages(container->iommu_data,
1963 group->iommu_group, user_pfn,
1964 npage, prot, phys_pfn);
1965 else
1966 ret = -ENOTTY;
1967
1968 vfio_group_try_dissolve_container(group);
1969
1970err_pin_pages:
1971 vfio_group_put(group);
1972 return ret;
1973}
1974EXPORT_SYMBOL(vfio_pin_pages);
1975
1976/*
1977 * Unpin set of host PFNs for local domain only.
1978 * @dev [in] : device
1979 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1980 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1981 * @npage [in] : count of elements in user_pfn array. This count should not
1982 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1983 * Return error or number of pages unpinned.
1984 */
1985int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1986{
1987 struct vfio_container *container;
1988 struct vfio_group *group;
1989 struct vfio_iommu_driver *driver;
1990 int ret;
1991
1992 if (!dev || !user_pfn || !npage)
1993 return -EINVAL;
1994
1995 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1996 return -E2BIG;
1997
1998 group = vfio_group_get_from_dev(dev);
1999 if (!group)
2000 return -ENODEV;
2001
2002 ret = vfio_group_add_container_user(group);
2003 if (ret)
2004 goto err_unpin_pages;
2005
2006 container = group->container;
2007 driver = container->iommu_driver;
2008 if (likely(driver && driver->ops->unpin_pages))
2009 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2010 npage);
2011 else
2012 ret = -ENOTTY;
2013
2014 vfio_group_try_dissolve_container(group);
2015
2016err_unpin_pages:
2017 vfio_group_put(group);
2018 return ret;
2019}
2020EXPORT_SYMBOL(vfio_unpin_pages);
2021
2022/*
2023 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2024 * VFIO group.
2025 *
2026 * The caller needs to call vfio_group_get_external_user() or
2027 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2028 * so as to prevent the VFIO group from disposal in the middle of the call.
2029 * But it can keep the reference to the VFIO group for several calls into
2030 * this interface.
2031 * After finishing using of the VFIO group, the caller needs to release the
2032 * VFIO group by calling vfio_group_put_external_user().
2033 *
2034 * @group [in] : VFIO group
2035 * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned.
2036 * @npage [in] : count of elements in user_iova_pfn array.
2037 * This count should not be greater
2038 * VFIO_PIN_PAGES_MAX_ENTRIES.
2039 * @prot [in] : protection flags
2040 * @phys_pfn [out] : array of host PFNs
2041 * Return error or number of pages pinned.
2042 */
2043int vfio_group_pin_pages(struct vfio_group *group,
2044 unsigned long *user_iova_pfn, int npage,
2045 int prot, unsigned long *phys_pfn)
2046{
2047 struct vfio_container *container;
2048 struct vfio_iommu_driver *driver;
2049 int ret;
2050
2051 if (!group || !user_iova_pfn || !phys_pfn || !npage)
2052 return -EINVAL;
2053
2054 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2055 return -E2BIG;
2056
2057 container = group->container;
2058 driver = container->iommu_driver;
2059 if (likely(driver && driver->ops->pin_pages))
2060 ret = driver->ops->pin_pages(container->iommu_data,
2061 group->iommu_group, user_iova_pfn,
2062 npage, prot, phys_pfn);
2063 else
2064 ret = -ENOTTY;
2065
2066 return ret;
2067}
2068EXPORT_SYMBOL(vfio_group_pin_pages);
2069
2070/*
2071 * Unpin a set of guest IOVA PFNs for a VFIO group.
2072 *
2073 * The caller needs to call vfio_group_get_external_user() or
2074 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2075 * so as to prevent the VFIO group from disposal in the middle of the call.
2076 * But it can keep the reference to the VFIO group for several calls into
2077 * this interface.
2078 * After finishing using of the VFIO group, the caller needs to release the
2079 * VFIO group by calling vfio_group_put_external_user().
2080 *
2081 * @group [in] : vfio group
2082 * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned.
2083 * @npage [in] : count of elements in user_iova_pfn array.
2084 * This count should not be greater than
2085 * VFIO_PIN_PAGES_MAX_ENTRIES.
2086 * Return error or number of pages unpinned.
2087 */
2088int vfio_group_unpin_pages(struct vfio_group *group,
2089 unsigned long *user_iova_pfn, int npage)
2090{
2091 struct vfio_container *container;
2092 struct vfio_iommu_driver *driver;
2093 int ret;
2094
2095 if (!group || !user_iova_pfn || !npage)
2096 return -EINVAL;
2097
2098 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2099 return -E2BIG;
2100
2101 container = group->container;
2102 driver = container->iommu_driver;
2103 if (likely(driver && driver->ops->unpin_pages))
2104 ret = driver->ops->unpin_pages(container->iommu_data,
2105 user_iova_pfn, npage);
2106 else
2107 ret = -ENOTTY;
2108
2109 return ret;
2110}
2111EXPORT_SYMBOL(vfio_group_unpin_pages);
2112
2113
2114/*
2115 * This interface allows the CPUs to perform some sort of virtual DMA on
2116 * behalf of the device.
2117 *
2118 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2119 * into/from a kernel buffer.
2120 *
2121 * As the read/write of user space memory is conducted via the CPUs and is
2122 * not a real device DMA, it is not necessary to pin the user space memory.
2123 *
2124 * The caller needs to call vfio_group_get_external_user() or
2125 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2126 * so as to prevent the VFIO group from disposal in the middle of the call.
2127 * But it can keep the reference to the VFIO group for several calls into
2128 * this interface.
2129 * After finishing using of the VFIO group, the caller needs to release the
2130 * VFIO group by calling vfio_group_put_external_user().
2131 *
2132 * @group [in] : VFIO group
2133 * @user_iova [in] : base IOVA of a user space buffer
2134 * @data [in] : pointer to kernel buffer
2135 * @len [in] : kernel buffer length
2136 * @write : indicate read or write
2137 * Return error code on failure or 0 on success.
2138 */
2139int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2140 void *data, size_t len, bool write)
2141{
2142 struct vfio_container *container;
2143 struct vfio_iommu_driver *driver;
2144 int ret = 0;
2145
2146 if (!group || !data || len <= 0)
2147 return -EINVAL;
2148
2149 container = group->container;
2150 driver = container->iommu_driver;
2151
2152 if (likely(driver && driver->ops->dma_rw))
2153 ret = driver->ops->dma_rw(container->iommu_data,
2154 user_iova, data, len, write);
2155 else
2156 ret = -ENOTTY;
2157
2158 return ret;
2159}
2160EXPORT_SYMBOL(vfio_dma_rw);
2161
2162static int vfio_register_iommu_notifier(struct vfio_group *group,
2163 unsigned long *events,
2164 struct notifier_block *nb)
2165{
2166 struct vfio_container *container;
2167 struct vfio_iommu_driver *driver;
2168 int ret;
2169
2170 ret = vfio_group_add_container_user(group);
2171 if (ret)
2172 return -EINVAL;
2173
2174 container = group->container;
2175 driver = container->iommu_driver;
2176 if (likely(driver && driver->ops->register_notifier))
2177 ret = driver->ops->register_notifier(container->iommu_data,
2178 events, nb);
2179 else
2180 ret = -ENOTTY;
2181
2182 vfio_group_try_dissolve_container(group);
2183
2184 return ret;
2185}
2186
2187static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2188 struct notifier_block *nb)
2189{
2190 struct vfio_container *container;
2191 struct vfio_iommu_driver *driver;
2192 int ret;
2193
2194 ret = vfio_group_add_container_user(group);
2195 if (ret)
2196 return -EINVAL;
2197
2198 container = group->container;
2199 driver = container->iommu_driver;
2200 if (likely(driver && driver->ops->unregister_notifier))
2201 ret = driver->ops->unregister_notifier(container->iommu_data,
2202 nb);
2203 else
2204 ret = -ENOTTY;
2205
2206 vfio_group_try_dissolve_container(group);
2207
2208 return ret;
2209}
2210
2211void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2212{
2213 group->kvm = kvm;
2214 blocking_notifier_call_chain(&group->notifier,
2215 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2216}
2217EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2218
2219static int vfio_register_group_notifier(struct vfio_group *group,
2220 unsigned long *events,
2221 struct notifier_block *nb)
2222{
2223 int ret;
2224 bool set_kvm = false;
2225
2226 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2227 set_kvm = true;
2228
2229 /* clear known events */
2230 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2231
2232 /* refuse to continue if still events remaining */
2233 if (*events)
2234 return -EINVAL;
2235
2236 ret = vfio_group_add_container_user(group);
2237 if (ret)
2238 return -EINVAL;
2239
2240 ret = blocking_notifier_chain_register(&group->notifier, nb);
2241
2242 /*
2243 * The attaching of kvm and vfio_group might already happen, so
2244 * here we replay once upon registration.
2245 */
2246 if (!ret && set_kvm && group->kvm)
2247 blocking_notifier_call_chain(&group->notifier,
2248 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2249
2250 vfio_group_try_dissolve_container(group);
2251
2252 return ret;
2253}
2254
2255static int vfio_unregister_group_notifier(struct vfio_group *group,
2256 struct notifier_block *nb)
2257{
2258 int ret;
2259
2260 ret = vfio_group_add_container_user(group);
2261 if (ret)
2262 return -EINVAL;
2263
2264 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2265
2266 vfio_group_try_dissolve_container(group);
2267
2268 return ret;
2269}
2270
2271int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2272 unsigned long *events, struct notifier_block *nb)
2273{
2274 struct vfio_group *group;
2275 int ret;
2276
2277 if (!dev || !nb || !events || (*events == 0))
2278 return -EINVAL;
2279
2280 group = vfio_group_get_from_dev(dev);
2281 if (!group)
2282 return -ENODEV;
2283
2284 switch (type) {
2285 case VFIO_IOMMU_NOTIFY:
2286 ret = vfio_register_iommu_notifier(group, events, nb);
2287 break;
2288 case VFIO_GROUP_NOTIFY:
2289 ret = vfio_register_group_notifier(group, events, nb);
2290 break;
2291 default:
2292 ret = -EINVAL;
2293 }
2294
2295 vfio_group_put(group);
2296 return ret;
2297}
2298EXPORT_SYMBOL(vfio_register_notifier);
2299
2300int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2301 struct notifier_block *nb)
2302{
2303 struct vfio_group *group;
2304 int ret;
2305
2306 if (!dev || !nb)
2307 return -EINVAL;
2308
2309 group = vfio_group_get_from_dev(dev);
2310 if (!group)
2311 return -ENODEV;
2312
2313 switch (type) {
2314 case VFIO_IOMMU_NOTIFY:
2315 ret = vfio_unregister_iommu_notifier(group, nb);
2316 break;
2317 case VFIO_GROUP_NOTIFY:
2318 ret = vfio_unregister_group_notifier(group, nb);
2319 break;
2320 default:
2321 ret = -EINVAL;
2322 }
2323
2324 vfio_group_put(group);
2325 return ret;
2326}
2327EXPORT_SYMBOL(vfio_unregister_notifier);
2328
2329/**
2330 * Module/class support
2331 */
2332static char *vfio_devnode(struct device *dev, umode_t *mode)
2333{
2334 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2335}
2336
2337static struct miscdevice vfio_dev = {
2338 .minor = VFIO_MINOR,
2339 .name = "vfio",
2340 .fops = &vfio_fops,
2341 .nodename = "vfio/vfio",
2342 .mode = S_IRUGO | S_IWUGO,
2343};
2344
2345static int __init vfio_init(void)
2346{
2347 int ret;
2348
2349 idr_init(&vfio.group_idr);
2350 mutex_init(&vfio.group_lock);
2351 mutex_init(&vfio.iommu_drivers_lock);
2352 INIT_LIST_HEAD(&vfio.group_list);
2353 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2354 init_waitqueue_head(&vfio.release_q);
2355
2356 ret = misc_register(&vfio_dev);
2357 if (ret) {
2358 pr_err("vfio: misc device register failed\n");
2359 return ret;
2360 }
2361
2362 /* /dev/vfio/$GROUP */
2363 vfio.class = class_create(THIS_MODULE, "vfio");
2364 if (IS_ERR(vfio.class)) {
2365 ret = PTR_ERR(vfio.class);
2366 goto err_class;
2367 }
2368
2369 vfio.class->devnode = vfio_devnode;
2370
2371 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2372 if (ret)
2373 goto err_alloc_chrdev;
2374
2375 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2376 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2377 if (ret)
2378 goto err_cdev_add;
2379
2380 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2381
2382#ifdef CONFIG_VFIO_NOIOMMU
2383 vfio_register_iommu_driver(&vfio_noiommu_ops);
2384#endif
2385 return 0;
2386
2387err_cdev_add:
2388 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2389err_alloc_chrdev:
2390 class_destroy(vfio.class);
2391 vfio.class = NULL;
2392err_class:
2393 misc_deregister(&vfio_dev);
2394 return ret;
2395}
2396
2397static void __exit vfio_cleanup(void)
2398{
2399 WARN_ON(!list_empty(&vfio.group_list));
2400
2401#ifdef CONFIG_VFIO_NOIOMMU
2402 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2403#endif
2404 idr_destroy(&vfio.group_idr);
2405 cdev_del(&vfio.group_cdev);
2406 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2407 class_destroy(vfio.class);
2408 vfio.class = NULL;
2409 misc_deregister(&vfio_dev);
2410}
2411
2412module_init(vfio_init);
2413module_exit(vfio_cleanup);
2414
2415MODULE_VERSION(DRIVER_VERSION);
2416MODULE_LICENSE("GPL v2");
2417MODULE_AUTHOR(DRIVER_AUTHOR);
2418MODULE_DESCRIPTION(DRIVER_DESC);
2419MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2420MODULE_ALIAS("devname:vfio/vfio");
2421MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");