Linux Audio

Check our new training course

Loading...
Note: File does not exist in v4.6.
   1/*
   2 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
   3 *
   4 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a
   7 * copy of this software and associated documentation files (the "Software"),
   8 * to deal in the Software without restriction, including without limitation
   9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10 * and/or sell copies of the Software, and to permit persons to whom the
  11 * Software is furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice (including the next
  14 * paragraph) shall be included in all copies or substantial portions of the
  15 * Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 * SOFTWARE.
  24 *
  25 * Authors:
  26 *    Kevin Tian <kevin.tian@intel.com>
  27 *    Jike Song <jike.song@intel.com>
  28 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
  29 */
  30
  31#include <linux/init.h>
  32#include <linux/device.h>
  33#include <linux/mm.h>
  34#include <linux/mmu_context.h>
  35#include <linux/types.h>
  36#include <linux/list.h>
  37#include <linux/rbtree.h>
  38#include <linux/spinlock.h>
  39#include <linux/eventfd.h>
  40#include <linux/uuid.h>
  41#include <linux/kvm_host.h>
  42#include <linux/vfio.h>
  43#include <linux/mdev.h>
  44#include <linux/debugfs.h>
  45
  46#include "i915_drv.h"
  47#include "gvt.h"
  48
  49static const struct intel_gvt_ops *intel_gvt_ops;
  50
  51/* helper macros copied from vfio-pci */
  52#define VFIO_PCI_OFFSET_SHIFT   40
  53#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
  54#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
  55#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
  56
  57#define OPREGION_SIGNATURE "IntelGraphicsMem"
  58
/* Forward declaration so the regops callbacks can reference it. */
struct vfio_region;

/* Callbacks backing one device-specific VFIO region of a vGPU. */
struct intel_vgpu_regops {
	/* Read/write handler; returns bytes handled (or an errno cast to
	 * size_t — see NOTE on intel_vgpu_reg_rw_opregion). */
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	/* Teardown hook invoked when the region is released. */
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

/* One device-specific VFIO region exposed by a vGPU. */
struct vfio_region {
	u32				type;	/* VFIO_REGION_TYPE_* */
	u32				subtype; /* VFIO_REGION_SUBTYPE_* */
	size_t				size;	/* region length in bytes */
	u32				flags;	/* VFIO_REGION_INFO_FLAG_* */
	const struct intel_vgpu_regops	*ops;
	void				*data;	/* ops-private payload */
};

/* Hash-table node marking one write-protected guest frame number. */
struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

/* Per-guest state tying a vGPU to its KVM instance. */
struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	/* Hook into KVM page tracking for shadow-page write protection. */
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	/* Buckets of the write-protected gfn table. */
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

/* Cached gfn <-> DMA-address mapping, linked into both per-vGPU rb-trees. */
struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;	/* keyed by gfn */
	struct rb_node dma_addr_node;	/* keyed by dma_addr */
	gfn_t gfn;
	dma_addr_t dma_addr;
	struct kref ref;	/* lifetime refcount — users not visible in this chunk; confirm callers */
};
  99
 100static inline bool handle_valid(unsigned long handle)
 101{
 102	return !!(handle & ~0xff);
 103}
 104
 105static int kvmgt_guest_init(struct mdev_device *mdev);
 106static void intel_vgpu_release_work(struct work_struct *work);
 107static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
 108
/*
 * Pin a guest page and set up a DMA mapping for it.
 *
 * The guest frame @gfn is first pinned through VFIO (keeping the backing
 * page resident), then mapped for bidirectional DMA on the i915 device.
 * On success the DMA address is returned through @dma_addr.
 *
 * Returns 0 on success, -EINVAL if pinning fails, or -ENOMEM if the DMA
 * mapping fails (the pin is undone in that case).
 */
static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	struct page *page;
	unsigned long pfn;
	int ret;

	/* Pin the page first. */
	ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &pfn);
	if (ret != 1) {
		gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx: %d\n",
			     gfn, ret);
		return -EINVAL;
	}

	/* Setup DMA mapping. */
	page = pfn_to_page(pfn);
	*dma_addr = dma_map_page(dev, page, 0, PAGE_SIZE,
				 PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for gfn 0x%lx\n", gfn);
		/* Unwind the pin taken above. */
		vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
		return -ENOMEM;
	}

	return 0;
}
 138
/*
 * Undo gvt_dma_map_page(): tear down the DMA mapping and unpin the guest
 * page.  The unpin result is only sanity-checked via WARN_ON — there is
 * no sensible recovery on this path.
 */
static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	int ret;

	dma_unmap_page(dev, dma_addr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
	ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
	WARN_ON(ret != 1);
}
 149
 150static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
 151		dma_addr_t dma_addr)
 152{
 153	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
 154	struct gvt_dma *itr;
 155
 156	while (node) {
 157		itr = rb_entry(node, struct gvt_dma, dma_addr_node);
 158
 159		if (dma_addr < itr->dma_addr)
 160			node = node->rb_left;
 161		else if (dma_addr > itr->dma_addr)
 162			node = node->rb_right;
 163		else
 164			return itr;
 165	}
 166	return NULL;
 167}
 168
 169static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
 170{
 171	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
 172	struct gvt_dma *itr;
 173
 174	while (node) {
 175		itr = rb_entry(node, struct gvt_dma, gfn_node);
 176
 177		if (gfn < itr->gfn)
 178			node = node->rb_left;
 179		else if (gfn > itr->gfn)
 180			node = node->rb_right;
 181		else
 182			return itr;
 183	}
 184	return NULL;
 185}
 186
 187static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
 188		dma_addr_t dma_addr)
 189{
 190	struct gvt_dma *new, *itr;
 191	struct rb_node **link, *parent = NULL;
 192
 193	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
 194	if (!new)
 195		return -ENOMEM;
 196
 197	new->vgpu = vgpu;
 198	new->gfn = gfn;
 199	new->dma_addr = dma_addr;
 200	kref_init(&new->ref);
 201
 202	/* gfn_cache maps gfn to struct gvt_dma. */
 203	link = &vgpu->vdev.gfn_cache.rb_node;
 204	while (*link) {
 205		parent = *link;
 206		itr = rb_entry(parent, struct gvt_dma, gfn_node);
 207
 208		if (gfn < itr->gfn)
 209			link = &parent->rb_left;
 210		else
 211			link = &parent->rb_right;
 212	}
 213	rb_link_node(&new->gfn_node, parent, link);
 214	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);
 215
 216	/* dma_addr_cache maps dma addr to struct gvt_dma. */
 217	parent = NULL;
 218	link = &vgpu->vdev.dma_addr_cache.rb_node;
 219	while (*link) {
 220		parent = *link;
 221		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
 222
 223		if (dma_addr < itr->dma_addr)
 224			link = &parent->rb_left;
 225		else
 226			link = &parent->rb_right;
 227	}
 228	rb_link_node(&new->dma_addr_node, parent, link);
 229	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);
 230
 231	vgpu->vdev.nr_cache_entries++;
 232	return 0;
 233}
 234
 235static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
 236				struct gvt_dma *entry)
 237{
 238	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
 239	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
 240	kfree(entry);
 241	vgpu->vdev.nr_cache_entries--;
 242}
 243
/*
 * Tear down every cached gfn <-> DMA mapping of @vgpu, unmapping and
 * unpinning the backing pages as it goes.
 *
 * NOTE(review): cache_lock is re-acquired for each entry instead of being
 * held across the whole walk — presumably to avoid holding it over
 * gvt_dma_unmap_page(); confirm intent before "simplifying" this loop.
 */
static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->vdev.cache_lock);
		/* Always take the current first entry: the tree shrinks by
		 * one per iteration until it is empty. */
		node = rb_first(&vgpu->vdev.gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->vdev.cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->vdev.cache_lock);
	}
}
 262
 263static void gvt_cache_init(struct intel_vgpu *vgpu)
 264{
 265	vgpu->vdev.gfn_cache = RB_ROOT;
 266	vgpu->vdev.dma_addr_cache = RB_ROOT;
 267	vgpu->vdev.nr_cache_entries = 0;
 268	mutex_init(&vgpu->vdev.cache_lock);
 269}
 270
/* Prepare the (initially empty) write-protected gfn hash table. */
static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}
 275
/* Free every entry of the write-protect table (guest teardown path). */
static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	/* _safe variant: entries are deleted while iterating. */
	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}
 287
 288static struct kvmgt_pgfn *
 289__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
 290{
 291	struct kvmgt_pgfn *p, *res = NULL;
 292
 293	hash_for_each_possible(info->ptable, p, hnode, gfn) {
 294		if (gfn == p->gfn) {
 295			res = p;
 296			break;
 297		}
 298	}
 299
 300	return res;
 301}
 302
 303static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
 304				gfn_t gfn)
 305{
 306	struct kvmgt_pgfn *p;
 307
 308	p = __kvmgt_protect_table_find(info, gfn);
 309	return !!p;
 310}
 311
 312static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
 313{
 314	struct kvmgt_pgfn *p;
 315
 316	if (kvmgt_gfn_is_write_protected(info, gfn))
 317		return;
 318
 319	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
 320	if (WARN(!p, "gfn: 0x%llx\n", gfn))
 321		return;
 322
 323	p->gfn = gfn;
 324	hash_add(info->ptable, &p->hnode, gfn);
 325}
 326
 327static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
 328				gfn_t gfn)
 329{
 330	struct kvmgt_pgfn *p;
 331
 332	p = __kvmgt_protect_table_find(info, gfn);
 333	if (p) {
 334		hash_del(&p->hnode);
 335		kfree(p);
 336	}
 337}
 338
/*
 * rw handler for the OpRegion VFIO region: read-only access to the vGPU's
 * OpRegion copy.  Writes and out-of-range offsets are rejected.
 *
 * NOTE(review): errors are returned through a size_t return type, so a
 * caller sees -EINVAL as a huge positive byte count — the regops rw
 * signature should likely be ssize_t; confirm against the VFIO callers.
 */
static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	/* Device-specific region index, relative to the region array. */
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->vdev.region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	/* Clamp to the bytes remaining in the region. */
	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}
 356
/* Nothing to tear down here: the OpRegion mapping is owned by the vGPU. */
static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

/* Region callbacks for the read-only OpRegion VFIO region. */
static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};
 366
 367static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
 368		unsigned int type, unsigned int subtype,
 369		const struct intel_vgpu_regops *ops,
 370		size_t size, u32 flags, void *data)
 371{
 372	struct vfio_region *region;
 373
 374	region = krealloc(vgpu->vdev.region,
 375			(vgpu->vdev.num_regions + 1) * sizeof(*region),
 376			GFP_KERNEL);
 377	if (!region)
 378		return -ENOMEM;
 379
 380	vgpu->vdev.region = region;
 381	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
 382	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
 383	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
 384	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
 385	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
 386	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
 387	vgpu->vdev.num_regions++;
 388	return 0;
 389}
 390
 391static int kvmgt_get_vfio_device(void *p_vgpu)
 392{
 393	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
 394
 395	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
 396		mdev_dev(vgpu->vdev.mdev));
 397	if (!vgpu->vdev.vfio_device) {
 398		gvt_vgpu_err("failed to get vfio device\n");
 399		return -ENODEV;
 400	}
 401	return 0;
 402}
 403
 404
 405static int kvmgt_set_opregion(void *p_vgpu)
 406{
 407	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
 408	void *base;
 409	int ret;
 410
 411	/* Each vgpu has its own opregion, although VFIO would create another
 412	 * one later. This one is used to expose opregion to VFIO. And the
 413	 * other one created by VFIO later, is used by guest actually.
 414	 */
 415	base = vgpu_opregion(vgpu)->va;
 416	if (!base)
 417		return -ENOMEM;
 418
 419	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
 420		memunmap(base);
 421		return -EINVAL;
 422	}
 423
 424	ret = intel_vgpu_register_reg(vgpu,
 425			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
 426			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
 427			&intel_vgpu_regops_opregion, OPREGION_SIZE,
 428			VFIO_REGION_INFO_FLAG_READ, base);
 429
 430	return ret;
 431}
 432
 433static void kvmgt_put_vfio_device(void *vgpu)
 434{
 435	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
 436		return;
 437
 438	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
 439}
 440
 441static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
 442{
 443	struct intel_vgpu *vgpu = NULL;
 444	struct intel_vgpu_type *type;
 445	struct device *pdev;
 446	void *gvt;
 447	int ret;
 448
 449	pdev = mdev_parent_dev(mdev);
 450	gvt = kdev_to_i915(pdev)->gvt;
 451
 452	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
 453	if (!type) {
 454		gvt_vgpu_err("failed to find type %s to create\n",
 455						kobject_name(kobj));
 456		ret = -EINVAL;
 457		goto out;
 458	}
 459
 460	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
 461	if (IS_ERR_OR_NULL(vgpu)) {
 462		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
 463		gvt_err("failed to create intel vgpu: %d\n", ret);
 464		goto out;
 465	}
 466
 467	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);
 468
 469	vgpu->vdev.mdev = mdev;
 470	mdev_set_drvdata(mdev, vgpu);
 471
 472	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
 473		     dev_name(mdev_dev(mdev)));
 474	ret = 0;
 475
 476out:
 477	return ret;
 478}
 479
/*
 * mdev 'remove' callback.  A valid handle means a guest is still attached
 * (the device is open), in which case removal is refused with -EBUSY.
 */
static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}
 490
/*
 * IOMMU notifier: when part of the guest's IOVA space is unmapped, drop
 * any cached gfn <-> DMA mappings falling inside that range so no pinned
 * pages or stale DMA addresses are kept around.
 */
static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.iommu_notifier);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vgpu->vdev.cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			/* NOTE(review): the cache is keyed by gfn but probed
			 * with an IOVA pfn — this assumes iova == gpa for
			 * this IOMMU setup; confirm. */
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vgpu->vdev.cache_lock);
	}

	return NOTIFY_OK;
}
 520
 521static int intel_vgpu_group_notifier(struct notifier_block *nb,
 522				     unsigned long action, void *data)
 523{
 524	struct intel_vgpu *vgpu = container_of(nb,
 525					struct intel_vgpu,
 526					vdev.group_notifier);
 527
 528	/* the only action we care about */
 529	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
 530		vgpu->vdev.kvm = data;
 531
 532		if (!data)
 533			schedule_work(&vgpu->vdev.release_work);
 534	}
 535
 536	return NOTIFY_OK;
 537}
 538
/*
 * mdev 'open' callback: attach the vGPU to the opening process.
 *
 * Registers the IOMMU unmap notifier (to evict stale DMA cache entries)
 * and the group notifier (to learn the associated KVM), then initializes
 * guest state and activates the vGPU.  Failures unwind the registrations
 * in reverse order.
 */
static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long events;
	int ret;

	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vgpu->vdev.iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vgpu->vdev.group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	/* Cleared so a subsequent release actually runs (matched by the
	 * atomic_cmpxchg in __intel_vgpu_release). */
	atomic_set(&vgpu->vdev.released, 0);
	return ret;

undo_group:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
out:
	return ret;
}
 585
/*
 * Common release path, shared by the mdev release callback and the
 * deferred work item.  Safe to call multiple times: the 'released' flag
 * guarantees teardown runs at most once per open.
 */
static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_guest_info *info;
	int ret;

	/* No guest attached — nothing to release. */
	if (!handle_valid(vgpu->handle))
		return;

	/* Only the first caller wins; later ones see released == 1. */
	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
		return;

	intel_gvt_ops->vgpu_deactivate(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);
	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

	/* While attached, 'handle' stores the kvmgt_guest_info pointer
	 * (see handle_valid()). */
	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	vgpu->vdev.kvm = NULL;
	vgpu->handle = 0;
}
 613
 614static void intel_vgpu_release(struct mdev_device *mdev)
 615{
 616	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
 617
 618	__intel_vgpu_release(vgpu);
 619}
 620
 621static void intel_vgpu_release_work(struct work_struct *work)
 622{
 623	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
 624					vdev.release_work);
 625
 626	__intel_vgpu_release(vgpu);
 627}
 628
 629static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
 630{
 631	u32 start_lo, start_hi;
 632	u32 mem_type;
 633
 634	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
 635			PCI_BASE_ADDRESS_MEM_MASK;
 636	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
 637			PCI_BASE_ADDRESS_MEM_TYPE_MASK;
 638
 639	switch (mem_type) {
 640	case PCI_BASE_ADDRESS_MEM_TYPE_64:
 641		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
 642						+ bar + 4));
 643		break;
 644	case PCI_BASE_ADDRESS_MEM_TYPE_32:
 645	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
 646		/* 1M mem BAR treated as 32-bit BAR */
 647	default:
 648		/* mem unknown type treated as 32-bit BAR */
 649		start_hi = 0;
 650		break;
 651	}
 652
 653	return ((u64)start_hi << 32) | start_lo;
 654}
 655
 656static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
 657			     void *buf, unsigned int count, bool is_write)
 658{
 659	uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
 660	int ret;
 661
 662	if (is_write)
 663		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
 664					bar_start + off, buf, count);
 665	else
 666		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
 667					bar_start + off, buf, count);
 668	return ret;
 669}
 670
 671static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
 672{
 673	return off >= vgpu_aperture_offset(vgpu) &&
 674	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
 675}
 676
 677static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
 678		void *buf, unsigned long count, bool is_write)
 679{
 680	void *aperture_va;
 681
 682	if (!intel_vgpu_in_aperture(vgpu, off) ||
 683	    !intel_vgpu_in_aperture(vgpu, off + count)) {
 684		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
 685		return -EINVAL;
 686	}
 687
 688	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
 689					ALIGN_DOWN(off, PAGE_SIZE),
 690					count + offset_in_page(off));
 691	if (!aperture_va)
 692		return -EIO;
 693
 694	if (is_write)
 695		memcpy(aperture_va + offset_in_page(off), buf, count);
 696	else
 697		memcpy(buf, aperture_va + offset_in_page(off), count);
 698
 699	io_mapping_unmap(aperture_va);
 700
 701	return 0;
 702}
 703
/*
 * Central dispatcher for reads/writes on any vGPU VFIO region.
 *
 * *ppos encodes the region index in its high bits (VFIO_PCI_OFFSET_SHIFT)
 * and the in-region offset in the low bits.  Config space and BAR0 are
 * emulated, BAR2 goes to the aperture, device-specific regions forward to
 * their registered rw op.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;


	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		/* Unimplemented regions: fall out with ret == -EINVAL. */
		break;
	default:
		/* Device-specific region.  This bound check is redundant
		 * with the one above; kept as written. */
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}
 752
 753static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
 754{
 755	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
 756	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
 757	struct intel_gvt *gvt = vgpu->gvt;
 758	int offset;
 759
 760	/* Only allow MMIO GGTT entry access */
 761	if (index != PCI_BASE_ADDRESS_0)
 762		return false;
 763
 764	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
 765		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
 766
 767	return (offset >= gvt->device_info.gtt_start_offset &&
 768		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
 769			true : false;
 770}
 771
/*
 * VFIO read entry point: copy @count bytes of device state to userspace.
 *
 * The transfer is split into naturally-aligned 8/4/2/1-byte chunks;
 * 8-byte accesses are only allowed for GGTT entries (see gtt_entry()).
 *
 * Returns the number of bytes read, or -EFAULT on any failure.
 */
static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes read */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		/* Advance by whatever chunk size was just transferred. */
		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}
 844
/*
 * VFIO write entry point: copy @count bytes from userspace into emulated
 * device state.
 *
 * Mirrors intel_vgpu_read(): naturally-aligned 8/4/2/1-byte chunks, with
 * 8-byte accesses reserved for GGTT entries (see gtt_entry()).
 *
 * Returns the number of bytes written, or -EFAULT on any failure.
 */
static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes write */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		/* Advance by whatever chunk size was just transferred. */
		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}
 917
 918static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
 919{
 920	unsigned int index;
 921	u64 virtaddr;
 922	unsigned long req_size, pgoff = 0;
 923	pgprot_t pg_prot;
 924	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
 925
 926	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
 927	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 928		return -EINVAL;
 929
 930	if (vma->vm_end < vma->vm_start)
 931		return -EINVAL;
 932	if ((vma->vm_flags & VM_SHARED) == 0)
 933		return -EINVAL;
 934	if (index != VFIO_PCI_BAR2_REGION_INDEX)
 935		return -EINVAL;
 936
 937	pg_prot = vma->vm_page_prot;
 938	virtaddr = vma->vm_start;
 939	req_size = vma->vm_end - vma->vm_start;
 940	pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;
 941
 942	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
 943}
 944
 945static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
 946{
 947	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
 948		return 1;
 949
 950	return 0;
 951}
 952
/* VFIO_DEVICE_SET_IRQS handler for INTx mask: accepted but a no-op. */
static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags,
			void *data)
{
	return 0;
}

/* INTx unmask: accepted but a no-op. */
static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags, void *data)
{
	return 0;
}

/* INTx trigger: accepted but a no-op. */
static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	return 0;
}
 974
 975static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
 976		unsigned int index, unsigned int start, unsigned int count,
 977		uint32_t flags, void *data)
 978{
 979	struct eventfd_ctx *trigger;
 980
 981	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
 982		int fd = *(int *)data;
 983
 984		trigger = eventfd_ctx_fdget(fd);
 985		if (IS_ERR(trigger)) {
 986			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
 987			return PTR_ERR(trigger);
 988		}
 989		vgpu->vdev.msi_trigger = trigger;
 990	}
 991
 992	return 0;
 993}
 994
/*
 * VFIO_DEVICE_SET_IRQS dispatcher: select the handler matching the
 * interrupt index (INTx or MSI) and the requested action, then call it.
 *
 * Returns the handler's result, or -ENOTTY when no handler exists for
 * the index/action combination.
 */
static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, uint32_t flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}
1035
1036static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1037			     unsigned long arg)
1038{
1039	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1040	unsigned long minsz;
1041
1042	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1043
1044	if (cmd == VFIO_DEVICE_GET_INFO) {
1045		struct vfio_device_info info;
1046
1047		minsz = offsetofend(struct vfio_device_info, num_irqs);
1048
1049		if (copy_from_user(&info, (void __user *)arg, minsz))
1050			return -EFAULT;
1051
1052		if (info.argsz < minsz)
1053			return -EINVAL;
1054
1055		info.flags = VFIO_DEVICE_FLAGS_PCI;
1056		info.flags |= VFIO_DEVICE_FLAGS_RESET;
1057		info.num_regions = VFIO_PCI_NUM_REGIONS +
1058				vgpu->vdev.num_regions;
1059		info.num_irqs = VFIO_PCI_NUM_IRQS;
1060
1061		return copy_to_user((void __user *)arg, &info, minsz) ?
1062			-EFAULT : 0;
1063
1064	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1065		struct vfio_region_info info;
1066		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1067		int i, ret;
1068		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1069		size_t size;
1070		int nr_areas = 1;
1071		int cap_type_id;
1072
1073		minsz = offsetofend(struct vfio_region_info, offset);
1074
1075		if (copy_from_user(&info, (void __user *)arg, minsz))
1076			return -EFAULT;
1077
1078		if (info.argsz < minsz)
1079			return -EINVAL;
1080
1081		switch (info.index) {
1082		case VFIO_PCI_CONFIG_REGION_INDEX:
1083			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1084			info.size = vgpu->gvt->device_info.cfg_space_size;
1085			info.flags = VFIO_REGION_INFO_FLAG_READ |
1086				     VFIO_REGION_INFO_FLAG_WRITE;
1087			break;
1088		case VFIO_PCI_BAR0_REGION_INDEX:
1089			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1090			info.size = vgpu->cfg_space.bar[info.index].size;
1091			if (!info.size) {
1092				info.flags = 0;
1093				break;
1094			}
1095
1096			info.flags = VFIO_REGION_INFO_FLAG_READ |
1097				     VFIO_REGION_INFO_FLAG_WRITE;
1098			break;
1099		case VFIO_PCI_BAR1_REGION_INDEX:
1100			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1101			info.size = 0;
1102			info.flags = 0;
1103			break;
1104		case VFIO_PCI_BAR2_REGION_INDEX:
1105			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1106			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1107					VFIO_REGION_INFO_FLAG_MMAP |
1108					VFIO_REGION_INFO_FLAG_READ |
1109					VFIO_REGION_INFO_FLAG_WRITE;
1110			info.size = gvt_aperture_sz(vgpu->gvt);
1111
1112			size = sizeof(*sparse) +
1113					(nr_areas * sizeof(*sparse->areas));
1114			sparse = kzalloc(size, GFP_KERNEL);
1115			if (!sparse)
1116				return -ENOMEM;
1117
1118			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1119			sparse->header.version = 1;
1120			sparse->nr_areas = nr_areas;
1121			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1122			sparse->areas[0].offset =
1123					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1124			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1125			break;
1126
1127		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1128			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1129			info.size = 0;
1130			info.flags = 0;
1131
1132			gvt_dbg_core("get region info bar:%d\n", info.index);
1133			break;
1134
1135		case VFIO_PCI_ROM_REGION_INDEX:
1136		case VFIO_PCI_VGA_REGION_INDEX:
1137			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1138			info.size = 0;
1139			info.flags = 0;
1140
1141			gvt_dbg_core("get region info index:%d\n", info.index);
1142			break;
1143		default:
1144			{
1145				struct vfio_region_info_cap_type cap_type = {
1146					.header.id = VFIO_REGION_INFO_CAP_TYPE,
1147					.header.version = 1 };
1148
1149				if (info.index >= VFIO_PCI_NUM_REGIONS +
1150						vgpu->vdev.num_regions)
1151					return -EINVAL;
1152
1153				i = info.index - VFIO_PCI_NUM_REGIONS;
1154
1155				info.offset =
1156					VFIO_PCI_INDEX_TO_OFFSET(info.index);
1157				info.size = vgpu->vdev.region[i].size;
1158				info.flags = vgpu->vdev.region[i].flags;
1159
1160				cap_type.type = vgpu->vdev.region[i].type;
1161				cap_type.subtype = vgpu->vdev.region[i].subtype;
1162
1163				ret = vfio_info_add_capability(&caps,
1164							&cap_type.header,
1165							sizeof(cap_type));
1166				if (ret)
1167					return ret;
1168			}
1169		}
1170
1171		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1172			switch (cap_type_id) {
1173			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1174				ret = vfio_info_add_capability(&caps,
1175					&sparse->header, sizeof(*sparse) +
1176					(sparse->nr_areas *
1177						sizeof(*sparse->areas)));
1178				kfree(sparse);
1179				if (ret)
1180					return ret;
1181				break;
1182			default:
1183				return -EINVAL;
1184			}
1185		}
1186
1187		if (caps.size) {
1188			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1189			if (info.argsz < sizeof(info) + caps.size) {
1190				info.argsz = sizeof(info) + caps.size;
1191				info.cap_offset = 0;
1192			} else {
1193				vfio_info_cap_shift(&caps, sizeof(info));
1194				if (copy_to_user((void __user *)arg +
1195						  sizeof(info), caps.buf,
1196						  caps.size)) {
1197					kfree(caps.buf);
1198					return -EFAULT;
1199				}
1200				info.cap_offset = sizeof(info);
1201			}
1202
1203			kfree(caps.buf);
1204		}
1205
1206		return copy_to_user((void __user *)arg, &info, minsz) ?
1207			-EFAULT : 0;
1208	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1209		struct vfio_irq_info info;
1210
1211		minsz = offsetofend(struct vfio_irq_info, count);
1212
1213		if (copy_from_user(&info, (void __user *)arg, minsz))
1214			return -EFAULT;
1215
1216		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1217			return -EINVAL;
1218
1219		switch (info.index) {
1220		case VFIO_PCI_INTX_IRQ_INDEX:
1221		case VFIO_PCI_MSI_IRQ_INDEX:
1222			break;
1223		default:
1224			return -EINVAL;
1225		}
1226
1227		info.flags = VFIO_IRQ_INFO_EVENTFD;
1228
1229		info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1230
1231		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1232			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1233				       VFIO_IRQ_INFO_AUTOMASKED);
1234		else
1235			info.flags |= VFIO_IRQ_INFO_NORESIZE;
1236
1237		return copy_to_user((void __user *)arg, &info, minsz) ?
1238			-EFAULT : 0;
1239	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
1240		struct vfio_irq_set hdr;
1241		u8 *data = NULL;
1242		int ret = 0;
1243		size_t data_size = 0;
1244
1245		minsz = offsetofend(struct vfio_irq_set, count);
1246
1247		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1248			return -EFAULT;
1249
1250		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1251			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1252
1253			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1254						VFIO_PCI_NUM_IRQS, &data_size);
1255			if (ret) {
1256				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1257				return -EINVAL;
1258			}
1259			if (data_size) {
1260				data = memdup_user((void __user *)(arg + minsz),
1261						   data_size);
1262				if (IS_ERR(data))
1263					return PTR_ERR(data);
1264			}
1265		}
1266
1267		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1268					hdr.start, hdr.count, data);
1269		kfree(data);
1270
1271		return ret;
1272	} else if (cmd == VFIO_DEVICE_RESET) {
1273		intel_gvt_ops->vgpu_reset(vgpu);
1274		return 0;
1275	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1276		struct vfio_device_gfx_plane_info dmabuf;
1277		int ret = 0;
1278
1279		minsz = offsetofend(struct vfio_device_gfx_plane_info,
1280				    dmabuf_id);
1281		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1282			return -EFAULT;
1283		if (dmabuf.argsz < minsz)
1284			return -EINVAL;
1285
1286		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1287		if (ret != 0)
1288			return ret;
1289
1290		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1291								-EFAULT : 0;
1292	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1293		__u32 dmabuf_id;
1294		__s32 dmabuf_fd;
1295
1296		if (get_user(dmabuf_id, (__u32 __user *)arg))
1297			return -EFAULT;
1298
1299		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1300		return dmabuf_fd;
1301
1302	}
1303
1304	return -ENOTTY;
1305}
1306
1307static ssize_t
1308vgpu_id_show(struct device *dev, struct device_attribute *attr,
1309	     char *buf)
1310{
1311	struct mdev_device *mdev = mdev_from_dev(dev);
1312
1313	if (mdev) {
1314		struct intel_vgpu *vgpu = (struct intel_vgpu *)
1315			mdev_get_drvdata(mdev);
1316		return sprintf(buf, "%d\n", vgpu->id);
1317	}
1318	return sprintf(buf, "\n");
1319}
1320
1321static ssize_t
1322hw_id_show(struct device *dev, struct device_attribute *attr,
1323	   char *buf)
1324{
1325	struct mdev_device *mdev = mdev_from_dev(dev);
1326
1327	if (mdev) {
1328		struct intel_vgpu *vgpu = (struct intel_vgpu *)
1329			mdev_get_drvdata(mdev);
1330		return sprintf(buf, "%u\n",
1331			       vgpu->submission.shadow_ctx->hw_id);
1332	}
1333	return sprintf(buf, "\n");
1334}
1335
/* Read-only sysfs attributes backed by vgpu_id_show()/hw_id_show() above. */
static DEVICE_ATTR_RO(vgpu_id);
static DEVICE_ATTR_RO(hw_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	&dev_attr_hw_id.attr,
	NULL
};

/* Attributes appear under an "intel_vgpu" subdirectory of the mdev device. */
static const struct attribute_group intel_vgpu_group = {
	.name = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

/* NULL-terminated group list handed to the mdev core via intel_vgpu_ops. */
static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};
1354
/*
 * mdev parent callbacks for intel vGPU devices.
 *
 * Deliberately not const: kvmgt_host_init() fills in
 * .supported_type_groups at runtime before registration.
 */
static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups       = intel_vgpu_groups,
	.create			= intel_vgpu_create,
	.remove			= intel_vgpu_remove,

	.open			= intel_vgpu_open,
	.release		= intel_vgpu_release,

	.read			= intel_vgpu_read,
	.write			= intel_vgpu_write,
	.mmap			= intel_vgpu_mmap,
	.ioctl			= intel_vgpu_ioctl,
};
1368
/*
 * MPT host_init hook: wire up the GVT ops table and register this
 * device as an mdev parent.
 *
 * @dev: physical GPU device to register with the mdev core
 * @gvt: opaque GVT instance (unused here)
 * @ops: intel_gvt_ops table, cached in the file-scope intel_gvt_ops
 *
 * Returns 0 on success, -EFAULT if the supported vGPU type groups
 * cannot be obtained, or the mdev_register_device() error code.
 */
static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	struct attribute **kvm_type_attrs;
	struct attribute_group **kvm_vgpu_type_groups;

	intel_gvt_ops = ops;
	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
			&kvm_vgpu_type_groups))
		return -EFAULT;
	/* Runtime patch-up of the otherwise static ops table. */
	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

	return mdev_register_device(dev, &intel_vgpu_ops);
}
1382
/* MPT host_exit hook: undo kvmgt_host_init()'s mdev registration. */
static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
}
1387
/*
 * Write-protect a guest page so its modifications are forwarded to GVT.
 *
 * @handle: guest handle (actually a struct kvmgt_guest_info pointer)
 * @gfn: guest frame number to start tracking
 *
 * Returns 0 on success or if the gfn is already tracked, -ESRCH for an
 * invalid handle, -EINVAL when the gfn has no backing memslot.
 */
static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	/* SRCU keeps the memslot valid while we operate on it. */
	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	/* mmu_lock serializes against KVM's page-track machinery. */
	spin_lock(&kvm->mmu_lock);

	/* Already write-protected: treat as success. */
	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}
1421
/*
 * Stop write-protection tracking for a guest page.
 *
 * @handle: guest handle (actually a struct kvmgt_guest_info pointer)
 * @gfn: guest frame number to stop tracking
 *
 * Returns 0 on success, or if the handle is invalid (nothing to undo),
 * or if the gfn was not tracked; -EINVAL when the gfn has no memslot.
 */
static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return 0;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	/* SRCU keeps the memslot valid while we operate on it. */
	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	/* Not tracked: nothing to remove. */
	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}
1455
/*
 * KVM page-track notifier: a vCPU wrote to a tracked gpa.  Forward the
 * write to GVT's protection handler so the shadow structures stay in
 * sync; untracked pages are ignored.
 */
static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
						     (void *)val, len);
}
1467
/*
 * KVM page-track notifier: a memslot is going away.  Walk every gfn in
 * the slot and drop our write-protection on the ones we were tracking,
 * keeping the protect table consistent with KVM's state.
 */
static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	spin_unlock(&kvm->mmu_lock);
}
1488
1489static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1490{
1491	struct intel_vgpu *itr;
1492	struct kvmgt_guest_info *info;
1493	int id;
1494	bool ret = false;
1495
1496	mutex_lock(&vgpu->gvt->lock);
1497	for_each_active_vgpu(vgpu->gvt, itr, id) {
1498		if (!handle_valid(itr->handle))
1499			continue;
1500
1501		info = (struct kvmgt_guest_info *)itr->handle;
1502		if (kvm && kvm == info->kvm) {
1503			ret = true;
1504			goto out;
1505		}
1506	}
1507out:
1508	mutex_unlock(&vgpu->gvt->lock);
1509	return ret;
1510}
1511
/*
 * Bind an opened mdev vGPU to its guest's KVM instance.
 *
 * Allocates the per-guest kvmgt_guest_info, pins a reference on the
 * kvm, initializes the write-protect table and DMA cache, registers
 * the page-track notifier, and exposes a debugfs cache counter.
 *
 * Returns 0 on success; -EEXIST if this vGPU already has a handle or
 * the kvm is already bound to another vGPU; -ESRCH when there is no
 * kvm or the caller's mm doesn't match it; -ENOMEM on allocation
 * failure.
 */
static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	/* kvm is stashed by the group-notifier path before open completes. */
	kvm = vgpu->vdev.kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	/* One kvm may back at most one vGPU. */
	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	/* Hold the kvm alive until kvmgt_guest_exit(). */
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	mutex_init(&vgpu->dmabuf_lock);
	init_completion(&vgpu->vblank_done);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	/* Best effort: a missing debugfs entry is not fatal. */
	info->debugfs_cache_entries = debugfs_create_ulong(
						"kvmgt_nr_cache_entries",
						0444, vgpu->debugfs,
						&vgpu->vdev.nr_cache_entries);
	if (!info->debugfs_cache_entries)
		gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");

	return 0;
}
1559
/*
 * Tear down a guest binding created by kvmgt_guest_init(): remove the
 * debugfs entry, unhook the page-track notifier, drop the kvm
 * reference, and free the tracking structures.  Always returns true.
 */
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}
1572
/* MPT attach hook: KVMGT binds lazily at open time, so this is a no-op. */
static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}
1578
/* MPT detach hook: teardown happens in kvmgt_guest_exit(), so no-op. */
static void kvmgt_detach_vgpu(unsigned long handle)
{
	/* nothing to do here */
}
1583
1584static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1585{
1586	struct kvmgt_guest_info *info;
1587	struct intel_vgpu *vgpu;
1588
1589	if (!handle_valid(handle))
1590		return -ESRCH;
1591
1592	info = (struct kvmgt_guest_info *)handle;
1593	vgpu = info->vgpu;
1594
1595	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
1596		return 0;
1597
1598	return -EFAULT;
1599}
1600
/*
 * Translate a guest frame number to a host pfn via KVM.
 *
 * Returns the pfn, or INTEL_GVT_INVALID_ADDR when the handle is
 * invalid or the gfn has no usable mapping.
 */
static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}
1617
/*
 * Map a guest page for device DMA, with per-vGPU caching.
 *
 * On a cache hit the existing entry's refcount is bumped and its
 * dma_addr reused; on a miss the page is IOMMU-mapped and a new cache
 * entry (refcount 1) is added.  Callers release with
 * kvmgt_dma_unmap_guest_page().
 *
 * @handle: guest handle (a struct kvmgt_guest_info pointer)
 * @gfn: guest frame number to map
 * @dma_addr: out parameter receiving the bus address
 *
 * Returns 0 on success, -EINVAL for an invalid handle, or the error
 * from mapping/caching (the mapping is undone if caching fails).
 */
int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		dma_addr_t *dma_addr)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	/* cache_lock protects both the lookup and the add below. */
	mutex_lock(&info->vgpu->vdev.cache_lock);

	entry = __gvt_cache_find_gfn(info->vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr);
		if (ret)
			goto err_unmap;
	} else {
		/* Cache hit: share the existing mapping. */
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr);
err_unlock:
	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return ret;
}
1657
/*
 * kref release callback: last user of a cached DMA mapping is gone, so
 * unmap the page and drop the cache entry.  Runs under cache_lock
 * (taken by the kref_put() caller).
 */
static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}
1665
/*
 * Release one reference on a cached DMA mapping obtained from
 * kvmgt_dma_map_guest_page(); the final put unmaps via
 * __gvt_dma_release().  Unknown addresses are silently ignored.
 */
void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&info->vgpu->vdev.cache_lock);
}
1682
/*
 * Read or write guest physical memory through KVM.
 *
 * @handle: guest handle (a struct kvmgt_guest_info pointer)
 * @gpa: guest physical address
 * @buf: kernel buffer to copy to/from
 * @len: number of bytes
 * @write: true to write guest memory, false to read it
 *
 * Returns 0 on success, -ESRCH for an invalid handle, or the
 * kvm_read_guest()/kvm_write_guest() error.
 */
static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx, ret;
	/* A kernel thread has no mm; borrow the guest's for the access. */
	bool kthread = current->mm == NULL;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	if (kthread)
		use_mm(kvm->mm);

	/* SRCU keeps the memslots stable during the guest access. */
	idx = srcu_read_lock(&kvm->srcu);
	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
		      kvm_read_guest(kvm, gpa, buf, len);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kthread)
		unuse_mm(kvm->mm);

	return ret;
}
1710
/* MPT read_gpa hook: thin wrapper over kvmgt_rw_gpa() in read mode. */
static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}
1716
/* MPT write_gpa hook: thin wrapper over kvmgt_rw_gpa() in write mode. */
static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}
1722
/* Convert a kernel virtual address to its page frame number. */
static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}
1727
1728static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
1729{
1730	struct kvmgt_guest_info *info;
1731	struct kvm *kvm;
1732
1733	if (!handle_valid(handle))
1734		return false;
1735
1736	info = (struct kvmgt_guest_info *)handle;
1737	kvm = info->kvm;
1738
1739	return kvm_is_visible_gfn(kvm, gfn);
1740
1741}
1742
/*
 * Mediated-pass-through operations implemented by KVMGT, consumed by
 * the GVT core through the MPT abstraction layer.
 */
struct intel_gvt_mpt kvmgt_mpt = {
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};
EXPORT_SYMBOL_GPL(kvmgt_mpt);
1763
/* Module init: nothing to set up; registration happens via kvmgt_mpt. */
static int __init kvmgt_init(void)
{
	return 0;
}
1768
/* Module exit: nothing to tear down. */
static void __exit kvmgt_exit(void)
{
}
1772
/* Standard module plumbing. */
module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");